xref: /aosp_15_r20/external/XNNPACK/bench/qs8-gemm.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cfloat>
8 #include <chrono>
9 #include <cmath>
10 #include <functional>
11 #include <limits>
12 #include <mutex>
13 #include <random>
14 #include <vector>
15 
16 #include <cpuinfo.h>
17 
18 #include <benchmark/benchmark.h>
19 #ifdef BENCHMARK_RUY
20 #include "ruy/ruy.h"
21 #endif  // BENCHMARK_RUY
22 #include "bench/gemm.h"
23 #include "bench/utils.h"
24 
25 #include <xnnpack.h>
26 #include <xnnpack/aligned-allocator.h>
27 #include <xnnpack/common.h>
28 #include <xnnpack/gemm.h>
29 #include <xnnpack/math.h>
30 #include <xnnpack/microfnptr.h>
31 #include <xnnpack/microparams-init.h>
32 #include <xnnpack/pack.h>
33 
34 
GEMMBenchmark(benchmark::State & state,xnn_qs8_gemm_minmax_ukernel_function gemm,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr,bool extended_weights=false)35 static void GEMMBenchmark(benchmark::State& state,
36   xnn_qs8_gemm_minmax_ukernel_function gemm,
37   size_t mr, size_t nr, size_t kr, size_t sr,
38   xnn_init_qs8_conv_minmax_params_fn init_params,
39   benchmark::utils::IsaCheckFunction isa_check = nullptr,
40   bool extended_weights = false)
41 {
42   if (!cpuinfo_initialize()) {
43     state.SkipWithError("cpuinfo initialization failed");
44     return;
45   }
46   if (isa_check && !isa_check(state)) {
47     return;
48   }
49 
50   const size_t mc = state.range(0);
51   const size_t nc = state.range(1);
52   const size_t kc = state.range(2);
53 
54   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
55   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
56 
57   std::random_device random_device;
58   auto rng = std::mt19937(random_device());
59   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
60   auto i8rng = std::bind(
61     std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
62 
63   std::vector<int8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(int8_t));
64   std::generate(a.begin(), a.end(), std::ref(i8rng));
65   std::vector<int8_t> k(nc * kc);
66   std::generate(k.begin(), k.end(), std::ref(i8rng));
67   std::vector<int32_t> b(nc);
68   std::generate(b.begin(), b.end(), std::ref(i32rng));
69 
70   const size_t w_element_size = extended_weights ? sizeof(int16_t) : sizeof(int8_t);
71   const size_t w_size = nc_stride * sizeof(int32_t) + kc_stride * nc_stride * w_element_size;
72   const size_t c_elements = mc * nc;
73   const size_t num_buffers = 1 +
74     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), w_size + c_elements * sizeof(int8_t));
75 
76   std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
77   std::fill(w.begin(), w.end(), 0);
78   const xnn_qs8_packing_params packing_params = { 127 };
79   if (extended_weights) {
80     xnn_pack_qs8_gemm_xw_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
81   } else {
82     xnn_pack_qs8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
83   }
84   std::vector<int8_t> c(c_elements * num_buffers);
85   std::fill(c.begin(), c.end(), 0xA5);
86 
87   union xnn_qs8_conv_minmax_params quantization_params;
88   init_params(&quantization_params, 0.75f, 127, -127, 126);
89 
90   size_t buffer_index = 0;
91   for (auto _ : state) {
92     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
93     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
94     // - W is not in cache (for any cache level)
95     // - C is not in cache (for any cache level)
96     state.PauseTiming();
97     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
98     buffer_index = (buffer_index + 1) % num_buffers;
99     state.ResumeTiming();
100 
101     for (uint32_t m = 0; m < mc; m += mr) {
102       const uint32_t mb = min(mc - m, mr);
103       for (uint32_t n = 0; n < nc; n += nr) {
104         const uint32_t nb = min(nc - n, nr);
105         gemm(
106           mb, nb, kc * sizeof(int8_t),
107           a.data() + m * kc, kc * sizeof(int8_t),
108           w.data() + w_size * buffer_index + n * (kc_stride * w_element_size + sizeof(int32_t)),
109           c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(int8_t), nr * sizeof(int8_t),
110           &quantization_params);
111       }
112     }
113   }
114 
115   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
116   if (cpu_frequency != 0) {
117     state.counters["cpufreq"] = cpu_frequency;
118   }
119 
120   state.counters["OPS"] = benchmark::Counter(
121     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
122 }
123 
124 #ifdef BENCHMARK_RUY
RuyBenchmark(benchmark::State & state,size_t threads)125 static void RuyBenchmark(benchmark::State& state, size_t threads)
126 {
127   const size_t mc = state.range(0);
128   const size_t nc = state.range(1);
129   const size_t kc = state.range(2);
130 
131   std::random_device random_device;
132   auto rng = std::mt19937(random_device());
133   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
134   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
135 
136   const size_t num_buffers = 1 +
137     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
138       nc * (sizeof(int8_t) * (mc + kc) + sizeof(int32_t)));
139 
140   std::vector<int8_t> a(mc * kc);
141   std::generate(a.begin(), a.end(), std::ref(u8rng));
142   std::vector<int8_t> k(num_buffers * nc * kc);
143   std::generate(k.begin(), k.end(), std::ref(u8rng));
144   std::vector<int32_t> b(num_buffers * nc);
145   std::generate(b.begin(), b.end(), std::ref(i32rng));
146   std::vector<int8_t> c(num_buffers * nc * mc);
147   std::fill(c.begin(), c.end(), std::nanf(""));
148 
149   // Note: context must be static to avoid the cost of re-creating it for each benchmark.
150   static ruy::Context context;
151   context.set_max_num_threads(threads);
152 
153   ruy::Matrix<int8_t> ruy_a;
154   ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
155   ruy_a.set_zero_point(127);
156   ruy::Matrix<int8_t> ruy_b;
157   ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
158   ruy_b.set_data(a.data());
159   ruy_b.set_zero_point(127);
160   ruy::Matrix<int8_t> ruy_c;
161   ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
162   ruy_c.set_zero_point(127);
163 
164   ruy::MulParams<int32_t, int8_t> mul_params;
165   mul_params.set_multiplier_fixedpoint(0x40000000);
166 
167   // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
168   // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
169   // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
170   // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
171   static std::once_flag warmup;
172   std::call_once(warmup, [&](){
173     auto start = std::chrono::steady_clock::now();
174     do {
175       ruy_a.set_data(k.data());
176       ruy_c.set_data(c.data());
177       mul_params.set_bias(b.data());
178 
179       ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
180     } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
181   });
182 
183   size_t buffer_index = 0;
184   for (auto _ : state) {
185     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
186     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
187     // - K is not in cache (for any cache level)
188     // - B is not in cache (for any cache level)
189     // - C is not in cache (for any cache level)
190     state.PauseTiming();
191     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
192     buffer_index = (buffer_index + 1) % num_buffers;
193     state.ResumeTiming();
194 
195     ruy_a.set_data(k.data() + buffer_index * nc * kc);
196     ruy_c.set_data(c.data() + buffer_index * mc * nc);
197     mul_params.set_bias(b.data() + buffer_index * nc);
198 
199     ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
200   }
201 
202   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
203   if (cpu_frequency != 0) {
204     state.counters["cpufreq"] = cpu_frequency;
205   }
206 
207   state.counters["OPS"] = benchmark::Counter(
208     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
209 }
210 
ruy_st(benchmark::State & state,const char * net)211 static void ruy_st(benchmark::State& state, const char* net)
212 {
213   RuyBenchmark(state, 1);
214 }
215 #endif  // BENCHMARK_RUY
216 
217 #if XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
GEMMBenchmark(benchmark::State & state,xnn_jit_gemm_code_generator_function generator,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)218   static void GEMMBenchmark(benchmark::State& state,
219     xnn_jit_gemm_code_generator_function generator,
220     size_t mr, size_t nr, size_t kr, size_t sr,
221     xnn_init_qs8_conv_minmax_params_fn  init_params,
222     benchmark::utils::IsaCheckFunction isa_check = nullptr)
223   {
224     xnn_initialize(/*allocator=*/nullptr);
225     xnn_code_buffer code_buffer;
226     xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
227     const size_t nc = state.range(1);
228     const size_t kc = state.range(2);
229     generator(&code_buffer, mr, nc % nr, kc, nullptr);
230     xnn_finalize_code_memory(&code_buffer);
231     GEMMBenchmark(
232         state,
233         reinterpret_cast<xnn_qs8_gemm_minmax_ukernel_function>(code_buffer.start),
234         mr, nr, kr, sr, init_params, isa_check);
235     xnn_release_code_memory(&code_buffer);
236   }
237 
jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,const char * net)238   static void jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
239     GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
240       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
241   }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)242   static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
243     GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
244       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
245   }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)246   static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
247     GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
248       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
249   }
250   BENCHMARK_GEMM(jit_qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)251   BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
252   BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
253 #endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
254 
255 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
256   static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
257     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
258       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
259   }
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,const char * net)260   static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, const char* net) {
261     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55, 4, 8, 4, 1,
262       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
263   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)264   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
265     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, 4, 8, 1, 1,
266       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
267   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)268   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
269     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, 4, 8, 1, 1,
270       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
271   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)272   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
273     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, 4, 8, 1, 1,
274       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
275   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)276   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
277     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 4, 8, 1, 1,
278       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
279   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)280   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
281     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
282       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
283   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)284   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
285     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
286       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
287   }
qs8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)288   static void qs8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
289     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, 1, 8, 1, 1,
290       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
291   }
qs8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)292   static void qs8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
293     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 1, 8, 1, 1,
294       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
295   }
296 
297   BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)298   BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
299   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
300   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
301   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
302   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
303   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
304   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
305   BENCHMARK_GEMM(qs8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7)
306   BENCHMARK_GEMM(qs8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
307 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
308 
309 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
310   static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
311     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1,
312       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
313   }
qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)314   static void qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
315     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32, 1, 16, 4, 1,
316       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
317   }
qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)318   static void qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
319     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64, 1, 16, 4, 1,
320       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
321   }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)322   static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
323     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32, 4, 16, 4, 1,
324       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
325   }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)326   static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
327     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64, 4, 16, 4, 1,
328       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
329   }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,const char * net)330   static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
331     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, 4, 16, 4, 1,
332       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
333   }
qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)334   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
335     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, 4, 8, 1, 1,
336       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
337   }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)338   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
339     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
340       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
341   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)342   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
343     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, 4, 16, 1, 1,
344       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
345   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)346   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
347     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, 4, 16, 1, 1,
348       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
349   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)350   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
351     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, 4, 16, 1, 1,
352       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
353   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)354   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
355     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, 4, 16, 1, 1,
356       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
357   }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)358   static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
359     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, 1, 8, 8, 1,
360       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
361   }
qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)362   static void qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
363     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, 1, 8, 8, 1,
364       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
365   }
qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)366   static void qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
367     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, 1, 8, 8, 1,
368       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
369   }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)370   static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
371     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, 1, 8, 8, 1,
372       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
373   }
qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State & state,const char * net)374   static void qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State& state, const char* net) {
375     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, 2, 8, 8, 1,
376       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
377   }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)378   static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
379     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, 2, 8, 8, 1,
380       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
381   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)382   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
383     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, 2, 8, 8, 1,
384       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
385   }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)386   static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
387     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, 2, 8, 8, 1,
388       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
389   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)390   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
391     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, 2, 8, 8, 1,
392       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
393   }
qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State & state,const char * net)394   static void qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State& state, const char* net) {
395     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, 2, 8, 16, 1,
396       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
397   }
398 
399   BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld32)
BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)400   BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)
401   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld32)
402   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld64)
403   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld128)
404   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
405   BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
406   BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
407   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
408   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
409   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
410   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
411   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm)
412   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal)
413   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53)
414   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53)
415   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mull)
416   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal)
417   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
418   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
419   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
420   BENCHMARK_GEMM(qs8_gemm_2x8c16__aarch64_neon_mlal)
421 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
422 
423 
424 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
425   static void qs8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
426     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, 1, 8, 4, 1,
427       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
428   }
qs8_gemm_4x8c4__neondot(benchmark::State & state,const char * net)429   static void qs8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
430     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, 4, 8, 4, 1,
431       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
432   }
qs8_gemm_6x8c4__neondot(benchmark::State & state,const char * net)433   static void qs8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
434     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, 6, 8, 4, 1,
435       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
436   }
qs8_gemm_8x8c4__neondot(benchmark::State & state,const char * net)437   static void qs8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
438     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, 8, 8, 4, 1,
439       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
440   }
qs8_gemm_1x16c4__neondot(benchmark::State & state,const char * net)441   static void qs8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
442     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, 1, 16, 4, 1,
443       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
444   }
qs8_gemm_4x16c4__neondot(benchmark::State & state,const char * net)445   static void qs8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
446     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, 4, 16, 4, 1,
447       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
448   }
qs8_gemm_6x16c4__neondot(benchmark::State & state,const char * net)449   static void qs8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
450     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, 6, 16, 4, 1,
451       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
452   }
qs8_gemm_8x16c4__neondot(benchmark::State & state,const char * net)453   static void qs8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
454     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
455       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
456   }
457 
458   BENCHMARK_GEMM(qs8_gemm_1x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)459   BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)
460   BENCHMARK_GEMM(qs8_gemm_6x8c4__neondot)
461   BENCHMARK_GEMM(qs8_gemm_8x8c4__neondot)
462   BENCHMARK_GEMM(qs8_gemm_1x16c4__neondot)
463   BENCHMARK_GEMM(qs8_gemm_4x16c4__neondot)
464   BENCHMARK_GEMM(qs8_gemm_6x16c4__neondot)
465   BENCHMARK_GEMM(qs8_gemm_8x16c4__neondot)
466 #endif  // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
467 
468 
469 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
470   static void qs8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
471     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, 1, 8, 1, 1,
472       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
473   }
qs8_gemm_2x8__neon_mlal_lane(benchmark::State & state,const char * net)474   static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
475     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, 2, 8, 1, 1,
476       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
477   }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,const char * net)478   static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
479     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 3, 8, 1, 1,
480       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
481   }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,const char * net)482   static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
483     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, 4, 8, 1, 1,
484       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
485   }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,const char * net)486   static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
487     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, 6, 8, 1, 1,
488       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
489   }
qs8_gemm_1x16__neon_mlal_lane(benchmark::State & state,const char * net)490   static void qs8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
491     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, 1, 16, 1, 1,
492       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
493   }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,const char * net)494   static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
495     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, 2, 16, 1, 1,
496       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
497   }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,const char * net)498   static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
499     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, 3, 16, 1, 1,
500       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
501   }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,const char * net)502   static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
503     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, 4, 16, 1, 1,
504       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
505   }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,const char * net)506   static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
507     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, 6, 16, 1, 1,
508       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
509   }
qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)510   static void qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
511     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm, 1, 8, 1, 1,
512       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
513   }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)514   static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
515     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm, 2, 8, 1, 1,
516       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
517   }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)518   static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
519     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, 3, 8, 1, 1,
520       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
521   }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)522   static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
523     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm, 4, 8, 1, 1,
524       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
525   }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)526   static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
527     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, 6, 8, 1, 1,
528       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
529   }
qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)530   static void qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
531     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, 1, 16, 1, 1,
532       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
533   }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)534   static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
535     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, 2, 16, 1, 1,
536       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
537   }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)538   static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
539     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm, 3, 16, 1, 1,
540       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
541   }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)542   static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
543     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm, 4, 16, 1, 1,
544       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
545   }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)546   static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
547     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm, 6, 16, 1, 1,
548       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
549   }
qs8_gemm_1x8c2__neon_mull_dup(benchmark::State & state,const char * net)550   static void qs8_gemm_1x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
551     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, 1, 8, 2, 1,
552       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
553   }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,const char * net)554   static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
555     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, 2, 8, 2, 1,
556       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
557   }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,const char * net)558   static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
559     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, 3, 8, 2, 1,
560       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
561   }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,const char * net)562   static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
563     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, 4, 8, 2, 1,
564       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
565   }
qs8_gemm_1x16c2__neon_mull_dup(benchmark::State & state,const char * net)566   static void qs8_gemm_1x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
567     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, 1, 16, 2, 1,
568       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
569   }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,const char * net)570   static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
571     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, 2, 16, 2, 1,
572       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
573   }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,const char * net)574   static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
575     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, 3, 16, 2, 1,
576       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
577   }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,const char * net)578   static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
579     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, 4, 16, 2, 1,
580       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
581   }
qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State & state,const char * net)582   static void qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
583     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, 1, 8, 2, 1,
584       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
585   }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,const char * net)586   static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
587     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, 2, 8, 2, 1,
588       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
589   }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,const char * net)590   static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
591     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, 3, 8, 2, 1,
592       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
593   }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,const char * net)594   static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
595     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, 4, 8, 2, 1,
596       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
597   }
qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State & state,const char * net)598   static void qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
599     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, 1, 16, 2, 1,
600       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
601   }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,const char * net)602   static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
603     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, 2, 16, 2, 1,
604       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
605   }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,const char * net)606   static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
607     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, 3, 16, 2, 1,
608       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
609   }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,const char * net)610   static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
611     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, 4, 16, 2, 1,
612       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
613   }
qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)614   static void qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
615     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, 1, 8, 2, 1,
616       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
617   }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)618   static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
619     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, 2, 8, 2, 1,
620       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
621   }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)622   static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
623     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, 3, 8, 2, 1,
624       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
625   }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)626   static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
627     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, 4, 8, 2, 1,
628       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
629   }
qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)630   static void qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
631     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r, 1, 16, 2, 1,
632       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
633   }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)634   static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
635     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r, 2, 16, 2, 1,
636       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
637   }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)638   static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
639     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r, 3, 16, 2, 1,
640       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
641   }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)642   static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
643     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, 4, 16, 2, 1,
644       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
645   }
qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)646   static void qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
647     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, 1, 8, 2, 1,
648       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
649   }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)650   static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
651     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, 2, 8, 2, 1,
652       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
653   }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)654   static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
655     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, 3, 8, 2, 1,
656       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
657   }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)658   static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
659     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, 4, 8, 2, 1,
660       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
661   }
qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)662   static void qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
663     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, 1, 16, 2, 1,
664       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
665   }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)666   static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
667     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r, 2, 16, 2, 1,
668       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
669   }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)670   static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
671     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r, 3, 16, 2, 1,
672       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
673   }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)674   static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
675     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r, 4, 16, 2, 1,
676       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
677   }
qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)678   static void qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
679     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, 1, 8, 2, 1,
680       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
681   }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)682   static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
683     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r, 2, 8, 2, 1,
684       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
685   }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)686   static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
687     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r, 3, 8, 2, 1,
688       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
689   }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)690   static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
691     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r, 4, 8, 2, 1,
692       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
693   }
qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)694   static void qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
695     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, 1, 16, 2, 1,
696       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
697   }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)698   static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
699     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, 2, 16, 2, 1,
700       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
701   }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)702   static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
703     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, 3, 16, 2, 1,
704       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
705   }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)706   static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
707     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, 4, 16, 2, 1,
708       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
709   }
qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)710   static void qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
711     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, 1, 8, 2, 1,
712       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
713   }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)714   static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
715     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r, 2, 8, 2, 1,
716       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
717   }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)718   static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
719     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r, 3, 8, 2, 1,
720       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
721   }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)722   static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
723     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r, 4, 8, 2, 1,
724       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
725   }
qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)726   static void qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
727     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, 1, 16, 2, 1,
728       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
729   }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)730   static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
731     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, 2, 16, 2, 1,
732       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
733   }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)734   static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
735     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, 3, 16, 2, 1,
736       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
737   }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)738   static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
739     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, 4, 16, 2, 1,
740       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
741   }
qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)742   static void qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
743     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, 1, 8, 2, 1,
744       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
745   }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)746   static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
747     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, 2, 8, 2, 1,
748       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
749   }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)750   static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
751     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r, 3, 8, 2, 1,
752       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
753   }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)754   static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
755     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r, 4, 8, 2, 1,
756       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
757   }
qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)758   static void qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
759     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r, 1, 16, 2, 1,
760       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
761   }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)762   static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
763     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r, 2, 16, 2, 1,
764       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
765   }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)766   static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
767     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r, 3, 16, 2, 1,
768       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
769   }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)770   static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
771     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r, 4, 16, 2, 1,
772       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
773   }
qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)774   static void qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
775     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, 1, 8, 2, 1,
776       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
777   }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)778   static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
779     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r, 2, 8, 2, 1,
780       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
781   }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)782   static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
783     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r, 3, 8, 2, 1,
784       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
785   }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)786   static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
787     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r, 4, 8, 2, 1,
788       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
789   }
qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)790   static void qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
791     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, 1, 16, 2, 1,
792       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
793   }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)794   static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
795     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, 2, 16, 2, 1,
796       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
797   }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)798   static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
799     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, 3, 16, 2, 1,
800       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
801   }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)802   static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
803     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r, 4, 16, 2, 1,
804       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
805   }
qs8_gemm_1x8c2s4__neon_mull(benchmark::State & state,const char * net)806   static void qs8_gemm_1x8c2s4__neon_mull(benchmark::State& state, const char* net) {
807     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, 1, 8, 2, 4,
808       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
809   }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,const char * net)810   static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, const char* net) {
811     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, 2, 8, 2, 4,
812       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
813   }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,const char * net)814   static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, const char* net) {
815     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, 3, 8, 2, 4,
816       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
817   }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,const char * net)818   static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, const char* net) {
819     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, 4, 8, 2, 4,
820       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
821   }
qs8_gemm_1x16c2s4__neon_mull(benchmark::State & state,const char * net)822   static void qs8_gemm_1x16c2s4__neon_mull(benchmark::State& state, const char* net) {
823     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, 1, 16, 2, 4,
824       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
825   }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,const char * net)826   static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, const char* net) {
827     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, 2, 16, 2, 4,
828       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
829   }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,const char * net)830   static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, const char* net) {
831     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, 3, 16, 2, 4,
832       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
833   }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,const char * net)834   static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, const char* net) {
835     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, 4, 16, 2, 4,
836       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
837   }
qs8_gemm_1x8c2s4__neon_mlal(benchmark::State & state,const char * net)838   static void qs8_gemm_1x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
839     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, 1, 8, 2, 4,
840       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
841   }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,const char * net)842   static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
843     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, 2, 8, 2, 4,
844       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
845   }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,const char * net)846   static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
847     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, 3, 8, 2, 4,
848       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
849   }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,const char * net)850   static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
851     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, 4, 8, 2, 4,
852       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
853   }
qs8_gemm_1x16c2s4__neon_mlal(benchmark::State & state,const char * net)854   static void qs8_gemm_1x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
855     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, 1, 16, 2, 4,
856       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
857   }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,const char * net)858   static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
859     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, 2, 16, 2, 4,
860       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
861   }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,const char * net)862   static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
863     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, 3, 16, 2, 4,
864       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
865   }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,const char * net)866   static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
867     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, 4, 16, 2, 4,
868       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
869   }
qs8_gemm_1x8c4__neon_mull_dup(benchmark::State & state,const char * net)870   static void qs8_gemm_1x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
871     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, 1, 8, 4, 1,
872       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
873   }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,const char * net)874   static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
875     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, 2, 8, 4, 1,
876       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
877   }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,const char * net)878   static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
879     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, 3, 8, 4, 1,
880       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
881   }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,const char * net)882   static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
883     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, 4, 8, 4, 1,
884       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
885   }
qs8_gemm_1x16c4__neon_mull_dup(benchmark::State & state,const char * net)886   static void qs8_gemm_1x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
887     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, 1, 16, 4, 1,
888       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
889   }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,const char * net)890   static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
891     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, 2, 16, 4, 1,
892       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
893   }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,const char * net)894   static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
895     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, 3, 16, 4, 1,
896       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
897   }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,const char * net)898   static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
899     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, 4, 16, 4, 1,
900       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
901   }
qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State & state,const char * net)902   static void qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
903     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, 1, 8, 4, 1,
904       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
905   }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,const char * net)906   static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
907     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, 2, 8, 4, 1,
908       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
909   }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,const char * net)910   static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
911     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, 3, 8, 4, 1,
912       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
913   }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,const char * net)914   static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
915     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, 4, 8, 4, 1,
916       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
917   }
qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State & state,const char * net)918   static void qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
919     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, 1, 16, 4, 1,
920       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
921   }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,const char * net)922   static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
923     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, 2, 16, 4, 1,
924       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
925   }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,const char * net)926   static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
927     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, 3, 16, 4, 1,
928       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
929   }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,const char * net)930   static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
931     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, 4, 16, 4, 1,
932       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
933   }
qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)934   static void qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
935     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, 1, 8, 4, 1,
936       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
937   }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)938   static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
939     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, 2, 8, 4, 1,
940       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
941   }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)942   static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
943     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, 3, 8, 4, 1,
944       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
945   }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)946   static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
947     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, 4, 8, 4, 1,
948       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
949   }
qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)950   static void qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
951     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r, 1, 16, 4, 1,
952       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
953   }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)954   static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
955     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r, 2, 16, 4, 1,
956       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
957   }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)958   static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
959     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r, 3, 16, 4, 1,
960       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
961   }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)962   static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
963     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, 4, 16, 4, 1,
964       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
965   }
qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)966   static void qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
967     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, 1, 8, 4, 1,
968       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
969   }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)970   static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
971     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, 2, 8, 4, 1,
972       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
973   }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)974   static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
975     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, 3, 8, 4, 1,
976       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
977   }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)978   static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
979     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, 4, 8, 4, 1,
980       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
981   }
qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)982   static void qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
983     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r, 1, 16, 4, 1,
984       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
985   }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)986   static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
987     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r, 2, 16, 4, 1,
988       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
989   }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)990   static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
991     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r, 3, 16, 4, 1,
992       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
993   }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)994   static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
995     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, 4, 16, 4, 1,
996       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
997   }
qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)998   static void qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
999     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, 1, 8, 4, 1,
1000       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1001   }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)1002   static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1003     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r, 2, 8, 4, 1,
1004       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1005   }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)1006   static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1007     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r, 3, 8, 4, 1,
1008       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1009   }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)1010   static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1011     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r, 4, 8, 4, 1,
1012       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1013   }
qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1014   static void qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1015     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, 1, 16, 4, 1,
1016       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1017   }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1018   static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1019     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, 2, 16, 4, 1,
1020       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1021   }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1022   static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1023     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, 3, 16, 4, 1,
1024       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1025   }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1026   static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1027     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, 4, 16, 4, 1,
1028       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1029   }
qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1030   static void qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1031     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, 1, 8, 4, 1,
1032       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1033   }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1034   static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1035     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r, 2, 8, 4, 1,
1036       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1037   }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1038   static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1039     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r, 3, 8, 4, 1,
1040       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1041   }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1042   static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1043     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r, 4, 8, 4, 1,
1044       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1045   }
qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1046   static void qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1047     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, 1, 16, 4, 1,
1048       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1049   }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1050   static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1051     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, 2, 16, 4, 1,
1052       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1053   }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1054   static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1055     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, 3, 16, 4, 1,
1056       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1057   }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1058   static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1059     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, 4, 16, 4, 1,
1060       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1061   }
qs8_gemm_1x8c8__neon_mull(benchmark::State & state,const char * net)1062   static void qs8_gemm_1x8c8__neon_mull(benchmark::State& state, const char* net) {
1063     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull, 1, 8, 8, 1,
1064       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1065   }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,const char * net)1066   static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, const char* net) {
1067     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull, 2, 8, 8, 1,
1068       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1069   }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,const char * net)1070   static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, const char* net) {
1071     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull, 3, 8, 8, 1,
1072       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1073   }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,const char * net)1074   static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, const char* net) {
1075     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull, 4, 8, 8, 1,
1076       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1077   }
qs8_gemm_1x16c8__neon_mull(benchmark::State & state,const char * net)1078   static void qs8_gemm_1x16c8__neon_mull(benchmark::State& state, const char* net) {
1079     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull, 1, 16, 8, 1,
1080       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1081   }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,const char * net)1082   static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, const char* net) {
1083     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull, 2, 16, 8, 1,
1084       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1085   }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,const char * net)1086   static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, const char* net) {
1087     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull, 3, 16, 8, 1,
1088       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1089   }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,const char * net)1090   static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, const char* net) {
1091     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull, 4, 16, 8, 1,
1092       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1093   }
qs8_gemm_1x8c8__neon_mlal(benchmark::State & state,const char * net)1094   static void qs8_gemm_1x8c8__neon_mlal(benchmark::State& state, const char* net) {
1095     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, 1, 8, 8, 1,
1096       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1097   }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,const char * net)1098   static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, const char* net) {
1099     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, 2, 8, 8, 1,
1100       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1101   }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,const char * net)1102   static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, const char* net) {
1103     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, 3, 8, 8, 1,
1104       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1105   }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,const char * net)1106   static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, const char* net) {
1107     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, 4, 8, 8, 1,
1108       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1109   }
qs8_gemm_1x16c8__neon_mlal(benchmark::State & state,const char * net)1110   static void qs8_gemm_1x16c8__neon_mlal(benchmark::State& state, const char* net) {
1111     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal, 1, 16, 8, 1,
1112       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1113   }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,const char * net)1114   static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, const char* net) {
1115     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal, 2, 16, 8, 1,
1116       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1117   }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,const char * net)1118   static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, const char* net) {
1119     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, 3, 16, 8, 1,
1120       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1121   }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,const char * net)1122   static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, const char* net) {
1123     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, 4, 16, 8, 1,
1124       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1125   }
qs8_gemm_1x8c16__neon_mlal(benchmark::State & state,const char * net)1126   static void qs8_gemm_1x8c16__neon_mlal(benchmark::State& state, const char* net) {
1127     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, 1, 8, 16, 1,
1128       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1129   }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,const char * net)1130   static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, const char* net) {
1131     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, 2, 8, 16, 1,
1132       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1133   }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,const char * net)1134   static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, const char* net) {
1135     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, 3, 8, 16, 1,
1136       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1137   }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,const char * net)1138   static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, const char* net) {
1139     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, 4, 8, 16, 1,
1140       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1141   }
qs8_gemm_1x16c16__neon_mlal(benchmark::State & state,const char * net)1142   static void qs8_gemm_1x16c16__neon_mlal(benchmark::State& state, const char* net) {
1143     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, 1, 16, 16, 1,
1144       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1145   }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,const char * net)1146   static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, const char* net) {
1147     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, 2, 16, 16, 1,
1148       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1149   }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,const char * net)1150   static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, const char* net) {
1151     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal, 3, 16, 16, 1,
1152       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1153   }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,const char * net)1154   static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, const char* net) {
1155     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal, 4, 16, 16, 1,
1156       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1157   }
1158 
// Each statement below passes one static wrapper function to BENCHMARK_GEMM.
// NOTE(review): BENCHMARK_GEMM is not defined in this chunk (presumably it comes from
// "bench/gemm.h", which this file includes) — confirm what it expands to.
// The c2 and ld4r wrappers named here are defined outside this chunk.

// c4 tiles, mull_dup kernels
1159   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_dup)1160   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_dup)
1161   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_dup)
1162   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_dup)
1163   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_dup)
1164   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_dup)
1165   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_dup)
1166   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_dup)
// c4 tiles, mlal_dup kernels
1167   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_dup)
1168   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_dup)
1169   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_dup)
1170   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_dup)
1171   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_dup)
1172   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_dup)
1173   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_dup)
1174   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_dup)
// c4 tiles, mull_ld1r kernels
1175   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld1r)
1176   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld1r)
1177   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld1r)
1178   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld1r)
1179   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld1r)
1180   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld1r)
1181   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld1r)
1182   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld1r)
// c4 tiles, mlal_ld1r kernels
1183   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld1r)
1184   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld1r)
1185   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld1r)
1186   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld1r)
1187   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld1r)
1188   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld1r)
1189   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld1r)
1190   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld1r)
// c4 tiles, mull_ld2r kernels
1191   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld2r)
1192   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld2r)
1193   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld2r)
1194   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld2r)
1195   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld2r)
1196   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld2r)
1197   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld2r)
1198   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld2r)
// c4 tiles, mlal_ld2r kernels
1199   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld2r)
1200   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld2r)
1201   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld2r)
1202   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld2r)
1203   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld2r)
1204   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld2r)
1205   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld2r)
1206   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld2r)
// c2 tiles, mull_dup kernels
1207   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_dup)
1208   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_dup)
1209   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_dup)
1210   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_dup)
1211   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_dup)
1212   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_dup)
1213   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_dup)
1214   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_dup)
// c2 tiles, mlal_dup kernels
1215   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_dup)
1216   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_dup)
1217   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_dup)
1218   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_dup)
1219   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_dup)
1220   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_dup)
1221   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_dup)
1222   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_dup)
// c2 tiles, mull_ld1r kernels
1223   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld1r)
1224   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld1r)
1225   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld1r)
1226   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld1r)
1227   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld1r)
1228   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld1r)
1229   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld1r)
1230   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld1r)
// c2 tiles, mlal_ld1r kernels
1231   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld1r)
1232   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld1r)
1233   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld1r)
1234   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld1r)
1235   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld1r)
1236   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld1r)
1237   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld1r)
1238   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld1r)
// c2 tiles, mull_ld2r kernels
1239   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld2r)
1240   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld2r)
1241   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld2r)
1242   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld2r)
1243   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld2r)
1244   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld2r)
1245   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld2r)
1246   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld2r)
// c2 tiles, mlal_ld2r kernels
1247   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld2r)
1248   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld2r)
1249   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld2r)
1250   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld2r)
1251   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld2r)
1252   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld2r)
1253   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld2r)
1254   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld2r)
// c2 tiles, mull_ld4r kernels
1255   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld4r)
1256   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld4r)
1257   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld4r)
1258   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld4r)
1259   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld4r)
1260   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld4r)
1261   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld4r)
1262   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld4r)
// c2 tiles, mlal_ld4r kernels
1263   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld4r)
1264   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld4r)
1265   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld4r)
1266   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld4r)
1267   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld4r)
1268   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld4r)
1269   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld4r)
1270   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld4r)
// c2s4 tiles, mull kernels
1271   BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mull)
1272   BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mull)
1273   BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mull)
1274   BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mull)
1275   BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mull)
1276   BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mull)
1277   BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mull)
1278   BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mull)
// c2s4 tiles, mlal kernels
1279   BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mlal)
1280   BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mlal)
1281   BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mlal)
1282   BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mlal)
1283   BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mlal)
1284   BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mlal)
1285   BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mlal)
1286   BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mlal)
1287   BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane)
1288   BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane)
1289   BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane)
1290   BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
1291   BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane)
1292   BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane)
1293   BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
1294   BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane)
1295   BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane)
1296   BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane)
1297   BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane_prfm)
1298   BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane_prfm)
1299   BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane_prfm)
1300   BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane_prfm)
1301   BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane_prfm)
1302   BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane_prfm)
1303   BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane_prfm)
1304   BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane_prfm)
1305   BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane_prfm)
1306   BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane_prfm)
1307   BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mull)
1308   BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mull)
1309   BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mull)
1310   BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mull)
1311   BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mull)
1312   BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mull)
1313   BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mull)
1314   BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mull)
1315   BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mlal)
1316   BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mlal)
1317   BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mlal)
1318   BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mlal)
1319   BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mlal)
1320   BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mlal)
1321   BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mlal)
1322   BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mlal)
1323   BENCHMARK_GEMM(qs8_gemm_1x8c16__neon_mlal)
1324   BENCHMARK_GEMM(qs8_gemm_2x8c16__neon_mlal)
1325   BENCHMARK_GEMM(qs8_gemm_3x8c16__neon_mlal)
1326   BENCHMARK_GEMM(qs8_gemm_4x8c16__neon_mlal)
1327   BENCHMARK_GEMM(qs8_gemm_1x16c16__neon_mlal)
1328   BENCHMARK_GEMM(qs8_gemm_2x16c16__neon_mlal)
1329   BENCHMARK_GEMM(qs8_gemm_3x16c16__neon_mlal)
1330   BENCHMARK_GEMM(qs8_gemm_4x16c16__neon_mlal)
1331 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1332 
1333 
1334 #if XNN_ARCH_ARM
1335   static void qs8_gemm_1x1c4__armsimd32(benchmark::State& state, const char* net) {
1336     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, 1, 1, 4, 1,
1337       xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1338   }
qs8_gemm_2x1c4__armsimd32(benchmark::State & state,const char * net)1339   static void qs8_gemm_2x1c4__armsimd32(benchmark::State& state, const char* net) {
1340     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, 2, 1, 4, 1,
1341       xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1342   }
qs8_gemm_1x2c4__armsimd32(benchmark::State & state,const char * net)1343   static void qs8_gemm_1x2c4__armsimd32(benchmark::State& state, const char* net) {
1344     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, 1, 2, 4, 1,
1345       xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1346   }
qs8_gemm_2x2c4__armsimd32(benchmark::State & state,const char * net)1347   static void qs8_gemm_2x2c4__armsimd32(benchmark::State& state, const char* net) {
1348     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, 2, 2, 4, 1,
1349       xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1350   }
1351 
  // Applies BENCHMARK_GEMM to each of the four armsimd32 wrappers defined above.
  // NOTE(review): BENCHMARK_GEMM is not defined in this chunk (presumably bench/gemm.h);
  // it appears to register the wrapper with the benchmark framework - confirm there.
1352   BENCHMARK_GEMM(qs8_gemm_1x1c4__armsimd32)
BENCHMARK_GEMM(qs8_gemm_2x1c4__armsimd32)1353   BENCHMARK_GEMM(qs8_gemm_2x1c4__armsimd32)
1354   BENCHMARK_GEMM(qs8_gemm_1x2c4__armsimd32)
1355   BENCHMARK_GEMM(qs8_gemm_2x2c4__armsimd32)
1356 #endif  // XNN_ARCH_ARM
1357 
1358 
1359 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1360   static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
1361     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, 2, 16, 8, 1,
1362       xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1363   }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,const char * net)1364   static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
1365     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, 3, 16, 8, 1,
1366       xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1367   }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,const char * net)1368   static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
1369     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, 4, 16, 8, 1,
1370       xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1371   }
1372 
qs8_gemm_2x8c8__avx2(benchmark::State & state,const char * net)1373   static void qs8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
1374     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1375       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1376   }
qs8_gemm_3x8c8__avx2(benchmark::State & state,const char * net)1377   static void qs8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
1378     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1379       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1380   }
1381 
qs8_gemm_xw_2x8c8__avx2(benchmark::State & state,const char * net)1382   static void qs8_gemm_xw_2x8c8__avx2(benchmark::State& state, const char* net) {
1383     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1384       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1385   }
qs8_gemm_xw_3x8c8__avx2(benchmark::State & state,const char * net)1386   static void qs8_gemm_xw_3x8c8__avx2(benchmark::State& state, const char* net) {
1387     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1388       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1389   }
1390 
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,const char * net)1391   static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
1392     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
1393       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1394   }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,const char * net)1395   static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
1396     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
1397       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1398   }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,const char * net)1399   static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
1400     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
1401       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1402   }
1403 
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,const char * net)1404   static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
1405     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
1406       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1407   }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,const char * net)1408   static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
1409     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
1410       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1411   }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,const char * net)1412   static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
1413     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
1414       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1415   }
1416 
qs8_gemm_xw_2x4c2__xop(benchmark::State & state,const char * net)1417   static void qs8_gemm_xw_2x4c2__xop(benchmark::State& state, const char* net) {
1418     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, 2, 4, 2, 1,
1419       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1420   }
qs8_gemm_xw_3x4c2__xop(benchmark::State & state,const char * net)1421   static void qs8_gemm_xw_3x4c2__xop(benchmark::State& state, const char* net) {
1422     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, 3, 4, 2, 1,
1423       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1424   }
qs8_gemm_xw_4x4c2__xop(benchmark::State & state,const char * net)1425   static void qs8_gemm_xw_4x4c2__xop(benchmark::State& state, const char* net) {
1426     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, 4, 4, 2, 1,
1427       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1428   }
1429 
qs8_gemm_2x4c2s4__xop_ld64(benchmark::State & state,const char * net)1430   static void qs8_gemm_2x4c2s4__xop_ld64(benchmark::State& state, const char* net) {
1431     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64, 2, 4, 2, 4,
1432       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1433   }
qs8_gemm_3x4c2s4__xop_ld64(benchmark::State & state,const char * net)1434   static void qs8_gemm_3x4c2s4__xop_ld64(benchmark::State& state, const char* net) {
1435     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld64, 3, 4, 2, 4,
1436       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1437   }
qs8_gemm_4x4c2s4__xop_ld64(benchmark::State & state,const char * net)1438   static void qs8_gemm_4x4c2s4__xop_ld64(benchmark::State& state, const char* net) {
1439     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, 4, 4, 2, 4,
1440       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1441   }
1442 
qs8_gemm_2x4c2s4__xop_ld128(benchmark::State & state,const char * net)1443   static void qs8_gemm_2x4c2s4__xop_ld128(benchmark::State& state, const char* net) {
1444     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld128, 2, 4, 2, 4,
1445       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1446   }
qs8_gemm_3x4c2s4__xop_ld128(benchmark::State & state,const char * net)1447   static void qs8_gemm_3x4c2s4__xop_ld128(benchmark::State& state, const char* net) {
1448     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128, 3, 4, 2, 4,
1449       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1450   }
qs8_gemm_4x4c2s4__xop_ld128(benchmark::State & state,const char * net)1451   static void qs8_gemm_4x4c2s4__xop_ld128(benchmark::State& state, const char* net) {
1452     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, 4, 4, 2, 4,
1453       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1454   }
1455 
qs8_gemm_xw_2x4c2s4__xop(benchmark::State & state,const char * net)1456   static void qs8_gemm_xw_2x4c2s4__xop(benchmark::State& state, const char* net) {
1457     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2s4__xop, 2, 4, 2, 4,
1458       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1459   }
qs8_gemm_xw_3x4c2s4__xop(benchmark::State & state,const char * net)1460   static void qs8_gemm_xw_3x4c2s4__xop(benchmark::State& state, const char* net) {
1461     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2s4__xop, 3, 4, 2, 4,
1462       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1463   }
qs8_gemm_xw_4x4c2s4__xop(benchmark::State & state,const char * net)1464   static void qs8_gemm_xw_4x4c2s4__xop(benchmark::State& state, const char* net) {
1465     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2s4__xop, 4, 4, 2, 4,
1466       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1467   }
1468 
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,const char * net)1469   static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
1470     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
1471       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1472   }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,const char * net)1473   static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
1474     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
1475       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1476   }
1477 
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,const char * net)1478   static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
1479     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
1480       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1481   }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,const char * net)1482   static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
1483     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
1484       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1485   }
1486 
qs8_gemm_xw_2x4c8__xop(benchmark::State & state,const char * net)1487   static void qs8_gemm_xw_2x4c8__xop(benchmark::State& state, const char* net) {
1488     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, 2, 4, 8, 1,
1489       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1490   }
qs8_gemm_xw_3x4c8__xop(benchmark::State & state,const char * net)1491   static void qs8_gemm_xw_3x4c8__xop(benchmark::State& state, const char* net) {
1492     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, 3, 4, 8, 1,
1493       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1494   }
1495 
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,const char * net)1496   static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
1497     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
1498       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1499   }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,const char * net)1500   static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
1501     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
1502       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1503   }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,const char * net)1504   static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
1505     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
1506       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1507   }
1508 
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,const char * net)1509   static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
1510     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
1511       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1512   }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,const char * net)1513   static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
1514     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
1515       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1516   }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,const char * net)1517   static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
1518     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
1519       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1520   }
1521 
qs8_gemm_xw_2x4c2__avx(benchmark::State & state,const char * net)1522   static void qs8_gemm_xw_2x4c2__avx(benchmark::State& state, const char* net) {
1523     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, 2, 4, 2, 1,
1524       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1525   }
qs8_gemm_xw_3x4c2__avx(benchmark::State & state,const char * net)1526   static void qs8_gemm_xw_3x4c2__avx(benchmark::State& state, const char* net) {
1527     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, 3, 4, 2, 1,
1528       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1529   }
qs8_gemm_xw_4x4c2__avx(benchmark::State & state,const char * net)1530   static void qs8_gemm_xw_4x4c2__avx(benchmark::State& state, const char* net) {
1531     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, 4, 4, 2, 1,
1532       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1533   }
1534 
qs8_gemm_2x4c2s4__avx_ld64(benchmark::State & state,const char * net)1535   static void qs8_gemm_2x4c2s4__avx_ld64(benchmark::State& state, const char* net) {
1536     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64, 2, 4, 2, 4,
1537       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1538   }
qs8_gemm_3x4c2s4__avx_ld64(benchmark::State & state,const char * net)1539   static void qs8_gemm_3x4c2s4__avx_ld64(benchmark::State& state, const char* net) {
1540     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64, 3, 4, 2, 4,
1541       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1542   }
qs8_gemm_4x4c2s4__avx_ld64(benchmark::State & state,const char * net)1543   static void qs8_gemm_4x4c2s4__avx_ld64(benchmark::State& state, const char* net) {
1544     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64, 4, 4, 2, 4,
1545       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1546   }
1547 
qs8_gemm_2x4c2s4__avx_ld128(benchmark::State & state,const char * net)1548   static void qs8_gemm_2x4c2s4__avx_ld128(benchmark::State& state, const char* net) {
1549     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128, 2, 4, 2, 4,
1550       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1551   }
qs8_gemm_3x4c2s4__avx_ld128(benchmark::State & state,const char * net)1552   static void qs8_gemm_3x4c2s4__avx_ld128(benchmark::State& state, const char* net) {
1553     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128, 3, 4, 2, 4,
1554       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1555   }
qs8_gemm_4x4c2s4__avx_ld128(benchmark::State & state,const char * net)1556   static void qs8_gemm_4x4c2s4__avx_ld128(benchmark::State& state, const char* net) {
1557     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128, 4, 4, 2, 4,
1558       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1559   }
1560 
qs8_gemm_xw_2x4c2s4__avx(benchmark::State & state,const char * net)1561   static void qs8_gemm_xw_2x4c2s4__avx(benchmark::State& state, const char* net) {
1562     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2s4__avx, 2, 4, 2, 4,
1563       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1564   }
qs8_gemm_xw_3x4c2s4__avx(benchmark::State & state,const char * net)1565   static void qs8_gemm_xw_3x4c2s4__avx(benchmark::State& state, const char* net) {
1566     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2s4__avx, 3, 4, 2, 4,
1567       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1568   }
qs8_gemm_xw_4x4c2s4__avx(benchmark::State & state,const char * net)1569   static void qs8_gemm_xw_4x4c2s4__avx(benchmark::State& state, const char* net) {
1570     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2s4__avx, 4, 4, 2, 4,
1571       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1572   }
1573 
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,const char * net)1574   static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
1575     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
1576       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1577   }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,const char * net)1578   static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
1579     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
1580       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1581   }
1582 
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,const char * net)1583   static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
1584     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
1585       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1586   }
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,const char * net)1587   static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
1588     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
1589       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1590   }
1591 
qs8_gemm_xw_2x4c8__avx(benchmark::State & state,const char * net)1592   static void qs8_gemm_xw_2x4c8__avx(benchmark::State& state, const char* net) {
1593     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, 2, 4, 8, 1,
1594       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1595   }
qs8_gemm_xw_3x4c8__avx(benchmark::State & state,const char * net)1596   static void qs8_gemm_xw_3x4c8__avx(benchmark::State& state, const char* net) {
1597     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, 3, 4, 8, 1,
1598       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1599   }
1600 
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,const char * net)1601   static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1602     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
1603       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1604   }
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,const char * net)1605   static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1606     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
1607       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1608   }
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,const char * net)1609   static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1610     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
1611       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1612   }
1613 
qs8_gemm_2x4c2__sse41_ld128(benchmark::State & state,const char * net)1614   static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1615     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
1616       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1617   }
qs8_gemm_3x4c2__sse41_ld128(benchmark::State & state,const char * net)1618   static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1619     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
1620       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1621   }
qs8_gemm_4x4c2__sse41_ld128(benchmark::State & state,const char * net)1622   static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1623     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
1624       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1625   }
1626 
qs8_gemm_xw_2x4c2__sse41(benchmark::State & state,const char * net)1627   static void qs8_gemm_xw_2x4c2__sse41(benchmark::State& state, const char* net) {
1628     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, 2, 4, 2, 1,
1629       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1630   }
qs8_gemm_xw_3x4c2__sse41(benchmark::State & state,const char * net)1631   static void qs8_gemm_xw_3x4c2__sse41(benchmark::State& state, const char* net) {
1632     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, 3, 4, 2, 1,
1633       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1634   }
qs8_gemm_xw_4x4c2__sse41(benchmark::State & state,const char * net)1635   static void qs8_gemm_xw_4x4c2__sse41(benchmark::State& state, const char* net) {
1636     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, 4, 4, 2, 1,
1637       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1638   }
1639 
qs8_gemm_2x4c2s4__sse41_ld64(benchmark::State & state,const char * net)1640   static void qs8_gemm_2x4c2s4__sse41_ld64(benchmark::State& state, const char* net) {
1641     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld64, 2, 4, 2, 4,
1642       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1643   }
qs8_gemm_3x4c2s4__sse41_ld64(benchmark::State & state,const char * net)1644   static void qs8_gemm_3x4c2s4__sse41_ld64(benchmark::State& state, const char* net) {
1645     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld64, 3, 4, 2, 4,
1646       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1647   }
qs8_gemm_4x4c2s4__sse41_ld64(benchmark::State & state,const char * net)1648   static void qs8_gemm_4x4c2s4__sse41_ld64(benchmark::State& state, const char* net) {
1649     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, 4, 4, 2, 4,
1650       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1651   }
1652 
qs8_gemm_2x4c2s4__sse41_ld128(benchmark::State & state,const char * net)1653   static void qs8_gemm_2x4c2s4__sse41_ld128(benchmark::State& state, const char* net) {
1654     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128, 2, 4, 2, 4,
1655       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1656   }
qs8_gemm_3x4c2s4__sse41_ld128(benchmark::State & state,const char * net)1657   static void qs8_gemm_3x4c2s4__sse41_ld128(benchmark::State& state, const char* net) {
1658     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128, 3, 4, 2, 4,
1659       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1660   }
qs8_gemm_4x4c2s4__sse41_ld128(benchmark::State & state,const char * net)1661   static void qs8_gemm_4x4c2s4__sse41_ld128(benchmark::State& state, const char* net) {
1662     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld128, 4, 4, 2, 4,
1663       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1664   }
1665 
qs8_gemm_xw_2x4c2s4__sse41(benchmark::State & state,const char * net)1666   static void qs8_gemm_xw_2x4c2s4__sse41(benchmark::State& state, const char* net) {
1667     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2s4__sse41, 2, 4, 2, 4,
1668       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1669   }
qs8_gemm_xw_3x4c2s4__sse41(benchmark::State & state,const char * net)1670   static void qs8_gemm_xw_3x4c2s4__sse41(benchmark::State& state, const char* net) {
1671     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2s4__sse41, 3, 4, 2, 4,
1672       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1673   }
qs8_gemm_xw_4x4c2s4__sse41(benchmark::State & state,const char * net)1674   static void qs8_gemm_xw_4x4c2s4__sse41(benchmark::State& state, const char* net) {
1675     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2s4__sse41, 4, 4, 2, 4,
1676       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1677   }
1678 
qs8_gemm_2x4c8__sse41_ld64(benchmark::State & state,const char * net)1679   static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
1680     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
1681       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1682   }
qs8_gemm_3x4c8__sse41_ld64(benchmark::State & state,const char * net)1683   static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
1684     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
1685       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1686   }
1687 
qs8_gemm_2x4c8__sse41_ld128(benchmark::State & state,const char * net)1688   static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1689     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
1690       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1691   }
qs8_gemm_3x4c8__sse41_ld128(benchmark::State & state,const char * net)1692   static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1693     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
1694       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1695   }
1696 
qs8_gemm_xw_2x4c8__sse41(benchmark::State & state,const char * net)1697   static void qs8_gemm_xw_2x4c8__sse41(benchmark::State& state, const char* net) {
1698     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, 2, 4, 8, 1,
1699       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1700   }
qs8_gemm_xw_3x4c8__sse41(benchmark::State & state,const char * net)1701   static void qs8_gemm_xw_3x4c8__sse41(benchmark::State& state, const char* net) {
1702     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, 3, 4, 8, 1,
1703       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1704   }
1705 
qs8_gemm_2x4c8__ssse3_ld64(benchmark::State & state,const char * net)1706   static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
1707     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, 2, 4, 8, 1,
1708       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1709   }
qs8_gemm_3x4c8__ssse3_ld64(benchmark::State & state,const char * net)1710   static void qs8_gemm_3x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
1711     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, 3, 4, 8, 1,
1712       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1713   }
1714 
qs8_gemm_2x4c8__ssse3_ld128(benchmark::State & state,const char * net)1715   static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, const char* net) {
1716     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1,
1717       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1718   }
qs8_gemm_3x4c8__ssse3_ld128(benchmark::State & state,const char * net)1719   static void qs8_gemm_3x4c8__ssse3_ld128(benchmark::State& state, const char* net) {
1720     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, 3, 4, 8, 1,
1721       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1722   }
1723 
qs8_gemm_xw_2x4c8__ssse3(benchmark::State & state,const char * net)1724   static void qs8_gemm_xw_2x4c8__ssse3(benchmark::State& state, const char* net) {
1725     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
1726       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
1727   }
qs8_gemm_xw_3x4c8__ssse3(benchmark::State & state,const char * net)1728   static void qs8_gemm_xw_3x4c8__ssse3(benchmark::State& state, const char* net) {
1729     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, 3, 4, 8, 1,
1730       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
1731   }
1732 
qs8_gemm_2x4c2__sse2_ld64(benchmark::State & state,const char * net)1733   static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1734     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, 2, 4, 2, 1,
1735       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1736   }
qs8_gemm_3x4c2__sse2_ld64(benchmark::State & state,const char * net)1737   static void qs8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1738     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, 3, 4, 2, 1,
1739       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1740   }
qs8_gemm_4x4c2__sse2_ld64(benchmark::State & state,const char * net)1741   static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1742     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1,
1743       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1744   }
1745 
qs8_gemm_2x4c2__sse2_ld128(benchmark::State & state,const char * net)1746   static void qs8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1747     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, 2, 4, 2, 1,
1748       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1749   }
qs8_gemm_3x4c2__sse2_ld128(benchmark::State & state,const char * net)1750   static void qs8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1751     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, 3, 4, 2, 1,
1752       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1753   }
qs8_gemm_4x4c2__sse2_ld128(benchmark::State & state,const char * net)1754   static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1755     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1,
1756       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1757   }
1758 
qs8_gemm_xw_2x4c2__sse2(benchmark::State & state,const char * net)1759   static void qs8_gemm_xw_2x4c2__sse2(benchmark::State& state, const char* net) {
1760     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, 2, 4, 2, 1,
1761       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1762   }
qs8_gemm_xw_3x4c2__sse2(benchmark::State & state,const char * net)1763   static void qs8_gemm_xw_3x4c2__sse2(benchmark::State& state, const char* net) {
1764     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, 3, 4, 2, 1,
1765       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1766   }
qs8_gemm_xw_4x4c2__sse2(benchmark::State & state,const char * net)1767   static void qs8_gemm_xw_4x4c2__sse2(benchmark::State& state, const char* net) {
1768     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, 4, 4, 2, 1,
1769       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1770   }
1771 
qs8_gemm_2x4c2s4__sse2_ld64(benchmark::State & state,const char * net)1772   static void qs8_gemm_2x4c2s4__sse2_ld64(benchmark::State& state, const char* net) {
1773     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, 2, 4, 2, 4,
1774       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1775   }
qs8_gemm_3x4c2s4__sse2_ld64(benchmark::State & state,const char * net)1776   static void qs8_gemm_3x4c2s4__sse2_ld64(benchmark::State& state, const char* net) {
1777     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld64, 3, 4, 2, 4,
1778       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1779   }
qs8_gemm_4x4c2s4__sse2_ld64(benchmark::State & state,const char * net)1780   static void qs8_gemm_4x4c2s4__sse2_ld64(benchmark::State& state, const char* net) {
1781     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld64, 4, 4, 2, 4,
1782       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1783   }
1784 
qs8_gemm_2x4c2s4__sse2_ld128(benchmark::State & state,const char * net)1785   static void qs8_gemm_2x4c2s4__sse2_ld128(benchmark::State& state, const char* net) {
1786     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, 2, 4, 2, 4,
1787       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1788   }
qs8_gemm_3x4c2s4__sse2_ld128(benchmark::State & state,const char * net)1789   static void qs8_gemm_3x4c2s4__sse2_ld128(benchmark::State& state, const char* net) {
1790     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128, 3, 4, 2, 4,
1791       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1792   }
qs8_gemm_4x4c2s4__sse2_ld128(benchmark::State & state,const char * net)1793   static void qs8_gemm_4x4c2s4__sse2_ld128(benchmark::State& state, const char* net) {
1794     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld128, 4, 4, 2, 4,
1795       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1796   }
1797 
qs8_gemm_xw_2x4c2s4__sse2(benchmark::State & state,const char * net)1798   static void qs8_gemm_xw_2x4c2s4__sse2(benchmark::State& state, const char* net) {
1799     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2s4__sse2, 2, 4, 2, 4,
1800       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1801   }
qs8_gemm_xw_3x4c2s4__sse2(benchmark::State & state,const char * net)1802   static void qs8_gemm_xw_3x4c2s4__sse2(benchmark::State& state, const char* net) {
1803     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2s4__sse2, 3, 4, 2, 4,
1804       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1805   }
qs8_gemm_xw_4x4c2s4__sse2(benchmark::State & state,const char * net)1806   static void qs8_gemm_xw_4x4c2s4__sse2(benchmark::State& state, const char* net) {
1807     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2s4__sse2, 4, 4, 2, 4,
1808       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1809   }
1810 
qs8_gemm_2x4c8__sse2_ld64(benchmark::State & state,const char * net)1811   static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1812     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, 2, 4, 8, 1,
1813       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1814   }
qs8_gemm_3x4c8__sse2_ld64(benchmark::State & state,const char * net)1815   static void qs8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1816     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, 3, 4, 8, 1,
1817       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1818   }
1819 
qs8_gemm_2x4c8__sse2_ld128(benchmark::State & state,const char * net)1820   static void qs8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1821     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, 2, 4, 8, 1,
1822       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1823   }
qs8_gemm_3x4c8__sse2_ld128(benchmark::State & state,const char * net)1824   static void qs8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1825     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, 3, 4, 8, 1,
1826       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1827   }
1828 
qs8_gemm_xw_2x4c8__sse2(benchmark::State & state,const char * net)1829   static void qs8_gemm_xw_2x4c8__sse2(benchmark::State& state, const char* net) {
1830     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, 2, 4, 8, 1,
1831       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1832   }
qs8_gemm_xw_3x4c8__sse2(benchmark::State & state,const char * net)1833   static void qs8_gemm_xw_3x4c8__sse2(benchmark::State& state, const char* net) {
1834     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, 3, 4, 8, 1,
1835       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1836   }
1837 
1838   BENCHMARK_GEMM(qs8_gemm_2x16c8__avx512skx)
BENCHMARK_GEMM(qs8_gemm_3x16c8__avx512skx)1839   BENCHMARK_GEMM(qs8_gemm_3x16c8__avx512skx)
1840   BENCHMARK_GEMM(qs8_gemm_4x16c8__avx512skx)
1841 
1842   BENCHMARK_GEMM(qs8_gemm_2x8c8__avx2)
1843   BENCHMARK_GEMM(qs8_gemm_3x8c8__avx2)
1844   BENCHMARK_GEMM(qs8_gemm_xw_2x8c8__avx2)
1845   BENCHMARK_GEMM(qs8_gemm_xw_3x8c8__avx2)
1846 
1847   BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld64)
1848   BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld64)
1849   BENCHMARK_GEMM(qs8_gemm_4x4c2__xop_ld64)
1850   BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld128)
1851   BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld128)
1852   BENCHMARK_GEMM(qs8_gemm_4x4c2__xop_ld128)
1853   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__xop)
1854   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__xop)
1855   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__xop)
1856   BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld64)
1857   BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld64)
1858   BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld128)
1859   BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld128)
1860   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__xop)
1861   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__xop)
1862 
1863   BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld64)
1864   BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld64)
1865   BENCHMARK_GEMM(qs8_gemm_4x4c2__avx_ld64)
1866   BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld128)
1867   BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld128)
1868   BENCHMARK_GEMM(qs8_gemm_4x4c2__avx_ld128)
1869   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__avx)
1870   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__avx)
1871   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__avx)
1872   BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld64)
1873   BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld64)
1874   BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld128)
1875   BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld128)
1876   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__avx)
1877   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__avx)
1878 
1879   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld64)
1880   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld64)
1881   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld64)
1882   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld128)
1883   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld128)
1884   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld128)
1885   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse41)
1886   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse41)
1887   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse41)
1888   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld64)
1889   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld64)
1890   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld128)
1891   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld128)
1892   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse41)
1893   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse41)
1894 
1895   BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld64)
1896   BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld64)
1897   BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld128)
1898   BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld128)
1899   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__ssse3)
1900   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__ssse3)
1901 
1902   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld64)
1903   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld64)
1904   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld64)
1905   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld128)
1906   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld128)
1907   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld128)
1908   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse2)
1909   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse2)
1910   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse2)
1911   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld64)
1912   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld64)
1913   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld128)
1914   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld128)
1915   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse2)
1916   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse2)
1917 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1918 
1919 
1920 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1921   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1922     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, 2, 4, 2, 1,
1923       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1924   }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1925   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1926     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, 3, 4, 2, 1,
1927       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1928   }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1929   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1930     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, 4, 4, 2, 1,
1931       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1932   }
1933 
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1934   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1935     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128, 2, 4, 2, 1,
1936       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1937   }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1938   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1939     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, 3, 4, 2, 1,
1940       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1941   }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1942   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1943     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128, 4, 4, 2, 1,
1944       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1945   }
1946 
qs8_gemm_xw_2x4c2__wasmsimd_dot16x2(benchmark::State & state,const char * net)1947   static void qs8_gemm_xw_2x4c2__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1948     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2, 2, 4, 2, 1,
1949       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1950   }
qs8_gemm_xw_3x4c2__wasmsimd_dot16x2(benchmark::State & state,const char * net)1951   static void qs8_gemm_xw_3x4c2__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1952     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2, 3, 4, 2, 1,
1953       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1954   }
qs8_gemm_xw_4x4c2__wasmsimd_dot16x2(benchmark::State & state,const char * net)1955   static void qs8_gemm_xw_4x4c2__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1956     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2, 4, 4, 2, 1,
1957       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1958   }
1959 
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1960   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1961     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, 2, 4, 2, 4,
1962       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1963   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1964   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1965     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, 3, 4, 2, 4,
1966       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1967   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1968   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1969     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, 4, 4, 2, 4,
1970       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1971   }
1972 
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1973   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1974     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, 2, 4, 2, 4,
1975       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1976   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1977   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1978     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, 3, 4, 2, 4,
1979       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1980   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1981   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1982     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, 4, 4, 2, 4,
1983       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1984   }
1985 
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1986   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1987     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, 2, 4, 8, 1,
1988       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1989   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1990   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1991     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, 3, 4, 8, 1,
1992       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1993   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1994   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1995     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, 4, 4, 8, 1,
1996       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1997   }
1998 
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1999   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
2000     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128, 2, 4, 8, 1,
2001       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
2002   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)2003   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
2004     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, 3, 4, 8, 1,
2005       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
2006   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)2007   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
2008     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, 4, 4, 8, 1,
2009       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
2010   }
2011 
qs8_gemm_xw_2x4c8__wasmsimd_dot16x2(benchmark::State & state,const char * net)2012   static void qs8_gemm_xw_2x4c8__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
2013     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2, 2, 4, 8, 1,
2014       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
2015   }
qs8_gemm_xw_3x4c8__wasmsimd_dot16x2(benchmark::State & state,const char * net)2016   static void qs8_gemm_xw_3x4c8__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
2017     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2, 3, 4, 8, 1,
2018       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
2019   }
qs8_gemm_xw_4x4c8__wasmsimd_dot16x2(benchmark::State & state,const char * net)2020   static void qs8_gemm_xw_4x4c8__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
2021     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2, 4, 4, 8, 1,
2022       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
2023   }
2024 
2025   BENCHMARK_GEMM(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64)2026   BENCHMARK_GEMM(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
2027   BENCHMARK_GEMM(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
2028   BENCHMARK_GEMM(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
2029   BENCHMARK_GEMM(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
2030   BENCHMARK_GEMM(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128)
2031   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__wasmsimd_dot16x2)
2032   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__wasmsimd_dot16x2)
2033   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__wasmsimd_dot16x2)
2034 
2035   BENCHMARK_GEMM(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
2036   BENCHMARK_GEMM(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
2037   BENCHMARK_GEMM(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
2038   BENCHMARK_GEMM(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
2039   BENCHMARK_GEMM(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
2040   BENCHMARK_GEMM(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)
2041 
2042   BENCHMARK_GEMM(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
2043   BENCHMARK_GEMM(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
2044   BENCHMARK_GEMM(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
2045   BENCHMARK_GEMM(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
2046   BENCHMARK_GEMM(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
2047   BENCHMARK_GEMM(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
2048   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__wasmsimd_dot16x2)
2049   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__wasmsimd_dot16x2)
2050   BENCHMARK_GEMM(qs8_gemm_xw_4x4c8__wasmsimd_dot16x2)
2051 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2052 
2053 
2054 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2055   static void qs8_gemm_2x2__wasm_fmagic(benchmark::State& state, const char* net) {
2056     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, 2, 2, 1, 1,
2057       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2058   }
qs8_gemm_3x2__wasm_fmagic(benchmark::State & state,const char * net)2059   static void qs8_gemm_3x2__wasm_fmagic(benchmark::State& state, const char* net) {
2060     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic, 3, 2, 1, 1,
2061       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2062   }
qs8_gemm_4x2__wasm_fmagic(benchmark::State & state,const char * net)2063   static void qs8_gemm_4x2__wasm_fmagic(benchmark::State& state, const char* net) {
2064     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, 4, 2, 1, 1,
2065       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2066   }
qs8_gemm_2x4__wasm_fmagic(benchmark::State & state,const char * net)2067   static void qs8_gemm_2x4__wasm_fmagic(benchmark::State& state, const char* net) {
2068     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, 2, 4, 1, 1,
2069       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2070   }
qs8_gemm_3x4__wasm_fmagic(benchmark::State & state,const char * net)2071   static void qs8_gemm_3x4__wasm_fmagic(benchmark::State& state, const char* net) {
2072     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic, 3, 4, 1, 1,
2073       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2074   }
qs8_gemm_4x4__wasm_fmagic(benchmark::State & state,const char * net)2075   static void qs8_gemm_4x4__wasm_fmagic(benchmark::State& state, const char* net) {
2076     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, 4, 4, 1, 1,
2077       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2078   }
2079 
2080   BENCHMARK_GEMM(qs8_gemm_2x2__wasm_fmagic)
BENCHMARK_GEMM(qs8_gemm_3x2__wasm_fmagic)2081   BENCHMARK_GEMM(qs8_gemm_3x2__wasm_fmagic)
2082   BENCHMARK_GEMM(qs8_gemm_4x2__wasm_fmagic)
2083   BENCHMARK_GEMM(qs8_gemm_2x4__wasm_fmagic)
2084   BENCHMARK_GEMM(qs8_gemm_3x4__wasm_fmagic)
2085   BENCHMARK_GEMM(qs8_gemm_4x4__wasm_fmagic)
2086 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2087 
2088 
2089 static void qs8_gemm_2x2__scalar_fmagic(benchmark::State& state, const char* net) {
2090   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, 2, 2, 1, 1,
2091     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2092 }
qs8_gemm_3x2__scalar_fmagic(benchmark::State & state,const char * net)2093 static void qs8_gemm_3x2__scalar_fmagic(benchmark::State& state, const char* net) {
2094   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic, 3, 2, 1, 1,
2095     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2096 }
qs8_gemm_4x2__scalar_fmagic(benchmark::State & state,const char * net)2097 static void qs8_gemm_4x2__scalar_fmagic(benchmark::State& state, const char* net) {
2098   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, 4, 2, 1, 1,
2099     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2100 }
qs8_gemm_2x4__scalar_fmagic(benchmark::State & state,const char * net)2101 static void qs8_gemm_2x4__scalar_fmagic(benchmark::State& state, const char* net) {
2102   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, 2, 4, 1, 1,
2103     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2104 }
qs8_gemm_3x4__scalar_fmagic(benchmark::State & state,const char * net)2105 static void qs8_gemm_3x4__scalar_fmagic(benchmark::State& state, const char* net) {
2106   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, 3, 4, 1, 1,
2107     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2108 }
qs8_gemm_4x4__scalar_fmagic(benchmark::State & state,const char * net)2109 static void qs8_gemm_4x4__scalar_fmagic(benchmark::State& state, const char* net) {
2110   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, 4, 4, 1, 1,
2111     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
2112 }
2113 
qs8_gemm_2x2__scalar_imagic(benchmark::State & state,const char * net)2114 static void qs8_gemm_2x2__scalar_imagic(benchmark::State& state, const char* net) {
2115   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, 2, 2, 1, 1,
2116     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
2117 }
qs8_gemm_3x2__scalar_imagic(benchmark::State & state,const char * net)2118 static void qs8_gemm_3x2__scalar_imagic(benchmark::State& state, const char* net) {
2119   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic, 3, 2, 1, 1,
2120     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
2121 }
qs8_gemm_4x2__scalar_imagic(benchmark::State & state,const char * net)2122 static void qs8_gemm_4x2__scalar_imagic(benchmark::State& state, const char* net) {
2123   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic, 4, 2, 1, 1,
2124     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
2125 }
qs8_gemm_2x4__scalar_imagic(benchmark::State & state,const char * net)2126 static void qs8_gemm_2x4__scalar_imagic(benchmark::State& state, const char* net) {
2127   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, 2, 4, 1, 1,
2128     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
2129 }
qs8_gemm_3x4__scalar_imagic(benchmark::State & state,const char * net)2130 static void qs8_gemm_3x4__scalar_imagic(benchmark::State& state, const char* net) {
2131   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, 3, 4, 1, 1,
2132     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
2133 }
qs8_gemm_4x4__scalar_imagic(benchmark::State & state,const char * net)2134 static void qs8_gemm_4x4__scalar_imagic(benchmark::State& state, const char* net) {
2135   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic, 4, 4, 1, 1,
2136     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
2137 }
2138 
qs8_gemm_2x2__scalar_lrintf(benchmark::State & state,const char * net)2139 static void qs8_gemm_2x2__scalar_lrintf(benchmark::State& state, const char* net) {
2140   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf, 2, 2, 1, 1,
2141     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
2142 }
qs8_gemm_3x2__scalar_lrintf(benchmark::State & state,const char * net)2143 static void qs8_gemm_3x2__scalar_lrintf(benchmark::State& state, const char* net) {
2144   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf, 3, 2, 1, 1,
2145     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
2146 }
qs8_gemm_4x2__scalar_lrintf(benchmark::State & state,const char * net)2147 static void qs8_gemm_4x2__scalar_lrintf(benchmark::State& state, const char* net) {
2148   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, 4, 2, 1, 1,
2149     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
2150 }
qs8_gemm_2x4__scalar_lrintf(benchmark::State & state,const char * net)2151 static void qs8_gemm_2x4__scalar_lrintf(benchmark::State& state, const char* net) {
2152   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, 2, 4, 1, 1,
2153     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
2154 }
qs8_gemm_3x4__scalar_lrintf(benchmark::State & state,const char * net)2155 static void qs8_gemm_3x4__scalar_lrintf(benchmark::State& state, const char* net) {
2156   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, 3, 4, 1, 1,
2157     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
2158 }
qs8_gemm_4x4__scalar_lrintf(benchmark::State & state,const char * net)2159 static void qs8_gemm_4x4__scalar_lrintf(benchmark::State& state, const char* net) {
2160   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, 4, 4, 1, 1,
2161     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
2162 }
2163 
// Register the scalar "fmagic" QS8 GEMM benchmark functions via BENCHMARK_GEMM.
// NOTE(review): the macro and the registered functions are defined outside this
// chunk (the macro presumably in bench/gemm.h) -- confirm what argument sets it
// expands to.
2164 BENCHMARK_GEMM(qs8_gemm_2x2__scalar_fmagic)
2165 BENCHMARK_GEMM(qs8_gemm_3x2__scalar_fmagic)
2166 BENCHMARK_GEMM(qs8_gemm_4x2__scalar_fmagic)
2167 BENCHMARK_GEMM(qs8_gemm_2x4__scalar_fmagic)
2168 BENCHMARK_GEMM(qs8_gemm_3x4__scalar_fmagic)
2169 BENCHMARK_GEMM(qs8_gemm_4x4__scalar_fmagic)
2170 
// Register the scalar "imagic" QS8 GEMM benchmark functions via BENCHMARK_GEMM.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/gemm.h) -- confirm what argument sets it expands to.
2171 BENCHMARK_GEMM(qs8_gemm_2x2__scalar_imagic)
2172 BENCHMARK_GEMM(qs8_gemm_3x2__scalar_imagic)
2173 BENCHMARK_GEMM(qs8_gemm_4x2__scalar_imagic)
2174 BENCHMARK_GEMM(qs8_gemm_2x4__scalar_imagic)
2175 BENCHMARK_GEMM(qs8_gemm_3x4__scalar_imagic)
2176 BENCHMARK_GEMM(qs8_gemm_4x4__scalar_imagic)
2177 
// Register the scalar "lrintf" QS8 GEMM benchmark functions via BENCHMARK_GEMM.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/gemm.h) -- confirm what argument sets it expands to.
2178 BENCHMARK_GEMM(qs8_gemm_2x2__scalar_lrintf)
2179 BENCHMARK_GEMM(qs8_gemm_3x2__scalar_lrintf)
2180 BENCHMARK_GEMM(qs8_gemm_4x2__scalar_lrintf)
2181 BENCHMARK_GEMM(qs8_gemm_2x4__scalar_lrintf)
2182 BENCHMARK_GEMM(qs8_gemm_3x4__scalar_lrintf)
2183 BENCHMARK_GEMM(qs8_gemm_4x4__scalar_lrintf)
2184 
2185 
// Register ruy_st (defined outside this chunk) only when the build defines
// BENCHMARK_RUY, the same macro that guards the "ruy/ruy.h" include at the top
// of the file.
2186 #ifdef BENCHMARK_RUY
2187 BENCHMARK_GEMM(ruy_st)
2188 #endif  // BENCHMARK_RUY
2189 
// Expand BENCHMARK_MAIN() unless XNNPACK_BENCHMARK_NO_MAIN is defined.
// NOTE(review): presumably the opt-out exists so another translation unit can
// supply the program entry point -- confirm against the build files.
2190 #ifndef XNNPACK_BENCHMARK_NO_MAIN
2191 BENCHMARK_MAIN();
2192 #endif
2193