1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cfloat>
8 #include <chrono>
9 #include <cmath>
10 #include <functional>
11 #include <limits>
12 #include <mutex>
13 #include <random>
14 #include <vector>
15
16 #include <cpuinfo.h>
17
18 #include <benchmark/benchmark.h>
19 #ifdef BENCHMARK_RUY
20 #include "ruy/ruy.h"
21 #endif // BENCHMARK_RUY
22 #include "bench/gemm.h"
23 #include "bench/utils.h"
24
25 #include <xnnpack.h>
26 #include <xnnpack/aligned-allocator.h>
27 #include <xnnpack/common.h>
28 #include <xnnpack/gemm.h>
29 #include <xnnpack/math.h>
30 #include <xnnpack/microfnptr.h>
31 #include <xnnpack/microparams-init.h>
32 #include <xnnpack/pack.h>
33
34
GEMMBenchmark(benchmark::State & state,xnn_qs8_gemm_minmax_ukernel_function gemm,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr,bool extended_weights=false)35 static void GEMMBenchmark(benchmark::State& state,
36 xnn_qs8_gemm_minmax_ukernel_function gemm,
37 size_t mr, size_t nr, size_t kr, size_t sr,
38 xnn_init_qs8_conv_minmax_params_fn init_params,
39 benchmark::utils::IsaCheckFunction isa_check = nullptr,
40 bool extended_weights = false)
41 {
42 if (!cpuinfo_initialize()) {
43 state.SkipWithError("cpuinfo initialization failed");
44 return;
45 }
46 if (isa_check && !isa_check(state)) {
47 return;
48 }
49
50 const size_t mc = state.range(0);
51 const size_t nc = state.range(1);
52 const size_t kc = state.range(2);
53
54 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
55 const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
56
57 std::random_device random_device;
58 auto rng = std::mt19937(random_device());
59 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
60 auto i8rng = std::bind(
61 std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
62
63 std::vector<int8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(int8_t));
64 std::generate(a.begin(), a.end(), std::ref(i8rng));
65 std::vector<int8_t> k(nc * kc);
66 std::generate(k.begin(), k.end(), std::ref(i8rng));
67 std::vector<int32_t> b(nc);
68 std::generate(b.begin(), b.end(), std::ref(i32rng));
69
70 const size_t w_element_size = extended_weights ? sizeof(int16_t) : sizeof(int8_t);
71 const size_t w_size = nc_stride * sizeof(int32_t) + kc_stride * nc_stride * w_element_size;
72 const size_t c_elements = mc * nc;
73 const size_t num_buffers = 1 +
74 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), w_size + c_elements * sizeof(int8_t));
75
76 std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
77 std::fill(w.begin(), w.end(), 0);
78 const xnn_qs8_packing_params packing_params = { 127 };
79 if (extended_weights) {
80 xnn_pack_qs8_gemm_xw_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
81 } else {
82 xnn_pack_qs8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
83 }
84 std::vector<int8_t> c(c_elements * num_buffers);
85 std::fill(c.begin(), c.end(), 0xA5);
86
87 union xnn_qs8_conv_minmax_params quantization_params;
88 init_params(&quantization_params, 0.75f, 127, -127, 126);
89
90 size_t buffer_index = 0;
91 for (auto _ : state) {
92 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
93 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
94 // - W is not in cache (for any cache level)
95 // - C is not in cache (for any cache level)
96 state.PauseTiming();
97 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
98 buffer_index = (buffer_index + 1) % num_buffers;
99 state.ResumeTiming();
100
101 for (uint32_t m = 0; m < mc; m += mr) {
102 const uint32_t mb = min(mc - m, mr);
103 for (uint32_t n = 0; n < nc; n += nr) {
104 const uint32_t nb = min(nc - n, nr);
105 gemm(
106 mb, nb, kc * sizeof(int8_t),
107 a.data() + m * kc, kc * sizeof(int8_t),
108 w.data() + w_size * buffer_index + n * (kc_stride * w_element_size + sizeof(int32_t)),
109 c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(int8_t), nr * sizeof(int8_t),
110 &quantization_params);
111 }
112 }
113 }
114
115 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
116 if (cpu_frequency != 0) {
117 state.counters["cpufreq"] = cpu_frequency;
118 }
119
120 state.counters["OPS"] = benchmark::Counter(
121 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
122 }
123
124 #ifdef BENCHMARK_RUY
RuyBenchmark(benchmark::State & state,size_t threads)125 static void RuyBenchmark(benchmark::State& state, size_t threads)
126 {
127 const size_t mc = state.range(0);
128 const size_t nc = state.range(1);
129 const size_t kc = state.range(2);
130
131 std::random_device random_device;
132 auto rng = std::mt19937(random_device());
133 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
134 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
135
136 const size_t num_buffers = 1 +
137 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
138 nc * (sizeof(int8_t) * (mc + kc) + sizeof(int32_t)));
139
140 std::vector<int8_t> a(mc * kc);
141 std::generate(a.begin(), a.end(), std::ref(u8rng));
142 std::vector<int8_t> k(num_buffers * nc * kc);
143 std::generate(k.begin(), k.end(), std::ref(u8rng));
144 std::vector<int32_t> b(num_buffers * nc);
145 std::generate(b.begin(), b.end(), std::ref(i32rng));
146 std::vector<int8_t> c(num_buffers * nc * mc);
147 std::fill(c.begin(), c.end(), std::nanf(""));
148
149 // Note: context must be static to avoid the cost of re-creating it for each benchmark.
150 static ruy::Context context;
151 context.set_max_num_threads(threads);
152
153 ruy::Matrix<int8_t> ruy_a;
154 ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
155 ruy_a.set_zero_point(127);
156 ruy::Matrix<int8_t> ruy_b;
157 ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
158 ruy_b.set_data(a.data());
159 ruy_b.set_zero_point(127);
160 ruy::Matrix<int8_t> ruy_c;
161 ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
162 ruy_c.set_zero_point(127);
163
164 ruy::MulParams<int32_t, int8_t> mul_params;
165 mul_params.set_multiplier_fixedpoint(0x40000000);
166
167 // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
168 // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
169 // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
170 // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
171 static std::once_flag warmup;
172 std::call_once(warmup, [&](){
173 auto start = std::chrono::steady_clock::now();
174 do {
175 ruy_a.set_data(k.data());
176 ruy_c.set_data(c.data());
177 mul_params.set_bias(b.data());
178
179 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
180 } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
181 });
182
183 size_t buffer_index = 0;
184 for (auto _ : state) {
185 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
186 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
187 // - K is not in cache (for any cache level)
188 // - B is not in cache (for any cache level)
189 // - C is not in cache (for any cache level)
190 state.PauseTiming();
191 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
192 buffer_index = (buffer_index + 1) % num_buffers;
193 state.ResumeTiming();
194
195 ruy_a.set_data(k.data() + buffer_index * nc * kc);
196 ruy_c.set_data(c.data() + buffer_index * mc * nc);
197 mul_params.set_bias(b.data() + buffer_index * nc);
198
199 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
200 }
201
202 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
203 if (cpu_frequency != 0) {
204 state.counters["cpufreq"] = cpu_frequency;
205 }
206
207 state.counters["OPS"] = benchmark::Counter(
208 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
209 }
210
ruy_st(benchmark::State & state,const char * net)211 static void ruy_st(benchmark::State& state, const char* net)
212 {
213 RuyBenchmark(state, 1);
214 }
215 #endif // BENCHMARK_RUY
216
217 #if XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
GEMMBenchmark(benchmark::State & state,xnn_jit_gemm_code_generator_function generator,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)218 static void GEMMBenchmark(benchmark::State& state,
219 xnn_jit_gemm_code_generator_function generator,
220 size_t mr, size_t nr, size_t kr, size_t sr,
221 xnn_init_qs8_conv_minmax_params_fn init_params,
222 benchmark::utils::IsaCheckFunction isa_check = nullptr)
223 {
224 xnn_initialize(/*allocator=*/nullptr);
225 xnn_code_buffer code_buffer;
226 xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
227 const size_t nc = state.range(1);
228 const size_t kc = state.range(2);
229 generator(&code_buffer, mr, nc % nr, kc, nullptr);
230 xnn_finalize_code_memory(&code_buffer);
231 GEMMBenchmark(
232 state,
233 reinterpret_cast<xnn_qs8_gemm_minmax_ukernel_function>(code_buffer.start),
234 mr, nr, kr, sr, init_params, isa_check);
235 xnn_release_code_memory(&code_buffer);
236 }
237
jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,const char * net)238 static void jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
239 GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
240 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
241 }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)242 static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
243 GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
244 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
245 }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)246 static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
247 GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
248 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
249 }
// Pass each JIT wrapper defined above to BENCHMARK_GEMM.
// NOTE(review): BENCHMARK_GEMM is not defined in this file (presumably bench/gemm.h); it presumably
// registers the function with the benchmark framework for a set of GEMM shapes - confirm.
250 BENCHMARK_GEMM(jit_qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)251 BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
252 BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
253 #endif // XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
254
255 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
256 static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
257 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
258 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
259 }
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,const char * net)260 static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, const char* net) {
261 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55, 4, 8, 4, 1,
262 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
263 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)264 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
265 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, 4, 8, 1, 1,
266 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
267 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)268 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
269 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, 4, 8, 1, 1,
270 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
271 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)272 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
273 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, 4, 8, 1, 1,
274 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
275 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)276 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
277 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 4, 8, 1, 1,
278 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
279 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)280 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
281 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
282 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
283 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)284 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
285 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
286 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
287 }
qs8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)288 static void qs8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
289 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, 1, 8, 1, 1,
290 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
291 }
qs8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)292 static void qs8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
293 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 1, 8, 1, 1,
294 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
295 }
296
// Pass each AArch32 assembly wrapper defined above to BENCHMARK_GEMM.
// NOTE(review): BENCHMARK_GEMM is not defined in this file (presumably bench/gemm.h); it presumably
// registers the function with the benchmark framework for a set of GEMM shapes - confirm.
297 BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)298 BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
299 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
300 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
301 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
302 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
303 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
304 BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
305 BENCHMARK_GEMM(qs8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7)
306 BENCHMARK_GEMM(qs8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
307 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
308
309 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
310 static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
311 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1,
312 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
313 }
qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)314 static void qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
315 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32, 1, 16, 4, 1,
316 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
317 }
qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)318 static void qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
319 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64, 1, 16, 4, 1,
320 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
321 }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)322 static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
323 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32, 4, 16, 4, 1,
324 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
325 }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)326 static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
327 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64, 4, 16, 4, 1,
328 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
329 }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,const char * net)330 static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
331 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, 4, 16, 4, 1,
332 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
333 }
qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)334 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
335 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, 4, 8, 1, 1,
336 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
337 }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)338 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
339 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
340 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
341 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)342 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
343 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, 4, 16, 1, 1,
344 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
345 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)346 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
347 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, 4, 16, 1, 1,
348 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
349 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)350 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
351 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, 4, 16, 1, 1,
352 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
353 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)354 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
355 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, 4, 16, 1, 1,
356 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
357 }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)358 static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
359 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, 1, 8, 8, 1,
360 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
361 }
qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)362 static void qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
363 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, 1, 8, 8, 1,
364 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
365 }
qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)366 static void qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
367 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, 1, 8, 8, 1,
368 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
369 }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)370 static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
371 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, 1, 8, 8, 1,
372 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
373 }
qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State & state,const char * net)374 static void qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State& state, const char* net) {
375 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, 2, 8, 8, 1,
376 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
377 }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)378 static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
379 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, 2, 8, 8, 1,
380 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
381 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)382 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
383 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, 2, 8, 8, 1,
384 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
385 }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)386 static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
387 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, 2, 8, 8, 1,
388 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
389 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)390 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
391 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, 2, 8, 8, 1,
392 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
393 }
qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State & state,const char * net)394 static void qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State& state, const char* net) {
395 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, 2, 8, 16, 1,
396 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
397 }
398
// Pass each AArch64 assembly wrapper defined above to BENCHMARK_GEMM.
// NOTE(review): BENCHMARK_GEMM is not defined in this file (presumably bench/gemm.h); it presumably
// registers the function with the benchmark framework for a set of GEMM shapes - confirm.
399 BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld32)
BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)400 BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)
401 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld32)
402 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld64)
403 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld128)
404 BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
405 BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
406 BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
407 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
408 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
409 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
410 BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
411 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm)
412 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal)
413 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53)
414 BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53)
415 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mull)
416 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal)
417 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
418 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
419 BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
420 BENCHMARK_GEMM(qs8_gemm_2x8c16__aarch64_neon_mlal)
421 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
422
423
424 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
425 static void qs8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
426 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, 1, 8, 4, 1,
427 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
428 }
qs8_gemm_4x8c4__neondot(benchmark::State & state,const char * net)429 static void qs8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
430 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, 4, 8, 4, 1,
431 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
432 }
qs8_gemm_6x8c4__neondot(benchmark::State & state,const char * net)433 static void qs8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
434 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, 6, 8, 4, 1,
435 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
436 }
qs8_gemm_8x8c4__neondot(benchmark::State & state,const char * net)437 static void qs8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
438 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, 8, 8, 4, 1,
439 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
440 }
qs8_gemm_1x16c4__neondot(benchmark::State & state,const char * net)441 static void qs8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
442 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, 1, 16, 4, 1,
443 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
444 }
qs8_gemm_4x16c4__neondot(benchmark::State & state,const char * net)445 static void qs8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
446 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, 4, 16, 4, 1,
447 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
448 }
qs8_gemm_6x16c4__neondot(benchmark::State & state,const char * net)449 static void qs8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
450 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, 6, 16, 4, 1,
451 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
452 }
qs8_gemm_8x16c4__neondot(benchmark::State & state,const char * net)453 static void qs8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
454 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
455 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
456 }
457
// Pass each NEONDOT wrapper defined above to BENCHMARK_GEMM.
// NOTE(review): BENCHMARK_GEMM is not defined in this file (presumably bench/gemm.h); it presumably
// registers the function with the benchmark framework for a set of GEMM shapes - confirm.
458 BENCHMARK_GEMM(qs8_gemm_1x8c4__neondot)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)459 BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)
460 BENCHMARK_GEMM(qs8_gemm_6x8c4__neondot)
461 BENCHMARK_GEMM(qs8_gemm_8x8c4__neondot)
462 BENCHMARK_GEMM(qs8_gemm_1x16c4__neondot)
463 BENCHMARK_GEMM(qs8_gemm_4x16c4__neondot)
464 BENCHMARK_GEMM(qs8_gemm_6x16c4__neondot)
465 BENCHMARK_GEMM(qs8_gemm_8x16c4__neondot)
466 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
467
468
469 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
470 static void qs8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
471 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, 1, 8, 1, 1,
472 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
473 }
qs8_gemm_2x8__neon_mlal_lane(benchmark::State & state,const char * net)474 static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
475 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, 2, 8, 1, 1,
476 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
477 }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,const char * net)478 static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
479 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 3, 8, 1, 1,
480 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
481 }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,const char * net)482 static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
483 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, 4, 8, 1, 1,
484 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
485 }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,const char * net)486 static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
487 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, 6, 8, 1, 1,
488 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
489 }
qs8_gemm_1x16__neon_mlal_lane(benchmark::State & state,const char * net)490 static void qs8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
491 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, 1, 16, 1, 1,
492 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
493 }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,const char * net)494 static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
495 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, 2, 16, 1, 1,
496 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
497 }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,const char * net)498 static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
499 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, 3, 16, 1, 1,
500 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
501 }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,const char * net)502 static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
503 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, 4, 16, 1, 1,
504 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
505 }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,const char * net)506 static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
507 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, 6, 16, 1, 1,
508 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
509 }
qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)510 static void qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
511 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm, 1, 8, 1, 1,
512 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
513 }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)514 static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
515 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm, 2, 8, 1, 1,
516 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
517 }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)518 static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
519 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, 3, 8, 1, 1,
520 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
521 }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)522 static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
523 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm, 4, 8, 1, 1,
524 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
525 }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)526 static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
527 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, 6, 8, 1, 1,
528 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
529 }
qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)530 static void qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
531 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, 1, 16, 1, 1,
532 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
533 }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)534 static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
535 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, 2, 16, 1, 1,
536 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
537 }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)538 static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
539 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm, 3, 16, 1, 1,
540 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
541 }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)542 static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
543 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm, 4, 16, 1, 1,
544 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
545 }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)546 static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
547 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm, 6, 16, 1, 1,
548 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
549 }
qs8_gemm_1x8c2__neon_mull_dup(benchmark::State & state,const char * net)550 static void qs8_gemm_1x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
551 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, 1, 8, 2, 1,
552 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
553 }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,const char * net)554 static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
555 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, 2, 8, 2, 1,
556 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
557 }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,const char * net)558 static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
559 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, 3, 8, 2, 1,
560 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
561 }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,const char * net)562 static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
563 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, 4, 8, 2, 1,
564 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
565 }
qs8_gemm_1x16c2__neon_mull_dup(benchmark::State & state,const char * net)566 static void qs8_gemm_1x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
567 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, 1, 16, 2, 1,
568 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
569 }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,const char * net)570 static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
571 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, 2, 16, 2, 1,
572 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
573 }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,const char * net)574 static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
575 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, 3, 16, 2, 1,
576 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
577 }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,const char * net)578 static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
579 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, 4, 16, 2, 1,
580 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
581 }
qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State & state,const char * net)582 static void qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
583 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, 1, 8, 2, 1,
584 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
585 }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,const char * net)586 static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
587 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, 2, 8, 2, 1,
588 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
589 }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,const char * net)590 static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
591 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, 3, 8, 2, 1,
592 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
593 }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,const char * net)594 static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
595 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, 4, 8, 2, 1,
596 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
597 }
qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State & state,const char * net)598 static void qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
599 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, 1, 16, 2, 1,
600 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
601 }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,const char * net)602 static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
603 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, 2, 16, 2, 1,
604 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
605 }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,const char * net)606 static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
607 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, 3, 16, 2, 1,
608 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
609 }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,const char * net)610 static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
611 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, 4, 16, 2, 1,
612 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
613 }
qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)614 static void qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
615 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, 1, 8, 2, 1,
616 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
617 }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)618 static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
619 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, 2, 8, 2, 1,
620 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
621 }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)622 static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
623 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, 3, 8, 2, 1,
624 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
625 }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)626 static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
627 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, 4, 8, 2, 1,
628 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
629 }
qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)630 static void qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
631 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r, 1, 16, 2, 1,
632 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
633 }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)634 static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
635 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r, 2, 16, 2, 1,
636 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
637 }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)638 static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
639 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r, 3, 16, 2, 1,
640 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
641 }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)642 static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
643 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, 4, 16, 2, 1,
644 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
645 }
qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)646 static void qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
647 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, 1, 8, 2, 1,
648 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
649 }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)650 static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
651 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, 2, 8, 2, 1,
652 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
653 }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)654 static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
655 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, 3, 8, 2, 1,
656 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
657 }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)658 static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
659 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, 4, 8, 2, 1,
660 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
661 }
qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)662 static void qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
663 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, 1, 16, 2, 1,
664 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
665 }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)666 static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
667 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r, 2, 16, 2, 1,
668 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
669 }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)670 static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
671 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r, 3, 16, 2, 1,
672 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
673 }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)674 static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
675 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r, 4, 16, 2, 1,
676 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
677 }
qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)678 static void qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
679 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, 1, 8, 2, 1,
680 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
681 }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)682 static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
683 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r, 2, 8, 2, 1,
684 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
685 }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)686 static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
687 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r, 3, 8, 2, 1,
688 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
689 }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)690 static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
691 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r, 4, 8, 2, 1,
692 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
693 }
qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)694 static void qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
695 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, 1, 16, 2, 1,
696 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
697 }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)698 static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
699 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, 2, 16, 2, 1,
700 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
701 }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)702 static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
703 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, 3, 16, 2, 1,
704 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
705 }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)706 static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
707 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, 4, 16, 2, 1,
708 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
709 }
qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)710 static void qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
711 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, 1, 8, 2, 1,
712 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
713 }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)714 static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
715 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r, 2, 8, 2, 1,
716 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
717 }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)718 static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
719 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r, 3, 8, 2, 1,
720 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
721 }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)722 static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
723 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r, 4, 8, 2, 1,
724 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
725 }
qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)726 static void qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
727 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, 1, 16, 2, 1,
728 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
729 }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)730 static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
731 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, 2, 16, 2, 1,
732 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
733 }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)734 static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
735 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, 3, 16, 2, 1,
736 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
737 }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)738 static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
739 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, 4, 16, 2, 1,
740 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
741 }
qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)742 static void qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
743 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, 1, 8, 2, 1,
744 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
745 }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)746 static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
747 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, 2, 8, 2, 1,
748 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
749 }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)750 static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
751 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r, 3, 8, 2, 1,
752 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
753 }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)754 static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
755 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r, 4, 8, 2, 1,
756 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
757 }
qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)758 static void qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
759 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r, 1, 16, 2, 1,
760 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
761 }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)762 static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
763 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r, 2, 16, 2, 1,
764 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
765 }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)766 static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
767 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r, 3, 16, 2, 1,
768 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
769 }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)770 static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
771 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r, 4, 16, 2, 1,
772 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
773 }
qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)774 static void qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
775 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, 1, 8, 2, 1,
776 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
777 }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)778 static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
779 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r, 2, 8, 2, 1,
780 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
781 }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)782 static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
783 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r, 3, 8, 2, 1,
784 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
785 }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)786 static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
787 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r, 4, 8, 2, 1,
788 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
789 }
qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)790 static void qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
791 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, 1, 16, 2, 1,
792 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
793 }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)794 static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
795 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, 2, 16, 2, 1,
796 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
797 }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)798 static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
799 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, 3, 16, 2, 1,
800 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
801 }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)802 static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
803 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r, 4, 16, 2, 1,
804 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
805 }
qs8_gemm_1x8c2s4__neon_mull(benchmark::State & state,const char * net)806 static void qs8_gemm_1x8c2s4__neon_mull(benchmark::State& state, const char* net) {
807 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, 1, 8, 2, 4,
808 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
809 }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,const char * net)810 static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, const char* net) {
811 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, 2, 8, 2, 4,
812 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
813 }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,const char * net)814 static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, const char* net) {
815 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, 3, 8, 2, 4,
816 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
817 }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,const char * net)818 static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, const char* net) {
819 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, 4, 8, 2, 4,
820 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
821 }
qs8_gemm_1x16c2s4__neon_mull(benchmark::State & state,const char * net)822 static void qs8_gemm_1x16c2s4__neon_mull(benchmark::State& state, const char* net) {
823 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, 1, 16, 2, 4,
824 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
825 }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,const char * net)826 static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, const char* net) {
827 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, 2, 16, 2, 4,
828 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
829 }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,const char * net)830 static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, const char* net) {
831 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, 3, 16, 2, 4,
832 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
833 }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,const char * net)834 static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, const char* net) {
835 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, 4, 16, 2, 4,
836 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
837 }
qs8_gemm_1x8c2s4__neon_mlal(benchmark::State & state,const char * net)838 static void qs8_gemm_1x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
839 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, 1, 8, 2, 4,
840 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
841 }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,const char * net)842 static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
843 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, 2, 8, 2, 4,
844 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
845 }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,const char * net)846 static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
847 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, 3, 8, 2, 4,
848 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
849 }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,const char * net)850 static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
851 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, 4, 8, 2, 4,
852 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
853 }
qs8_gemm_1x16c2s4__neon_mlal(benchmark::State & state,const char * net)854 static void qs8_gemm_1x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
855 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, 1, 16, 2, 4,
856 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
857 }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,const char * net)858 static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
859 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, 2, 16, 2, 4,
860 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
861 }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,const char * net)862 static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
863 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, 3, 16, 2, 4,
864 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
865 }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,const char * net)866 static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
867 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, 4, 16, 2, 4,
868 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
869 }
qs8_gemm_1x8c4__neon_mull_dup(benchmark::State & state,const char * net)870 static void qs8_gemm_1x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
871 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, 1, 8, 4, 1,
872 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
873 }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,const char * net)874 static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
875 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, 2, 8, 4, 1,
876 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
877 }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,const char * net)878 static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
879 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, 3, 8, 4, 1,
880 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
881 }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,const char * net)882 static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
883 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, 4, 8, 4, 1,
884 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
885 }
qs8_gemm_1x16c4__neon_mull_dup(benchmark::State & state,const char * net)886 static void qs8_gemm_1x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
887 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, 1, 16, 4, 1,
888 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
889 }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,const char * net)890 static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
891 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, 2, 16, 4, 1,
892 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
893 }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,const char * net)894 static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
895 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, 3, 16, 4, 1,
896 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
897 }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,const char * net)898 static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
899 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, 4, 16, 4, 1,
900 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
901 }
qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State & state,const char * net)902 static void qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
903 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, 1, 8, 4, 1,
904 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
905 }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,const char * net)906 static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
907 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, 2, 8, 4, 1,
908 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
909 }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,const char * net)910 static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
911 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, 3, 8, 4, 1,
912 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
913 }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,const char * net)914 static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
915 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, 4, 8, 4, 1,
916 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
917 }
qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State & state,const char * net)918 static void qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
919 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, 1, 16, 4, 1,
920 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
921 }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,const char * net)922 static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
923 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, 2, 16, 4, 1,
924 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
925 }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,const char * net)926 static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
927 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, 3, 16, 4, 1,
928 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
929 }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,const char * net)930 static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
931 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, 4, 16, 4, 1,
932 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
933 }
qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)934 static void qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
935 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, 1, 8, 4, 1,
936 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
937 }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)938 static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
939 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, 2, 8, 4, 1,
940 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
941 }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)942 static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
943 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, 3, 8, 4, 1,
944 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
945 }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)946 static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
947 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, 4, 8, 4, 1,
948 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
949 }
qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)950 static void qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
951 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r, 1, 16, 4, 1,
952 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
953 }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)954 static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
955 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r, 2, 16, 4, 1,
956 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
957 }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)958 static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
959 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r, 3, 16, 4, 1,
960 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
961 }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)962 static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
963 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, 4, 16, 4, 1,
964 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
965 }
qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)966 static void qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
967 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, 1, 8, 4, 1,
968 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
969 }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)970 static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
971 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, 2, 8, 4, 1,
972 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
973 }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)974 static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
975 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, 3, 8, 4, 1,
976 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
977 }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)978 static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
979 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, 4, 8, 4, 1,
980 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
981 }
qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)982 static void qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
983 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r, 1, 16, 4, 1,
984 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
985 }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)986 static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
987 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r, 2, 16, 4, 1,
988 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
989 }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)990 static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
991 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r, 3, 16, 4, 1,
992 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
993 }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)994 static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
995 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, 4, 16, 4, 1,
996 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
997 }
qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)998 static void qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
999 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, 1, 8, 4, 1,
1000 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1001 }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)1002 static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1003 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r, 2, 8, 4, 1,
1004 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1005 }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)1006 static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1007 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r, 3, 8, 4, 1,
1008 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1009 }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)1010 static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1011 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r, 4, 8, 4, 1,
1012 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1013 }
qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1014 static void qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1015 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, 1, 16, 4, 1,
1016 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1017 }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1018 static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1019 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, 2, 16, 4, 1,
1020 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1021 }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1022 static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1023 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, 3, 16, 4, 1,
1024 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1025 }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)1026 static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
1027 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, 4, 16, 4, 1,
1028 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1029 }
qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1030 static void qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1031 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, 1, 8, 4, 1,
1032 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1033 }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1034 static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1035 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r, 2, 8, 4, 1,
1036 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1037 }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1038 static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1039 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r, 3, 8, 4, 1,
1040 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1041 }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1042 static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1043 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r, 4, 8, 4, 1,
1044 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1045 }
qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1046 static void qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1047 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, 1, 16, 4, 1,
1048 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1049 }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1050 static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1051 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, 2, 16, 4, 1,
1052 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1053 }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1054 static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1055 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, 3, 16, 4, 1,
1056 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1057 }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)1058 static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
1059 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, 4, 16, 4, 1,
1060 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1061 }
qs8_gemm_1x8c8__neon_mull(benchmark::State & state,const char * net)1062 static void qs8_gemm_1x8c8__neon_mull(benchmark::State& state, const char* net) {
1063 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull, 1, 8, 8, 1,
1064 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1065 }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,const char * net)1066 static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, const char* net) {
1067 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull, 2, 8, 8, 1,
1068 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1069 }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,const char * net)1070 static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, const char* net) {
1071 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull, 3, 8, 8, 1,
1072 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1073 }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,const char * net)1074 static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, const char* net) {
1075 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull, 4, 8, 8, 1,
1076 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1077 }
qs8_gemm_1x16c8__neon_mull(benchmark::State & state,const char * net)1078 static void qs8_gemm_1x16c8__neon_mull(benchmark::State& state, const char* net) {
1079 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull, 1, 16, 8, 1,
1080 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1081 }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,const char * net)1082 static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, const char* net) {
1083 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull, 2, 16, 8, 1,
1084 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1085 }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,const char * net)1086 static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, const char* net) {
1087 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull, 3, 16, 8, 1,
1088 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1089 }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,const char * net)1090 static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, const char* net) {
1091 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull, 4, 16, 8, 1,
1092 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1093 }
qs8_gemm_1x8c8__neon_mlal(benchmark::State & state,const char * net)1094 static void qs8_gemm_1x8c8__neon_mlal(benchmark::State& state, const char* net) {
1095 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, 1, 8, 8, 1,
1096 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1097 }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,const char * net)1098 static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, const char* net) {
1099 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, 2, 8, 8, 1,
1100 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1101 }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,const char * net)1102 static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, const char* net) {
1103 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, 3, 8, 8, 1,
1104 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1105 }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,const char * net)1106 static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, const char* net) {
1107 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, 4, 8, 8, 1,
1108 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1109 }
qs8_gemm_1x16c8__neon_mlal(benchmark::State & state,const char * net)1110 static void qs8_gemm_1x16c8__neon_mlal(benchmark::State& state, const char* net) {
1111 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal, 1, 16, 8, 1,
1112 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1113 }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,const char * net)1114 static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, const char* net) {
1115 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal, 2, 16, 8, 1,
1116 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1117 }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,const char * net)1118 static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, const char* net) {
1119 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, 3, 16, 8, 1,
1120 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1121 }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,const char * net)1122 static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, const char* net) {
1123 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, 4, 16, 8, 1,
1124 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1125 }
qs8_gemm_1x8c16__neon_mlal(benchmark::State & state,const char * net)1126 static void qs8_gemm_1x8c16__neon_mlal(benchmark::State& state, const char* net) {
1127 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, 1, 8, 16, 1,
1128 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1129 }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,const char * net)1130 static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, const char* net) {
1131 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, 2, 8, 16, 1,
1132 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1133 }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,const char * net)1134 static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, const char* net) {
1135 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, 3, 8, 16, 1,
1136 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1137 }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,const char * net)1138 static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, const char* net) {
1139 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, 4, 8, 16, 1,
1140 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1141 }
qs8_gemm_1x16c16__neon_mlal(benchmark::State & state,const char * net)1142 static void qs8_gemm_1x16c16__neon_mlal(benchmark::State& state, const char* net) {
1143 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, 1, 16, 16, 1,
1144 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1145 }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,const char * net)1146 static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, const char* net) {
1147 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, 2, 16, 16, 1,
1148 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1149 }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,const char * net)1150 static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, const char* net) {
1151 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal, 3, 16, 16, 1,
1152 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1153 }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,const char * net)1154 static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, const char* net) {
1155 GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal, 4, 16, 16, 1,
1156 xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1157 }
1158
// Applies BENCHMARK_GEMM to every NEON QS8 GEMM wrapper in this
// XNN_ARCH_ARM || XNN_ARCH_ARM64 section, one invocation per wrapper.
// NOTE(review): BENCHMARK_GEMM is not defined in this part of the file
// (presumably it comes from bench/gemm.h and registers the wrapper with the
// benchmark framework) -- confirm before relying on the invocation order.
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_dup)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld1r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld2r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld4r)
BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld4r)
BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane)
BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane_prfm)
BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mull)
BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x8c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x16c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_2x16c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_3x16c16__neon_mlal)
BENCHMARK_GEMM(qs8_gemm_4x16c16__neon_mlal)
1331 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1332
1333
1334 #if XNN_ARCH_ARM
1335 static void qs8_gemm_1x1c4__armsimd32(benchmark::State& state, const char* net) {
1336 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, 1, 1, 4, 1,
1337 xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1338 }
qs8_gemm_2x1c4__armsimd32(benchmark::State & state,const char * net)1339 static void qs8_gemm_2x1c4__armsimd32(benchmark::State& state, const char* net) {
1340 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32, 2, 1, 4, 1,
1341 xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1342 }
qs8_gemm_1x2c4__armsimd32(benchmark::State & state,const char * net)1343 static void qs8_gemm_1x2c4__armsimd32(benchmark::State& state, const char* net) {
1344 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, 1, 2, 4, 1,
1345 xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1346 }
qs8_gemm_2x2c4__armsimd32(benchmark::State & state,const char * net)1347 static void qs8_gemm_2x2c4__armsimd32(benchmark::State& state, const char* net) {
1348 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, 2, 2, 4, 1,
1349 xnn_init_qs8_conv_minmax_fp32_armsimd32_params, benchmark::utils::CheckARMV6);
1350 }
1351
// Hand each armsimd32 wrapper defined above to BENCHMARK_GEMM (the macro
// itself is not defined in this part of the file).
BENCHMARK_GEMM(qs8_gemm_1x1c4__armsimd32)
BENCHMARK_GEMM(qs8_gemm_2x1c4__armsimd32)
BENCHMARK_GEMM(qs8_gemm_1x2c4__armsimd32)
BENCHMARK_GEMM(qs8_gemm_2x2c4__armsimd32)
1356 #endif // XNN_ARCH_ARM
1357
1358
1359 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1360 static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
1361 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, 2, 16, 8, 1,
1362 xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1363 }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,const char * net)1364 static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
1365 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, 3, 16, 8, 1,
1366 xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1367 }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,const char * net)1368 static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
1369 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, 4, 16, 8, 1,
1370 xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1371 }
1372
qs8_gemm_2x8c8__avx2(benchmark::State & state,const char * net)1373 static void qs8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
1374 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1375 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1376 }
qs8_gemm_3x8c8__avx2(benchmark::State & state,const char * net)1377 static void qs8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
1378 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1379 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1380 }
1381
qs8_gemm_xw_2x8c8__avx2(benchmark::State & state,const char * net)1382 static void qs8_gemm_xw_2x8c8__avx2(benchmark::State& state, const char* net) {
1383 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1384 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1385 }
qs8_gemm_xw_3x8c8__avx2(benchmark::State & state,const char * net)1386 static void qs8_gemm_xw_3x8c8__avx2(benchmark::State& state, const char* net) {
1387 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1388 xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1389 }
1390
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,const char * net)1391 static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
1392 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
1393 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1394 }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,const char * net)1395 static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
1396 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
1397 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1398 }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,const char * net)1399 static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
1400 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
1401 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1402 }
1403
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,const char * net)1404 static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
1405 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
1406 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1407 }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,const char * net)1408 static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
1409 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
1410 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1411 }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,const char * net)1412 static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
1413 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
1414 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1415 }
1416
qs8_gemm_xw_2x4c2__xop(benchmark::State & state,const char * net)1417 static void qs8_gemm_xw_2x4c2__xop(benchmark::State& state, const char* net) {
1418 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, 2, 4, 2, 1,
1419 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1420 }
qs8_gemm_xw_3x4c2__xop(benchmark::State & state,const char * net)1421 static void qs8_gemm_xw_3x4c2__xop(benchmark::State& state, const char* net) {
1422 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, 3, 4, 2, 1,
1423 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1424 }
qs8_gemm_xw_4x4c2__xop(benchmark::State & state,const char * net)1425 static void qs8_gemm_xw_4x4c2__xop(benchmark::State& state, const char* net) {
1426 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, 4, 4, 2, 1,
1427 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1428 }
1429
qs8_gemm_2x4c2s4__xop_ld64(benchmark::State & state,const char * net)1430 static void qs8_gemm_2x4c2s4__xop_ld64(benchmark::State& state, const char* net) {
1431 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64, 2, 4, 2, 4,
1432 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1433 }
qs8_gemm_3x4c2s4__xop_ld64(benchmark::State & state,const char * net)1434 static void qs8_gemm_3x4c2s4__xop_ld64(benchmark::State& state, const char* net) {
1435 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld64, 3, 4, 2, 4,
1436 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1437 }
qs8_gemm_4x4c2s4__xop_ld64(benchmark::State & state,const char * net)1438 static void qs8_gemm_4x4c2s4__xop_ld64(benchmark::State& state, const char* net) {
1439 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, 4, 4, 2, 4,
1440 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1441 }
1442
qs8_gemm_2x4c2s4__xop_ld128(benchmark::State & state,const char * net)1443 static void qs8_gemm_2x4c2s4__xop_ld128(benchmark::State& state, const char* net) {
1444 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld128, 2, 4, 2, 4,
1445 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1446 }
qs8_gemm_3x4c2s4__xop_ld128(benchmark::State & state,const char * net)1447 static void qs8_gemm_3x4c2s4__xop_ld128(benchmark::State& state, const char* net) {
1448 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128, 3, 4, 2, 4,
1449 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1450 }
qs8_gemm_4x4c2s4__xop_ld128(benchmark::State & state,const char * net)1451 static void qs8_gemm_4x4c2s4__xop_ld128(benchmark::State& state, const char* net) {
1452 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, 4, 4, 2, 4,
1453 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1454 }
1455
qs8_gemm_xw_2x4c2s4__xop(benchmark::State & state,const char * net)1456 static void qs8_gemm_xw_2x4c2s4__xop(benchmark::State& state, const char* net) {
1457 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2s4__xop, 2, 4, 2, 4,
1458 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1459 }
qs8_gemm_xw_3x4c2s4__xop(benchmark::State & state,const char * net)1460 static void qs8_gemm_xw_3x4c2s4__xop(benchmark::State& state, const char* net) {
1461 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2s4__xop, 3, 4, 2, 4,
1462 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1463 }
qs8_gemm_xw_4x4c2s4__xop(benchmark::State & state,const char * net)1464 static void qs8_gemm_xw_4x4c2s4__xop(benchmark::State& state, const char* net) {
1465 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2s4__xop, 4, 4, 2, 4,
1466 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1467 }
1468
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,const char * net)1469 static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
1470 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
1471 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1472 }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,const char * net)1473 static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
1474 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
1475 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1476 }
1477
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,const char * net)1478 static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
1479 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
1480 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1481 }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,const char * net)1482 static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
1483 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
1484 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1485 }
1486
qs8_gemm_xw_2x4c8__xop(benchmark::State & state,const char * net)1487 static void qs8_gemm_xw_2x4c8__xop(benchmark::State& state, const char* net) {
1488 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, 2, 4, 8, 1,
1489 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1490 }
qs8_gemm_xw_3x4c8__xop(benchmark::State & state,const char * net)1491 static void qs8_gemm_xw_3x4c8__xop(benchmark::State& state, const char* net) {
1492 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, 3, 4, 8, 1,
1493 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1494 }
1495
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,const char * net)1496 static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
1497 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
1498 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1499 }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,const char * net)1500 static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
1501 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
1502 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1503 }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,const char * net)1504 static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
1505 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
1506 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1507 }
1508
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,const char * net)1509 static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
1510 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
1511 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1512 }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,const char * net)1513 static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
1514 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
1515 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1516 }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,const char * net)1517 static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
1518 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
1519 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1520 }
1521
qs8_gemm_xw_2x4c2__avx(benchmark::State & state,const char * net)1522 static void qs8_gemm_xw_2x4c2__avx(benchmark::State& state, const char* net) {
1523 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, 2, 4, 2, 1,
1524 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1525 }
qs8_gemm_xw_3x4c2__avx(benchmark::State & state,const char * net)1526 static void qs8_gemm_xw_3x4c2__avx(benchmark::State& state, const char* net) {
1527 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, 3, 4, 2, 1,
1528 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1529 }
qs8_gemm_xw_4x4c2__avx(benchmark::State & state,const char * net)1530 static void qs8_gemm_xw_4x4c2__avx(benchmark::State& state, const char* net) {
1531 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, 4, 4, 2, 1,
1532 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1533 }
1534
qs8_gemm_2x4c2s4__avx_ld64(benchmark::State & state,const char * net)1535 static void qs8_gemm_2x4c2s4__avx_ld64(benchmark::State& state, const char* net) {
1536 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64, 2, 4, 2, 4,
1537 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1538 }
qs8_gemm_3x4c2s4__avx_ld64(benchmark::State & state,const char * net)1539 static void qs8_gemm_3x4c2s4__avx_ld64(benchmark::State& state, const char* net) {
1540 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64, 3, 4, 2, 4,
1541 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1542 }
qs8_gemm_4x4c2s4__avx_ld64(benchmark::State & state,const char * net)1543 static void qs8_gemm_4x4c2s4__avx_ld64(benchmark::State& state, const char* net) {
1544 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64, 4, 4, 2, 4,
1545 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1546 }
1547
qs8_gemm_2x4c2s4__avx_ld128(benchmark::State & state,const char * net)1548 static void qs8_gemm_2x4c2s4__avx_ld128(benchmark::State& state, const char* net) {
1549 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128, 2, 4, 2, 4,
1550 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1551 }
qs8_gemm_3x4c2s4__avx_ld128(benchmark::State & state,const char * net)1552 static void qs8_gemm_3x4c2s4__avx_ld128(benchmark::State& state, const char* net) {
1553 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128, 3, 4, 2, 4,
1554 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1555 }
qs8_gemm_4x4c2s4__avx_ld128(benchmark::State & state,const char * net)1556 static void qs8_gemm_4x4c2s4__avx_ld128(benchmark::State& state, const char* net) {
1557 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128, 4, 4, 2, 4,
1558 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1559 }
1560
qs8_gemm_xw_2x4c2s4__avx(benchmark::State & state,const char * net)1561 static void qs8_gemm_xw_2x4c2s4__avx(benchmark::State& state, const char* net) {
1562 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2s4__avx, 2, 4, 2, 4,
1563 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1564 }
qs8_gemm_xw_3x4c2s4__avx(benchmark::State & state,const char * net)1565 static void qs8_gemm_xw_3x4c2s4__avx(benchmark::State& state, const char* net) {
1566 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2s4__avx, 3, 4, 2, 4,
1567 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1568 }
qs8_gemm_xw_4x4c2s4__avx(benchmark::State & state,const char * net)1569 static void qs8_gemm_xw_4x4c2s4__avx(benchmark::State& state, const char* net) {
1570 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2s4__avx, 4, 4, 2, 4,
1571 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1572 }
1573
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,const char * net)1574 static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
1575 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
1576 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1577 }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,const char * net)1578 static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
1579 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
1580 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1581 }
1582
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,const char * net)1583 static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
1584 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
1585 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1586 }
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,const char * net)1587 static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
1588 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
1589 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1590 }
1591
qs8_gemm_xw_2x4c8__avx(benchmark::State & state,const char * net)1592 static void qs8_gemm_xw_2x4c8__avx(benchmark::State& state, const char* net) {
1593 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, 2, 4, 8, 1,
1594 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1595 }
qs8_gemm_xw_3x4c8__avx(benchmark::State & state,const char * net)1596 static void qs8_gemm_xw_3x4c8__avx(benchmark::State& state, const char* net) {
1597 GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, 3, 4, 8, 1,
1598 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1599 }
1600
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,const char * net)1601 static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1602 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
1603 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1604 }
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,const char * net)1605 static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1606 GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
1607 xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1608 }
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,const char * net)1609