xref: /aosp_15_r20/external/XNNPACK/bench/qs8-gemm-e2e.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <cstring>
9 #include <functional>
10 #include <random>
11 #include <vector>
12 
13 #include <xnnpack.h>
14 
15 #include <benchmark/benchmark.h>
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 
20 #include <xnnpack.h>
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/microfnptr.h>
24 #include <xnnpack/microparams-init.h>
25 
26 
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qs8_gemm_minmax_ukernel_function gemm,xnn_qs8_igemm_minmax_ukernel_function igemm,xnn_qs8_gemm_minmax_ukernel_function gemm1,xnn_qs8_igemm_minmax_ukernel_function igemm1,xnn_init_qs8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)27 static void GEMMEnd2EndBenchmark(
28   benchmark::State& state,
29   models::ExecutionPlanFactory model_factory,
30   xnn_qs8_gemm_minmax_ukernel_function gemm,
31   xnn_qs8_igemm_minmax_ukernel_function igemm,
32   xnn_qs8_gemm_minmax_ukernel_function gemm1,
33   xnn_qs8_igemm_minmax_ukernel_function igemm1,
34   xnn_init_qs8_conv_minmax_params_fn init_params,
35   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
36   benchmark::utils::IsaCheckFunction isa_check = nullptr)
37 {
38   if (isa_check && !isa_check(state)) {
39     return;
40   }
41   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
42     state.SkipWithError("failed to initialize XNNPACK");
43     return;
44   }
45 
46   // Override microkernels chosen in xnn_initialize
47   // Note: do not directly assign to xnn_params.qs8.gemm because it breaks older gcc.
48   std::memset(&xnn_params.qs8.gemm, 0, sizeof(xnn_params.qs8.gemm));
49   xnn_params.qs8.gemm.minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
50   xnn_params.qs8.gemm.minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
51   xnn_params.qs8.gemm.minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
52   xnn_params.qs8.gemm.minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
53   xnn_params.qs8.gemm.init.qs8 = init_params;
54   xnn_params.qs8.gemm.mr = mr;
55   xnn_params.qs8.gemm.nr = nr;
56   xnn_params.qs8.gemm.log2_kr = log2_kr;
57   xnn_params.qs8.gemm.log2_sr = log2_sr;
58 
59   auto execution_plan = model_factory(nullptr);
60   if (execution_plan.empty()) {
61     state.SkipWithError("failed to create a model");
62     return;
63   }
64 
65   for (auto _ : state) {
66     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
67       xnn_status status = xnn_run_operator(op.get(), nullptr);
68       if (status != xnn_status_success) {
69         state.SkipWithError("failed to run a model");
70         return;
71       }
72     }
73   }
74 
75   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
76   if (cpu_frequency != 0) {
77     state.counters["cpufreq"] = cpu_frequency;
78   }
79 }
80 
81 
82 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)83   static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
84     GEMMEnd2EndBenchmark(state, model,
85       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
86       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
87       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
88       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
89       xnn_init_qs8_conv_minmax_rndnu_neon_params,
90       4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
91       benchmark::utils::CheckNEONDOT);
92   }
qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)93   static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
94     GEMMEnd2EndBenchmark(state, model,
95       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
96       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
97       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
98       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
99       xnn_init_qs8_conv_minmax_rndnu_neon_params,
100       4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
101       benchmark::utils::CheckNEONDOT);
102   }
103 
104   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)105   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)
106 #endif  // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
107 
108 
109 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
110   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
111     GEMMEnd2EndBenchmark(state, model,
112       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
113       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
114       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
115       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
116       xnn_init_qs8_conv_minmax_rndnu_neon_params,
117       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
118       benchmark::utils::CheckNEON);
119   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)120   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
121     GEMMEnd2EndBenchmark(state, model,
122       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
123       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
124       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
125       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
126       xnn_init_qs8_conv_minmax_rndnu_neon_params,
127       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
128       benchmark::utils::CheckNEON);
129   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)130   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
131     GEMMEnd2EndBenchmark(state, model,
132       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
133       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
134       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
135       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
136       xnn_init_qs8_conv_minmax_rndnu_neon_params,
137       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
138       benchmark::utils::CheckNEON);
139   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)140   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
141     GEMMEnd2EndBenchmark(state, model,
142       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
143       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
144       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
145       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
146       xnn_init_qs8_conv_minmax_rndnu_neon_params,
147       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
148       benchmark::utils::CheckNEON);
149   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)150   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
151     GEMMEnd2EndBenchmark(state, model,
152       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
153       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
154       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
155       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
156       xnn_init_qs8_conv_minmax_rndnu_neon_params,
157       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
158       benchmark::utils::CheckNEON);
159   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)160   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
161     GEMMEnd2EndBenchmark(state, model,
162       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
163       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
164       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
165       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
166       xnn_init_qs8_conv_minmax_rndnu_neon_params,
167       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
168       benchmark::utils::CheckNEON);
169   }
170 
171   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)172   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
173   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
174   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
175   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
176   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
177 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
178 
179 
180 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
181   static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
182     GEMMEnd2EndBenchmark(state, model,
183       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
184       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
185       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
186       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
187       xnn_init_qs8_conv_minmax_rndnu_neon_params,
188       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
189       benchmark::utils::CheckNEONDOT);
190   }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,models::ExecutionPlanFactory model)191   static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
192     GEMMEnd2EndBenchmark(state, model,
193       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32,
194       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
195       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32,
196       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
197       xnn_init_qs8_conv_minmax_rndnu_neon_params,
198       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
199       benchmark::utils::CheckNEONDOT);
200   }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)201   static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
202     GEMMEnd2EndBenchmark(state, model,
203       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
204       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
205       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
206       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
207       xnn_init_qs8_conv_minmax_rndnu_neon_params,
208       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
209       benchmark::utils::CheckNEONDOT);
210   }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)211   static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
212     GEMMEnd2EndBenchmark(state, model,
213       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
214       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
215       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
216       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
217       xnn_init_qs8_conv_minmax_rndnu_neon_params,
218       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
219       benchmark::utils::CheckNEONDOT);
220   }
221 
222   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)223   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)
224   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld64)
225   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
226 #endif  // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
227 
228 
229 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
230   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
231     GEMMEnd2EndBenchmark(state, model,
232       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
233       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
234       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
235       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
236       xnn_init_qs8_conv_minmax_rndnu_neon_params,
237       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
238       benchmark::utils::CheckNEON);
239   }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)240   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
241     GEMMEnd2EndBenchmark(state, model,
242       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
243       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
244       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
245       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
246       xnn_init_qs8_conv_minmax_rndnu_neon_params,
247       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
248       benchmark::utils::CheckNEON);
249   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)250   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
251     GEMMEnd2EndBenchmark(state, model,
252       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
253       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
254       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
255       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
256       xnn_init_qs8_conv_minmax_rndnu_neon_params,
257       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
258       benchmark::utils::CheckNEON);
259   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)260   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
261     GEMMEnd2EndBenchmark(state, model,
262       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
263       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
264       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
265       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
266       xnn_init_qs8_conv_minmax_rndnu_neon_params,
267       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
268       benchmark::utils::CheckNEON);
269   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)270   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
271     GEMMEnd2EndBenchmark(state, model,
272       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
273       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
274       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
275       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
276       xnn_init_qs8_conv_minmax_rndnu_neon_params,
277       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
278       benchmark::utils::CheckNEON);
279   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)280   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
281     GEMMEnd2EndBenchmark(state, model,
282       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
283       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
284       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
285       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
286       xnn_init_qs8_conv_minmax_rndnu_neon_params,
287       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
288       benchmark::utils::CheckNEON);
289   }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)290   static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
291     GEMMEnd2EndBenchmark(state, model,
292       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
293       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
294       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
295       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
296       xnn_init_qs8_conv_minmax_rndnu_neon_params,
297       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
298       benchmark::utils::CheckNEON);
299   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,models::ExecutionPlanFactory model)300   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
301     GEMMEnd2EndBenchmark(state, model,
302       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
303       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
304       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
305       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
306       xnn_init_qs8_conv_minmax_rndnu_neon_params,
307       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
308       benchmark::utils::CheckNEON);
309   }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)310   static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
311     GEMMEnd2EndBenchmark(state, model,
312       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53,
313       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
314       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
315       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
316       xnn_init_qs8_conv_minmax_rndnu_neon_params,
317       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
318       benchmark::utils::CheckNEON);
319   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)320   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
321     GEMMEnd2EndBenchmark(state, model,
322       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
323       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
324       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
325       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
326       xnn_init_qs8_conv_minmax_rndnu_neon_params,
327       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
328       benchmark::utils::CheckNEON);
329   }
330 
331   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)332   BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
333   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
334   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
335   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
336   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
337   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
338   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
339   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
340   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal)
341 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
342 
343 
344 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
345   static void qs8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
346     GEMMEnd2EndBenchmark(state, model,
347       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
348       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
349       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
350       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
351       xnn_init_qs8_conv_minmax_rndnu_neon_params,
352       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
353       benchmark::utils::CheckNEONDOT);
354   }
qs8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)355   static void qs8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
356     GEMMEnd2EndBenchmark(state, model,
357       xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
358       xnn_qs8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
359       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
360       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
361       xnn_init_qs8_conv_minmax_rndnu_neon_params,
362       6 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
363       benchmark::utils::CheckNEONDOT);
364   }
qs8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)365   static void qs8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
366     GEMMEnd2EndBenchmark(state, model,
367       xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
368       xnn_qs8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
369       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
370       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
371       xnn_init_qs8_conv_minmax_rndnu_neon_params,
372       8 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
373       benchmark::utils::CheckNEONDOT);
374   }
qs8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)375   static void qs8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
376     GEMMEnd2EndBenchmark(state, model,
377       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
378       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
379       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
380       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
381       xnn_init_qs8_conv_minmax_rndnu_neon_params,
382       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
383       benchmark::utils::CheckNEONDOT);
384   }
qs8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)385   static void qs8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
386     GEMMEnd2EndBenchmark(state, model,
387       xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
388       xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
389       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
390       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
391       xnn_init_qs8_conv_minmax_rndnu_neon_params,
392       6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
393       benchmark::utils::CheckNEONDOT);
394   }
qs8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)395   static void qs8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
396     GEMMEnd2EndBenchmark(state, model,
397       xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
398       xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
399       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
400       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
401       xnn_init_qs8_conv_minmax_rndnu_neon_params,
402       8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
403       benchmark::utils::CheckNEONDOT);
404   }
405 
406   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neondot);
407   BENCHMARK_QS8_END2END(qs8_gemm_6x8c4__neondot);
408   BENCHMARK_QS8_END2END(qs8_gemm_8x8c4__neondot);
409   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neondot);
410   BENCHMARK_QS8_END2END(qs8_gemm_6x16c4__neondot);
411   BENCHMARK_QS8_END2END(qs8_gemm_8x16c4__neondot);
412 #endif  // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
413 
414 
415 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qs8_gemm_2x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)416   static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
417     GEMMEnd2EndBenchmark(state, model,
418       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
419       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
420       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
421       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
422       xnn_init_qs8_conv_minmax_rndnu_neon_params,
423       2 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
424       benchmark::utils::CheckNEON);
425   }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)426   static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
427     GEMMEnd2EndBenchmark(state, model,
428       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
429       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
430       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
431       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
432       xnn_init_qs8_conv_minmax_rndnu_neon_params,
433       2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
434       benchmark::utils::CheckNEON);
435   }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)436   static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
437     GEMMEnd2EndBenchmark(state, model,
438       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
439       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
440       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
441       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
442       xnn_init_qs8_conv_minmax_rndnu_neon_params,
443       3 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
444       benchmark::utils::CheckNEON);
445   }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)446   static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
447     GEMMEnd2EndBenchmark(state, model,
448       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
449       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
450       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
451       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
452       xnn_init_qs8_conv_minmax_rndnu_neon_params,
453       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
454       benchmark::utils::CheckNEON);
455   }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)456   static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
457     GEMMEnd2EndBenchmark(state, model,
458       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
459       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
460       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
461       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
462       xnn_init_qs8_conv_minmax_rndnu_neon_params,
463       4 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
464       benchmark::utils::CheckNEON);
465   }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)466   static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
467     GEMMEnd2EndBenchmark(state, model,
468       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
469       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
470       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
471       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
472       xnn_init_qs8_conv_minmax_rndnu_neon_params,
473       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
474       benchmark::utils::CheckNEON);
475   }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)476   static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
477     GEMMEnd2EndBenchmark(state, model,
478       xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
479       xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
480       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
481       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
482       xnn_init_qs8_conv_minmax_rndnu_neon_params,
483       6 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
484       benchmark::utils::CheckNEON);
485   }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)486   static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
487     GEMMEnd2EndBenchmark(state, model,
488       xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
489       xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
490       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
491       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
492       xnn_init_qs8_conv_minmax_rndnu_neon_params,
493       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
494       benchmark::utils::CheckNEON);
495   }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)496   static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
497     GEMMEnd2EndBenchmark(state, model,
498       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
499       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
500       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
501       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
502       xnn_init_qs8_conv_minmax_rndnu_neon_params,
503       2 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
504       benchmark::utils::CheckNEON);
505   }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)506   static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
507     GEMMEnd2EndBenchmark(state, model,
508       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
509       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
510       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
511       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
512       xnn_init_qs8_conv_minmax_rndnu_neon_params,
513       2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
514       benchmark::utils::CheckNEON);
515   }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)516   static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
517     GEMMEnd2EndBenchmark(state, model,
518       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
519       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
520       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
521       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
522       xnn_init_qs8_conv_minmax_rndnu_neon_params,
523       3 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
524       benchmark::utils::CheckNEON);
525   }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)526   static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
527     GEMMEnd2EndBenchmark(state, model,
528       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
529       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
530       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
531       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
532       xnn_init_qs8_conv_minmax_rndnu_neon_params,
533       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
534       benchmark::utils::CheckNEON);
535   }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)536   static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
537     GEMMEnd2EndBenchmark(state, model,
538       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
539       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
540       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
541       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
542       xnn_init_qs8_conv_minmax_rndnu_neon_params,
543       4 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
544       benchmark::utils::CheckNEON);
545   }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)546   static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
547     GEMMEnd2EndBenchmark(state, model,
548       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
549       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
550       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
551       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
552       xnn_init_qs8_conv_minmax_rndnu_neon_params,
553       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
554       benchmark::utils::CheckNEON);
555   }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)556   static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
557     GEMMEnd2EndBenchmark(state, model,
558       xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
559       xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
560       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
561       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
562       xnn_init_qs8_conv_minmax_rndnu_neon_params,
563       6 /* mr */, 8  /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
564       benchmark::utils::CheckNEON);
565   }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)566   static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
567     GEMMEnd2EndBenchmark(state, model,
568       xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
569       xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
570       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
571       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
572       xnn_init_qs8_conv_minmax_rndnu_neon_params,
573       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
574       benchmark::utils::CheckNEON);
575   }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)576   static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
577     GEMMEnd2EndBenchmark(state, model,
578       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
579       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
580       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
581       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
582       xnn_init_qs8_conv_minmax_rndnu_neon_params,
583       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
584       benchmark::utils::CheckNEON);
585   }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)586   static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
587     GEMMEnd2EndBenchmark(state, model,
588       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
589       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
590       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
591       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
592       xnn_init_qs8_conv_minmax_rndnu_neon_params,
593       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
594       benchmark::utils::CheckNEON);
595   }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)596   static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
597     GEMMEnd2EndBenchmark(state, model,
598       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
599       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
600       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
601       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
602       xnn_init_qs8_conv_minmax_rndnu_neon_params,
603       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
604       benchmark::utils::CheckNEON);
605   }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)606   static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
607     GEMMEnd2EndBenchmark(state, model,
608       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
609       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
610       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
611       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
612       xnn_init_qs8_conv_minmax_rndnu_neon_params,
613       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
614       benchmark::utils::CheckNEON);
615   }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)616   static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
617     GEMMEnd2EndBenchmark(state, model,
618       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
619       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
620       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
621       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
622       xnn_init_qs8_conv_minmax_rndnu_neon_params,
623       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
624       benchmark::utils::CheckNEON);
625   }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)626   static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
627     GEMMEnd2EndBenchmark(state, model,
628       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
629       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
630       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
631       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
632       xnn_init_qs8_conv_minmax_rndnu_neon_params,
633       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
634       benchmark::utils::CheckNEON);
635   }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)636   static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
637     GEMMEnd2EndBenchmark(state, model,
638       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
639       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
640       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
641       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
642       xnn_init_qs8_conv_minmax_rndnu_neon_params,
643       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
644       benchmark::utils::CheckNEON);
645   }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)646   static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
647     GEMMEnd2EndBenchmark(state, model,
648       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
649       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
650       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
651       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
652       xnn_init_qs8_conv_minmax_rndnu_neon_params,
653       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
654       benchmark::utils::CheckNEON);
655   }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)656   static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
657     GEMMEnd2EndBenchmark(state, model,
658       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
659       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
660       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
661       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
662       xnn_init_qs8_conv_minmax_rndnu_neon_params,
663       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
664       benchmark::utils::CheckNEON);
665   }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)666   static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
667     GEMMEnd2EndBenchmark(state, model,
668       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
669       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
670       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
671       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
672       xnn_init_qs8_conv_minmax_rndnu_neon_params,
673       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
674       benchmark::utils::CheckNEON);
675   }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)676   static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
677     GEMMEnd2EndBenchmark(state, model,
678       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
679       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
680       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
681       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
682       xnn_init_qs8_conv_minmax_rndnu_neon_params,
683       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
684       benchmark::utils::CheckNEON);
685   }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)686   static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
687     GEMMEnd2EndBenchmark(state, model,
688       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
689       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
690       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
691       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
692       xnn_init_qs8_conv_minmax_rndnu_neon_params,
693       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
694       benchmark::utils::CheckNEON);
695   }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)696   static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
697     GEMMEnd2EndBenchmark(state, model,
698       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
699       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
700       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
701       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
702       xnn_init_qs8_conv_minmax_rndnu_neon_params,
703       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
704       benchmark::utils::CheckNEON);
705   }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)706   static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
707     GEMMEnd2EndBenchmark(state, model,
708       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
709       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
710       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
711       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
712       xnn_init_qs8_conv_minmax_rndnu_neon_params,
713       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
714       benchmark::utils::CheckNEON);
715   }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)716   static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
717     GEMMEnd2EndBenchmark(state, model,
718       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
719       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
720       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
721       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
722       xnn_init_qs8_conv_minmax_rndnu_neon_params,
723       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
724       benchmark::utils::CheckNEON);
725   }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)726   static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
727     GEMMEnd2EndBenchmark(state, model,
728       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
729       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
730       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
731       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
732       xnn_init_qs8_conv_minmax_rndnu_neon_params,
733       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
734       benchmark::utils::CheckNEON);
735   }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)736   static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
737     GEMMEnd2EndBenchmark(state, model,
738       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
739       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
740       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
741       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
742       xnn_init_qs8_conv_minmax_rndnu_neon_params,
743       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
744       benchmark::utils::CheckNEON);
745   }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)746   static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
747     GEMMEnd2EndBenchmark(state, model,
748       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
749       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
750       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
751       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
752       xnn_init_qs8_conv_minmax_rndnu_neon_params,
753       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
754       benchmark::utils::CheckNEON);
755   }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)756   static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
757     GEMMEnd2EndBenchmark(state, model,
758       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
759       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
760       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
761       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
762       xnn_init_qs8_conv_minmax_rndnu_neon_params,
763       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
764       benchmark::utils::CheckNEON);
765   }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)766   static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
767     GEMMEnd2EndBenchmark(state, model,
768       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
769       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
770       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
771       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
772       xnn_init_qs8_conv_minmax_rndnu_neon_params,
773       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
774       benchmark::utils::CheckNEON);
775   }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)776   static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
777     GEMMEnd2EndBenchmark(state, model,
778       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
779       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
780       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
781       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
782       xnn_init_qs8_conv_minmax_rndnu_neon_params,
783       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
784       benchmark::utils::CheckNEON);
785   }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)786   static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
787     GEMMEnd2EndBenchmark(state, model,
788       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
789       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
790       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
791       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
792       xnn_init_qs8_conv_minmax_rndnu_neon_params,
793       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
794       benchmark::utils::CheckNEON);
795   }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)796   static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
797     GEMMEnd2EndBenchmark(state, model,
798       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
799       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
800       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
801       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
802       xnn_init_qs8_conv_minmax_rndnu_neon_params,
803       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
804       benchmark::utils::CheckNEON);
805   }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)806   static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
807     GEMMEnd2EndBenchmark(state, model,
808       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
809       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
810       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
811       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
812       xnn_init_qs8_conv_minmax_rndnu_neon_params,
813       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
814       benchmark::utils::CheckNEON);
815   }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)816   static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
817     GEMMEnd2EndBenchmark(state, model,
818       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
819       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
820       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
821       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
822       xnn_init_qs8_conv_minmax_rndnu_neon_params,
823       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
824       benchmark::utils::CheckNEON);
825   }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)826   static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
827     GEMMEnd2EndBenchmark(state, model,
828       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
829       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
830       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
831       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
832       xnn_init_qs8_conv_minmax_rndnu_neon_params,
833       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
834       benchmark::utils::CheckNEON);
835   }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)836   static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
837     GEMMEnd2EndBenchmark(state, model,
838       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
839       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
840       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
841       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
842       xnn_init_qs8_conv_minmax_rndnu_neon_params,
843       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
844       benchmark::utils::CheckNEON);
845   }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)846   static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
847     GEMMEnd2EndBenchmark(state, model,
848       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
849       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
850       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
851       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
852       xnn_init_qs8_conv_minmax_rndnu_neon_params,
853       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
854       benchmark::utils::CheckNEON);
855   }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)856   static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
857     GEMMEnd2EndBenchmark(state, model,
858       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
859       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
860       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
861       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
862       xnn_init_qs8_conv_minmax_rndnu_neon_params,
863       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
864       benchmark::utils::CheckNEON);
865   }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)866   static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
867     GEMMEnd2EndBenchmark(state, model,
868       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
869       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
870       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
871       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
872       xnn_init_qs8_conv_minmax_rndnu_neon_params,
873       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
874       benchmark::utils::CheckNEON);
875   }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)876   static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
877     GEMMEnd2EndBenchmark(state, model,
878       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
879       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
880       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
881       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
882       xnn_init_qs8_conv_minmax_rndnu_neon_params,
883       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
884       benchmark::utils::CheckNEON);
885   }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)886   static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
887     GEMMEnd2EndBenchmark(state, model,
888       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
889       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
890       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
891       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
892       xnn_init_qs8_conv_minmax_rndnu_neon_params,
893       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
894       benchmark::utils::CheckNEON);
895   }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)896   static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
897     GEMMEnd2EndBenchmark(state, model,
898       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
899       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
900       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
901       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
902       xnn_init_qs8_conv_minmax_rndnu_neon_params,
903       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
904       benchmark::utils::CheckNEON);
905   }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)906   static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
907     GEMMEnd2EndBenchmark(state, model,
908       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
909       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
910       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
911       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
912       xnn_init_qs8_conv_minmax_rndnu_neon_params,
913       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
914       benchmark::utils::CheckNEON);
915   }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)916   static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
917     GEMMEnd2EndBenchmark(state, model,
918       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
919       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
920       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
921       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
922       xnn_init_qs8_conv_minmax_rndnu_neon_params,
923       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
924       benchmark::utils::CheckNEON);
925   }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)926   static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
927     GEMMEnd2EndBenchmark(state, model,
928       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
929       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
930       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
931       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
932       xnn_init_qs8_conv_minmax_rndnu_neon_params,
933       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
934       benchmark::utils::CheckNEON);
935   }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)936   static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
937     GEMMEnd2EndBenchmark(state, model,
938       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
939       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
940       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
941       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
942       xnn_init_qs8_conv_minmax_rndnu_neon_params,
943       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
944       benchmark::utils::CheckNEON);
945   }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)946   static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
947     GEMMEnd2EndBenchmark(state, model,
948       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
949       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
950       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
951       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
952       xnn_init_qs8_conv_minmax_rndnu_neon_params,
953       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
954       benchmark::utils::CheckNEON);
955   }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)956   static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
957     GEMMEnd2EndBenchmark(state, model,
958       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
959       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
960       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
961       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
962       xnn_init_qs8_conv_minmax_rndnu_neon_params,
963       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
964       benchmark::utils::CheckNEON);
965   }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)966   static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
967     GEMMEnd2EndBenchmark(state, model,
968       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
969       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
970       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
971       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
972       xnn_init_qs8_conv_minmax_rndnu_neon_params,
973       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
974       benchmark::utils::CheckNEON);
975   }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)976   static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
977     GEMMEnd2EndBenchmark(state, model,
978       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
979       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
980       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
981       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
982       xnn_init_qs8_conv_minmax_rndnu_neon_params,
983       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
984       benchmark::utils::CheckNEON);
985   }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)986   static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
987     GEMMEnd2EndBenchmark(state, model,
988       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
989       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
990       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
991       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
992       xnn_init_qs8_conv_minmax_rndnu_neon_params,
993       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
994       benchmark::utils::CheckNEON);
995   }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)996   static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
997     GEMMEnd2EndBenchmark(state, model,
998       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
999       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
1000       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1001       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1002       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1003       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1004       benchmark::utils::CheckNEON);
1005   }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1006   static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1007     GEMMEnd2EndBenchmark(state, model,
1008       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
1009       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
1010       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1011       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1012       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1013       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1014       benchmark::utils::CheckNEON);
1015   }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1016   static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1017     GEMMEnd2EndBenchmark(state, model,
1018       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
1019       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
1020       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1021       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1022       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1023       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1024       benchmark::utils::CheckNEON);
1025   }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1026   static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1027     GEMMEnd2EndBenchmark(state, model,
1028       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
1029       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
1030       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1031       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1032       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1033       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1034       benchmark::utils::CheckNEON);
1035   }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1036   static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1037     GEMMEnd2EndBenchmark(state, model,
1038       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
1039       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
1040       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1041       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1042       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1043       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1044       benchmark::utils::CheckNEON);
1045   }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1046   static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1047     GEMMEnd2EndBenchmark(state, model,
1048       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
1049       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
1050       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1051       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1052       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1053       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1054       benchmark::utils::CheckNEON);
1055   }
qs8_gemm_2x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1056   static void qs8_gemm_2x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1057     GEMMEnd2EndBenchmark(state, model,
1058       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
1059       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
1060       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1061       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1062       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1063       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1064       benchmark::utils::CheckNEON);
1065   }
qs8_gemm_2x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1066   static void qs8_gemm_2x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1067     GEMMEnd2EndBenchmark(state, model,
1068       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
1069       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
1070       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1071       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1072       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1073       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1074       benchmark::utils::CheckNEON);
1075   }
qs8_gemm_3x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1076   static void qs8_gemm_3x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1077     GEMMEnd2EndBenchmark(state, model,
1078       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
1079       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
1080       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1081       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1082       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1083       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1084       benchmark::utils::CheckNEON);
1085   }
qs8_gemm_3x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1086   static void qs8_gemm_3x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1087     GEMMEnd2EndBenchmark(state, model,
1088       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1089       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1090       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1091       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1092       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1093       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1094       benchmark::utils::CheckNEON);
1095   }
qs8_gemm_4x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1096   static void qs8_gemm_4x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1097     GEMMEnd2EndBenchmark(state, model,
1098       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1099       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1100       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1101       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1102       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1103       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1104       benchmark::utils::CheckNEON);
1105   }
qs8_gemm_4x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1106   static void qs8_gemm_4x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1107     GEMMEnd2EndBenchmark(state, model,
1108       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1109       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1110       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1111       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1112       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1113       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1114       benchmark::utils::CheckNEON);
1115   }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1116   static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1117     GEMMEnd2EndBenchmark(state, model,
1118       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1119       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1120       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1121       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1122       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1123       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1124       benchmark::utils::CheckNEON);
1125   }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1126   static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1127     GEMMEnd2EndBenchmark(state, model,
1128       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1129       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1130       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1131       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1132       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1133       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1134       benchmark::utils::CheckNEON);
1135   }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1136   static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1137     GEMMEnd2EndBenchmark(state, model,
1138       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1139       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1140       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1141       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1142       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1143       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1144       benchmark::utils::CheckNEON);
1145   }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1146   static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1147     GEMMEnd2EndBenchmark(state, model,
1148       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1149       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1150       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1151       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1152       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1153       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1154       benchmark::utils::CheckNEON);
1155   }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1156   static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1157     GEMMEnd2EndBenchmark(state, model,
1158       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1159       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1160       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1161       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1162       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1163       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1164       benchmark::utils::CheckNEON);
1165   }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1166   static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1167     GEMMEnd2EndBenchmark(state, model,
1168       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1169       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1170       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1171       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1172       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1173       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1174       benchmark::utils::CheckNEON);
1175   }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1176   static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1177     GEMMEnd2EndBenchmark(state, model,
1178       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1179       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1180       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1181       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1182       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1183       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1184       benchmark::utils::CheckNEON);
1185   }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1186   static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1187     GEMMEnd2EndBenchmark(state, model,
1188       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1189       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1190       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1191       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1192       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1193       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1194       benchmark::utils::CheckNEON);
1195   }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1196   static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1197     GEMMEnd2EndBenchmark(state, model,
1198       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1199       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1200       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1201       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1202       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1203       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1204       benchmark::utils::CheckNEON);
1205   }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1206   static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1207     GEMMEnd2EndBenchmark(state, model,
1208       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1209       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1210       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1211       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1212       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1213       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1214       benchmark::utils::CheckNEON);
1215   }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1216   static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1217     GEMMEnd2EndBenchmark(state, model,
1218       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1219       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1220       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1221       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1222       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1223       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1224       benchmark::utils::CheckNEON);
1225   }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1226   static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1227     GEMMEnd2EndBenchmark(state, model,
1228       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1229       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1230       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1231       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1232       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1233       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1234       benchmark::utils::CheckNEON);
1235   }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1236   static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1237     GEMMEnd2EndBenchmark(state, model,
1238       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1239       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1240       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1241       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1242       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1243       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1244       benchmark::utils::CheckNEON);
1245   }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1246   static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1247     GEMMEnd2EndBenchmark(state, model,
1248       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1249       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1250       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1251       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1252       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1253       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1254       benchmark::utils::CheckNEON);
1255   }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1256   static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1257     GEMMEnd2EndBenchmark(state, model,
1258       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1259       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1260       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1261       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1262       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1263       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1264       benchmark::utils::CheckNEON);
1265   }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1266   static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1267     GEMMEnd2EndBenchmark(state, model,
1268       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1269       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1270       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1271       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1272       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1273       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1274       benchmark::utils::CheckNEON);
1275   }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1276   static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1277     GEMMEnd2EndBenchmark(state, model,
1278       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1279       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1280       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1281       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1282       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1283       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1284       benchmark::utils::CheckNEON);
1285   }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1286   static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1287     GEMMEnd2EndBenchmark(state, model,
1288       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1289       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1290       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1291       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1292       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1293       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1294       benchmark::utils::CheckNEON);
1295   }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1296   static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1297     GEMMEnd2EndBenchmark(state, model,
1298       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1299       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1300       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1301       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1302       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1303       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1304       benchmark::utils::CheckNEON);
1305   }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1306   static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1307     GEMMEnd2EndBenchmark(state, model,
1308       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1309       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1310       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1311       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1312       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1313       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1314       benchmark::utils::CheckNEON);
1315   }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1316   static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1317     GEMMEnd2EndBenchmark(state, model,
1318       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1319       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1320       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1321       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1322       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1323       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1324       benchmark::utils::CheckNEON);
1325   }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1326   static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1327     GEMMEnd2EndBenchmark(state, model,
1328       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1329       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1330       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1331       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1332       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1333       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1334       benchmark::utils::CheckNEON);
1335   }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1336   static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1337     GEMMEnd2EndBenchmark(state, model,
1338       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1339       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1340       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1341       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1342       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1343       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1344       benchmark::utils::CheckNEON);
1345   }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1346   static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1347     GEMMEnd2EndBenchmark(state, model,
1348       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1349       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1350       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1351       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1352       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1353       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1354       benchmark::utils::CheckNEON);
1355   }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1356   static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1357     GEMMEnd2EndBenchmark(state, model,
1358       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1359       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1360       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1361       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1362       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1363       2 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1364       benchmark::utils::CheckNEON);
1365   }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1366   static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1367     GEMMEnd2EndBenchmark(state, model,
1368       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1369       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1370       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1371       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1372       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1373       2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1374       benchmark::utils::CheckNEON);
1375   }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1376   static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1377     GEMMEnd2EndBenchmark(state, model,
1378       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1379       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1380       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1381       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1382       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1383       3 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1384       benchmark::utils::CheckNEON);
1385   }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1386   static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1387     GEMMEnd2EndBenchmark(state, model,
1388       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1389       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1390       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1391       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1392       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1393       3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1394       benchmark::utils::CheckNEON);
1395   }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1396   static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1397     GEMMEnd2EndBenchmark(state, model,
1398       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1399       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1400       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1401       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1402       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1403       4 /* mr */, 8  /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1404       benchmark::utils::CheckNEON);
1405   }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1406   static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1407     GEMMEnd2EndBenchmark(state, model,
1408       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1409       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1410       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1411       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1412       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1413       4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1414       benchmark::utils::CheckNEON);
1415   }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1416   static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1417     GEMMEnd2EndBenchmark(state, model,
1418       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1419       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1420       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1421       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1422       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1423       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1424       benchmark::utils::CheckNEON);
1425   }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1426   static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1427     GEMMEnd2EndBenchmark(state, model,
1428       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1429       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1430       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1431       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1432       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1433       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1434       benchmark::utils::CheckNEON);
1435   }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1436   static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1437     GEMMEnd2EndBenchmark(state, model,
1438       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1439       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1440       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1441       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1442       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1443       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1444       benchmark::utils::CheckNEON);
1445   }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1446   static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1447     GEMMEnd2EndBenchmark(state, model,
1448       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1449       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1450       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1451       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1452       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1453       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1454       benchmark::utils::CheckNEON);
1455   }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1456   static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1457     GEMMEnd2EndBenchmark(state, model,
1458       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1459       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1460       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1461       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1462       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1463       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1464       benchmark::utils::CheckNEON);
1465   }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1466   static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1467     GEMMEnd2EndBenchmark(state, model,
1468       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1469       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1470       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1471       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1472       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1473       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1474       benchmark::utils::CheckNEON);
1475   }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1476   static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1477     GEMMEnd2EndBenchmark(state, model,
1478       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1479       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1480       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1481       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1482       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1483       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1484       benchmark::utils::CheckNEON);
1485   }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1486   static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1487     GEMMEnd2EndBenchmark(state, model,
1488       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1489       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1490       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1491       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1492       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1493       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1494       benchmark::utils::CheckNEON);
1495   }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1496   static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1497     GEMMEnd2EndBenchmark(state, model,
1498       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1499       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1500       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1501       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1502       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1503       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1504       benchmark::utils::CheckNEON);
1505   }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1506   static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1507     GEMMEnd2EndBenchmark(state, model,
1508       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1509       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1510       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1511       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1512       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1513       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1514       benchmark::utils::CheckNEON);
1515   }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1516   static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1517     GEMMEnd2EndBenchmark(state, model,
1518       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1519       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1520       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1521       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1522       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1523       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1524       benchmark::utils::CheckNEON);
1525   }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1526   static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1527     GEMMEnd2EndBenchmark(state, model,
1528       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1529       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1530       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1531       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1532       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1533       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1534       benchmark::utils::CheckNEON);
1535   }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1536   static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1537     GEMMEnd2EndBenchmark(state, model,
1538       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1539       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1540       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1541       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1542       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1543       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1544       benchmark::utils::CheckNEON);
1545   }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1546   static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1547     GEMMEnd2EndBenchmark(state, model,
1548       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1549       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1550       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1551       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1552       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1553       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1554       benchmark::utils::CheckNEON);
1555   }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1556   static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1557     GEMMEnd2EndBenchmark(state, model,
1558       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1559       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1560       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1561       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1562       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1563       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1564       benchmark::utils::CheckNEON);
1565   }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1566   static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1567     GEMMEnd2EndBenchmark(state, model,
1568       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1569       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1570       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1571       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1572       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1573       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1574       benchmark::utils::CheckNEON);
1575   }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1576   static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1577     GEMMEnd2EndBenchmark(state, model,
1578       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1579       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1580       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1581       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1582       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1583       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1584       benchmark::utils::CheckNEON);
1585   }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1586   static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1587     GEMMEnd2EndBenchmark(state, model,
1588       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1589       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1590       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1591       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1592       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1593       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1594       benchmark::utils::CheckNEON);
1595   }
qs8_gemm_2x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1596   static void qs8_gemm_2x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1597     GEMMEnd2EndBenchmark(state, model,
1598       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1599       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1600       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1601       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1602       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1603       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1604       benchmark::utils::CheckNEON);
1605   }
qs8_gemm_2x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1606   static void qs8_gemm_2x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1607     GEMMEnd2EndBenchmark(state, model,
1608       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1609       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1610       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1611       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1612       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1613       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1614       benchmark::utils::CheckNEON);
1615   }
qs8_gemm_3x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1616   static void qs8_gemm_3x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1617     GEMMEnd2EndBenchmark(state, model,
1618       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1619       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1620       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1621       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1622       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1623       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1624       benchmark::utils::CheckNEON);
1625   }
qs8_gemm_3x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1626   static void qs8_gemm_3x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1627     GEMMEnd2EndBenchmark(state, model,
1628       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1629       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1630       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1631       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1632       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1633       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1634       benchmark::utils::CheckNEON);
1635   }
qs8_gemm_4x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1636   static void qs8_gemm_4x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1637     GEMMEnd2EndBenchmark(state, model,
1638       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1639       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1640       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1641       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1642       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1643       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1644       benchmark::utils::CheckNEON);
1645   }
qs8_gemm_4x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1646   static void qs8_gemm_4x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1647     GEMMEnd2EndBenchmark(state, model,
1648       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1649       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1650       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1651       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1652       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1653       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1654       benchmark::utils::CheckNEON);
1655   }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1656   static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1657     GEMMEnd2EndBenchmark(state, model,
1658       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1659       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1660       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1661       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1662       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1663       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1664       benchmark::utils::CheckNEON);
1665   }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1666   static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1667     GEMMEnd2EndBenchmark(state, model,
1668       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1669       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1670       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1671       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1672       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1673       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1674       benchmark::utils::CheckNEON);
1675   }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1676   static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1677     GEMMEnd2EndBenchmark(state, model,
1678       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1679       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1680       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1681       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1682       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1683       3 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1684       benchmark::utils::CheckNEON);
1685   }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1686   static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1687     GEMMEnd2EndBenchmark(state, model,
1688       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1689       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1690       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1691       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1692       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1693       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1694       benchmark::utils::CheckNEON);
1695   }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1696   static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1697     GEMMEnd2EndBenchmark(state, model,
1698       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1699       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1700       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1701       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1702       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1703       4 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1704       benchmark::utils::CheckNEON);
1705   }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1706   static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1707     GEMMEnd2EndBenchmark(state, model,
1708       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1709       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1710       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1711       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1712       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1713       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1714       benchmark::utils::CheckNEON);
1715   }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1716   static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1717     GEMMEnd2EndBenchmark(state, model,
1718       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1719       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1720       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1721       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1722       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1723       2 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1724       benchmark::utils::CheckNEON);
1725   }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1726   static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1727     GEMMEnd2EndBenchmark(state, model,
1728       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1729       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1730       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1731       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1732       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1733       2 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1734       benchmark::utils::CheckNEON);
1735   }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1736   static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1737     GEMMEnd2EndBenchmark(state, model,
1738       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1739       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1740       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1741       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1742       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1743       4 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1744       benchmark::utils::CheckNEON);
1745   }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1746   static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1747     GEMMEnd2EndBenchmark(state, model,
1748       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1749       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1750       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1751       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1752       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1753       4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1754       benchmark::utils::CheckNEON);
1755   }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1756   static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1757     GEMMEnd2EndBenchmark(state, model,
1758       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1759       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1760       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1761       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1762       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1763       4 /* mr */, 8  /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1764       benchmark::utils::CheckNEON);
1765   }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1766   static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1767     GEMMEnd2EndBenchmark(state, model,
1768       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1769       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1770       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1771       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1772       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1773       4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1774       benchmark::utils::CheckNEON);
1775   }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1776   static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1777     GEMMEnd2EndBenchmark(state, model,
1778       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1779       xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1780       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1781       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1782       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1783       2 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1784       benchmark::utils::CheckNEON);
1785   }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1786   static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1787     GEMMEnd2EndBenchmark(state, model,
1788       xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1789       xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1790       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1791       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1792       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1793       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1794       benchmark::utils::CheckNEON);
1795   }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1796   static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1797     GEMMEnd2EndBenchmark(state, model,
1798       xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1799       xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1800       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1801       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1802       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1803       3 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1804       benchmark::utils::CheckNEON);
1805   }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1806   static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1807     GEMMEnd2EndBenchmark(state, model,
1808       xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1809       xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1810       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1811       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1812       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1813       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1814       benchmark::utils::CheckNEON);
1815   }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1816   static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1817     GEMMEnd2EndBenchmark(state, model,
1818       xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1819       xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1820       xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1821       xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1822       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1823       4 /* mr */, 8  /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1824       benchmark::utils::CheckNEON);
1825   }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1826   static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1827     GEMMEnd2EndBenchmark(state, model,
1828       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1829       xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1830       xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1831       xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1832       xnn_init_qs8_conv_minmax_rndnu_neon_params,
1833       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1834       benchmark::utils::CheckNEON);
1835   }
1836 
1837   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mlal);
1838   BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mlal);
1839   BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mlal);
1840   BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mlal);
1841   BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mlal);
1842   BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mlal);
1843 
1844   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mull);
1845   BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mull);
1846   BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mull);
1847   BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mull);
1848   BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mull);
1849   BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mull);
1850 
1851   BENCHMARK_QS8_END2END(qs8_gemm_2x8c16__neon_mlal);
1852   BENCHMARK_QS8_END2END(qs8_gemm_2x16c16__neon_mlal);
1853   BENCHMARK_QS8_END2END(qs8_gemm_3x8c16__neon_mlal);
1854   BENCHMARK_QS8_END2END(qs8_gemm_3x16c16__neon_mlal);
1855   BENCHMARK_QS8_END2END(qs8_gemm_4x8c16__neon_mlal);
1856   BENCHMARK_QS8_END2END(qs8_gemm_4x16c16__neon_mlal);
1857 
1858   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_dup);
1859   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_dup);
1860   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_dup);
1861   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_dup);
1862   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_dup);
1863   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_dup);
1864 
1865   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_dup);
1866   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_dup);
1867   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_dup);
1868   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_dup);
1869   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_dup);
1870   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_dup);
1871 
1872   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld1r);
1873   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld1r);
// Each line below hands one benchmark wrapper to the BENCHMARK_QS8_END2END macro.
// NOTE(review): the macro is defined outside this excerpt; presumably it registers
// the wrapper with the benchmark framework -- confirm at the macro definition.
// Groups are labelled by the suffix of the wrapper names they contain.

// `c4__neon_mlal_ld1r` wrappers (3x8 through 4x16).
1874   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld1r);
1875   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld1r);
1876   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld1r);
1877   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld1r);
1878 
// `c4__neon_mull_ld1r` wrappers.
1879   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld1r);
1880   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld1r);
1881   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld1r);
1882   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld1r);
1883   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld1r);
1884   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld1r);
1885 
// `c4__neon_mlal_ld2r` wrappers.
1886   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld2r);
1887   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld2r);
1888   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld2r);
1889   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld2r);
1890   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld2r);
1891   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld2r);
1892 
// `c4__neon_mull_ld2r` wrappers.
1893   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld2r);
1894   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld2r);
1895   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld2r);
1896   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld2r);
1897   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld2r);
1898   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld2r);
1899 
// `c4s2__neon_mlal` wrappers.
1900   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mlal);
1901   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mlal);
1902   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mlal);
1903   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mlal);
1904   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mlal);
1905   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mlal);
1906 
// `c4s2__neon_mull` wrappers.
1907   BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mull);
1908   BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mull);
1909   BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mull);
1910   BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mull);
1911   BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mull);
1912   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mull);
1913 
// `c2__neon_mlal_dup` wrappers.
1914   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_dup);
1915   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_dup);
1916   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_dup);
1917   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_dup);
1918   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_dup);
1919   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_dup);
1920 
// `c2__neon_mull_dup` wrappers.
1921   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_dup);
1922   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_dup);
1923   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_dup);
1924   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_dup);
1925   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_dup);
1926   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_dup);
1927 
// `c2__neon_mlal_ld1r` wrappers.
1928   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld1r);
1929   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld1r);
1930   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld1r);
1931   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld1r);
1932   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld1r);
1933   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld1r);
1934 
// `c2__neon_mull_ld1r` wrappers.
1935   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld1r);
1936   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld1r);
1937   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld1r);
1938   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld1r);
1939   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld1r);
1940   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld1r);
1941 
// `c2__neon_mlal_ld2r` wrappers.
1942   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld2r);
1943   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld2r);
1944   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld2r);
1945   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld2r);
1946   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld2r);
1947   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld2r);
1948 
// `c2__neon_mull_ld2r` wrappers.
1949   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld2r);
1950   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld2r);
1951   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld2r);
1952   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld2r);
1953   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld2r);
1954   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld2r);
1955 
// `c2__neon_mlal_ld4r` wrappers.
1956   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld4r);
1957   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld4r);
1958   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld4r);
1959   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld4r);
1960   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld4r);
1961   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld4r);
1962 
// `c2__neon_mull_ld4r` wrappers.
1963   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld4r);
1964   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld4r);
1965   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld4r);
1966   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld4r);
1967   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld4r);
1968   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld4r);
1969 
// `c2s4__neon_mlal` wrappers.
1970   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mlal);
1971   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mlal);
1972   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mlal);
1973   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mlal);
1974   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mlal);
1975   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mlal);
1976 
// `c2s4__neon_mull` wrappers.
1977   BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mull);
1978   BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mull);
1979   BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mull);
1980   BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mull);
1981   BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mull);
1982   BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mull);
1983 
// `__neon_mlal_lane` wrappers (this family also includes 6-row tiles).
1984   BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane);
1985   BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane);
1986   BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane);
1987   BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane);
1988   BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane);
1989   BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane);
1990   BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane);
1991   BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane);
1992 
// `__neon_mlal_lane_prfm` wrappers (same tile set as the `mlal_lane` group).
1993   BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane_prfm);
1994   BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane_prfm);
1995   BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane_prfm);
1996   BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane_prfm);
1997   BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane_prfm);
1998   BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane_prfm);
1999   BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane_prfm);
2000   BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane_prfm);
2001 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2002 
2003 
2004 #if XNN_ARCH_ARM
qs8_gemm_1x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2005   static void qs8_gemm_1x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2006     GEMMEnd2EndBenchmark(state, model,
2007       xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2008       xnn_qs8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2009       xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2010       xnn_qs8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2011       xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2012       1 /* mr */, 1  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2013       benchmark::utils::CheckARMV6);
2014   }
qs8_gemm_2x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2015   static void qs8_gemm_2x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2016     GEMMEnd2EndBenchmark(state, model,
2017       xnn_qs8_gemm_minmax_fp32_ukernel_2x1c4__armsimd32,
2018       xnn_qs8_igemm_minmax_fp32_ukernel_2x1c4__armsimd32,
2019       xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2020       xnn_qs8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2021       xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2022       2 /* mr */, 1  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2023       benchmark::utils::CheckARMV6);
2024   }
qs8_gemm_1x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2025   static void qs8_gemm_1x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2026     GEMMEnd2EndBenchmark(state, model,
2027       xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2028       xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2029       xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2030       xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2031       xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2032       1 /* mr */, 2  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2033       benchmark::utils::CheckARMV6);
2034   }
qs8_gemm_2x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2035   static void qs8_gemm_2x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2036     GEMMEnd2EndBenchmark(state, model,
2037       xnn_qs8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32,
2038       xnn_qs8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32,
2039       xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2040       xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2041       xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2042       2 /* mr */, 2  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2043       benchmark::utils::CheckARMV6);
2044   }
2045 
// Hand the four `armsimd32` wrappers defined above to BENCHMARK_QS8_END2END.
// NOTE(review): the macro is defined outside this excerpt; presumably it registers
// each wrapper with the benchmark framework -- confirm at the macro definition.
2046   BENCHMARK_QS8_END2END(qs8_gemm_1x1c4__armsimd32);
2047   BENCHMARK_QS8_END2END(qs8_gemm_2x1c4__armsimd32);
2048   BENCHMARK_QS8_END2END(qs8_gemm_1x2c4__armsimd32);
2049   BENCHMARK_QS8_END2END(qs8_gemm_2x2c4__armsimd32);
2050 #endif  // XNN_ARCH_ARM
2051 
2052 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2053   static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2054     GEMMEnd2EndBenchmark(state, model,
2055       xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
2056       xnn_qs8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
2057       xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2058       xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2059       xnn_init_qs8_conv_minmax_fp32_avx512_params,
2060       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2061       benchmark::utils::CheckAVX512F);
2062   }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2063   static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2064     GEMMEnd2EndBenchmark(state, model,
2065       xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
2066       xnn_qs8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
2067       xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2068       xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2069       xnn_init_qs8_conv_minmax_fp32_avx512_params,
2070       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2071       benchmark::utils::CheckAVX512F);
2072   }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2073   static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2074     GEMMEnd2EndBenchmark(state, model,
2075       xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2076       xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2077       xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2078       xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2079       xnn_init_qs8_conv_minmax_fp32_avx512_params,
2080       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2081       benchmark::utils::CheckAVX512F);
2082   }
qs8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2083   static void qs8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2084     GEMMEnd2EndBenchmark(state, model,
2085       xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
2086       xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
2087       xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2088       xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2089       xnn_init_qs8_conv_minmax_fp32_avx2_params,
2090       2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2091       benchmark::utils::CheckAVX2);
2092   }
qs8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2093   static void qs8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2094     GEMMEnd2EndBenchmark(state, model,
2095       xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
2096       xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
2097       xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2098       xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2099       xnn_init_qs8_conv_minmax_fp32_avx2_params,
2100       3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2101       benchmark::utils::CheckAVX2);
2102   }
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2103   static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2104     GEMMEnd2EndBenchmark(state, model,
2105       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2106       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2107       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2108       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2109       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2110       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2111       benchmark::utils::CheckXOP);
2112   }
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2113   static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2114     GEMMEnd2EndBenchmark(state, model,
2115       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2116       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2117       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2118       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2119       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2120       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2121       benchmark::utils::CheckXOP);
2122   }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2123   static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2124     GEMMEnd2EndBenchmark(state, model,
2125       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2126       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2127       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2128       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2129       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2130       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2131       benchmark::utils::CheckXOP);
2132   }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2133   static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2134     GEMMEnd2EndBenchmark(state, model,
2135       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2136       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2137       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2138       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2139       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2140       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2141       benchmark::utils::CheckXOP);
2142   }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2143   static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2144     GEMMEnd2EndBenchmark(state, model,
2145       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2146       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2147       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2148       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2149       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2150       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2151       benchmark::utils::CheckXOP);
2152   }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2153   static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2154     GEMMEnd2EndBenchmark(state, model,
2155       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2156       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2157       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2158       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2159       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2160       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2161       benchmark::utils::CheckXOP);
2162   }
qs8_gemm_2x4c2s4__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2163   static void qs8_gemm_2x4c2s4__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2164     GEMMEnd2EndBenchmark(state, model,
2165       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
2166       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
2167       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2168       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2169       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2170       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2171       benchmark::utils::CheckXOP);
2172   }
qs8_gemm_2x4c2s4__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2173   static void qs8_gemm_2x4c2s4__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2174     GEMMEnd2EndBenchmark(state, model,
2175       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld128,
2176       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__xop_ld128,
2177       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2178       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2179       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2180       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2181       benchmark::utils::CheckXOP);
2182   }
qs8_gemm_3x4c2s4__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2183   static void qs8_gemm_3x4c2s4__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2184     GEMMEnd2EndBenchmark(state, model,
2185       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld64,
2186       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__xop_ld64,
2187       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2188       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2189       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2190       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2191       benchmark::utils::CheckXOP);
2192   }
qs8_gemm_3x4c2s4__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2193   static void qs8_gemm_3x4c2s4__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2194     GEMMEnd2EndBenchmark(state, model,
2195       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
2196       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
2197       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2198       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2199       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2200       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2201       benchmark::utils::CheckXOP);
2202   }
qs8_gemm_4x4c2s4__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2203   static void qs8_gemm_4x4c2s4__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2204     GEMMEnd2EndBenchmark(state, model,
2205       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
2206       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
2207       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2208       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2209       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2210       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2211       benchmark::utils::CheckXOP);
2212   }
qs8_gemm_4x4c2s4__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2213   static void qs8_gemm_4x4c2s4__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2214     GEMMEnd2EndBenchmark(state, model,
2215       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128,
2216       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128,
2217       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2218       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2219       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2220       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2221       benchmark::utils::CheckXOP);
2222   }
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2223   static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2224     GEMMEnd2EndBenchmark(state, model,
2225       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2226       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2227       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2228       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2229       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2230       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2231       benchmark::utils::CheckXOP);
2232   }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2233   static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2234     GEMMEnd2EndBenchmark(state, model,
2235       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2236       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2237       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2238       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2239       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2240       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2241       benchmark::utils::CheckXOP);
2242   }
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2243   static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2244     GEMMEnd2EndBenchmark(state, model,
2245       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2246       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2247       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2248       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2249       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2250       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2251       benchmark::utils::CheckXOP);
2252   }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2253   static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2254     GEMMEnd2EndBenchmark(state, model,
2255       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2256       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2257       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2258       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2259       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2260       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2261       benchmark::utils::CheckXOP);
2262   }
2263 
2264 
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2265   static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2266     GEMMEnd2EndBenchmark(state, model,
2267       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2268       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2269       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2270       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2271       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2272       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2273       benchmark::utils::CheckAVX);
2274   }
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2275   static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2276     GEMMEnd2EndBenchmark(state, model,
2277       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2278       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2279       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2280       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2281       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2282       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2283       benchmark::utils::CheckAVX);
2284   }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2285   static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2286     GEMMEnd2EndBenchmark(state, model,
2287       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2288       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2289       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2290       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2291       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2292       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2293       benchmark::utils::CheckAVX);
2294   }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2295   static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2296     GEMMEnd2EndBenchmark(state, model,
2297       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2298       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2299       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2300       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2301       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2302       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2303       benchmark::utils::CheckAVX);
2304   }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2305   static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2306     GEMMEnd2EndBenchmark(state, model,
2307       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2308       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2309       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2310       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2311       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2312       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2313       benchmark::utils::CheckAVX);
2314   }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2315   static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2316     GEMMEnd2EndBenchmark(state, model,
2317       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2318       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2319       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2320       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2321       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2322       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2323       benchmark::utils::CheckAVX);
2324   }
qs8_gemm_2x4c2s4__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2325   static void qs8_gemm_2x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2326     GEMMEnd2EndBenchmark(state, model,
2327       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
2328       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
2329       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2330       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2331       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2332       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2333       benchmark::utils::CheckAVX);
2334   }
qs8_gemm_2x4c2s4__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2335   static void qs8_gemm_2x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2336     GEMMEnd2EndBenchmark(state, model,
2337       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
2338       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
2339       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2340       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2341       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2342       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2343       benchmark::utils::CheckAVX);
2344   }
qs8_gemm_3x4c2s4__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2345   static void qs8_gemm_3x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2346     GEMMEnd2EndBenchmark(state, model,
2347       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64,
2348       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64,
2349       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2350       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2351       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2352       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2353       benchmark::utils::CheckAVX);
2354   }
qs8_gemm_3x4c2s4__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2355   static void qs8_gemm_3x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2356     GEMMEnd2EndBenchmark(state, model,
2357       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128,
2358       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128,
2359       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2360       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2361       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2362       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2363       benchmark::utils::CheckAVX);
2364   }
qs8_gemm_4x4c2s4__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2365   static void qs8_gemm_4x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2366     GEMMEnd2EndBenchmark(state, model,
2367       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
2368       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
2369       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2370       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2371       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2372       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2373       benchmark::utils::CheckAVX);
2374   }
qs8_gemm_4x4c2s4__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2375   static void qs8_gemm_4x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2376     GEMMEnd2EndBenchmark(state, model,
2377       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128,
2378       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128,
2379       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2380       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2381       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2382       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2383       benchmark::utils::CheckAVX);
2384   }
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2385   static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2386     GEMMEnd2EndBenchmark(state, model,
2387       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2388       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2389       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2390       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2391       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2392       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2393       benchmark::utils::CheckAVX);
2394   }
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2395   static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2396     GEMMEnd2EndBenchmark(state, model,
2397       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2398       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2399       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2400       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2401       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2402       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2403       benchmark::utils::CheckAVX);
2404   }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2405   static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2406     GEMMEnd2EndBenchmark(state, model,
2407       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2408       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2409       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2410       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2411       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2412       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2413       benchmark::utils::CheckAVX);
2414   }
// End-to-end QS8 benchmark wrapper for the 3x4c8 AVX ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1. Params come from the sse4 initializer; CheckAVX is the isa_check, and
// the helper returns early without benchmarking when that check reports false.
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2415   static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2416     GEMMEnd2EndBenchmark(state, model,
2417       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2418       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2419       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2420       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2421       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2422       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2423       benchmark::utils::CheckAVX);
2424   }
2425 
2426 
// End-to-end QS8 benchmark wrapper for the 2x4c2 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. CheckSSE41 is the isa_check; the helper returns early without
// benchmarking when that check reports false.
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2427   static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2428     GEMMEnd2EndBenchmark(state, model,
2429       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2430       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2431       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2432       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2433       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2434       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2435       benchmark::utils::CheckSSE41);
2436   }
// End-to-end QS8 benchmark wrapper for the 2x4c2 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. CheckSSE41 is the isa_check; the helper returns early without
// benchmarking when that check reports false.
qs8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2437   static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2438     GEMMEnd2EndBenchmark(state, model,
2439       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2440       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2441       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2442       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2443       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2444       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2445       benchmark::utils::CheckSSE41);
2446   }
// End-to-end QS8 benchmark wrapper for the 3x4c2 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. CheckSSE41 is the isa_check; the helper returns early without
// benchmarking when that check reports false.
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2447   static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2448     GEMMEnd2EndBenchmark(state, model,
2449       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2450       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2451       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2452       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2453       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2454       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2455       benchmark::utils::CheckSSE41);
2456   }
// End-to-end QS8 benchmark wrapper for the 3x4c2 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. CheckSSE41 is the isa_check; the helper returns early without
// benchmarking when that check reports false.
qs8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2457   static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2458     GEMMEnd2EndBenchmark(state, model,
2459       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2460       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2461       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2462       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2463       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2464       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2465       benchmark::utils::CheckSSE41);
2466   }
// End-to-end QS8 benchmark wrapper for the 4x4c2 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. CheckSSE41 is the isa_check; the helper returns early without
// benchmarking when that check reports false.
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2467   static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2468     GEMMEnd2EndBenchmark(state, model,
2469       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2470       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2471       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2472       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2473       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2474       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2475       benchmark::utils::CheckSSE41);
2476   }
// End-to-end QS8 benchmark wrapper for the 4x4c2 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. CheckSSE41 is the isa_check; the helper returns early without
// benchmarking when that check reports false.
qs8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2477   static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2478     GEMMEnd2EndBenchmark(state, model,
2479       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2480       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2481       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2482       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2483       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2484       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2485       benchmark::utils::CheckSSE41);
2486   }
// End-to-end QS8 benchmark wrapper for the 2x4c2s4 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1; unlike the plain c2 wrappers, log2_sr is 2 here.
// CheckSSE41 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_2x4c2s4__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2487   static void qs8_gemm_2x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2488     GEMMEnd2EndBenchmark(state, model,
2489       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld64,
2490       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld64,
2491       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2492       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2493       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2494       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2495       benchmark::utils::CheckSSE41);
2496   }
// End-to-end QS8 benchmark wrapper for the 2x4c2s4 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1; unlike the plain c2 wrappers, log2_sr is 2 here.
// CheckSSE41 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_2x4c2s4__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2497   static void qs8_gemm_2x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2498     GEMMEnd2EndBenchmark(state, model,
2499       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
2500       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
2501       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2502       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2503       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2504       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2505       benchmark::utils::CheckSSE41);
2506   }
// End-to-end QS8 benchmark wrapper for the 3x4c2s4 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1; unlike the plain c2 wrappers, log2_sr is 2 here.
// CheckSSE41 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_3x4c2s4__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2507   static void qs8_gemm_3x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2508     GEMMEnd2EndBenchmark(state, model,
2509       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld64,
2510       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld64,
2511       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2512       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2513       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2514       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2515       benchmark::utils::CheckSSE41);
2516   }
// End-to-end QS8 benchmark wrapper for the 3x4c2s4 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1; unlike the plain c2 wrappers, log2_sr is 2 here.
// CheckSSE41 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_3x4c2s4__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2517   static void qs8_gemm_3x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2518     GEMMEnd2EndBenchmark(state, model,
2519       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
2520       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
2521       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2522       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2523       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2524       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2525       benchmark::utils::CheckSSE41);
2526   }
// End-to-end QS8 benchmark wrapper for the 4x4c2s4 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1; unlike the plain c2 wrappers, log2_sr is 2 here.
// CheckSSE41 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_4x4c2s4__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2527   static void qs8_gemm_4x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2528     GEMMEnd2EndBenchmark(state, model,
2529       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
2530       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
2531       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2532       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2533       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2534       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2535       benchmark::utils::CheckSSE41);
2536   }
// End-to-end QS8 benchmark wrapper for the 4x4c2s4 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1; unlike the plain c2 wrappers, log2_sr is 2 here.
// CheckSSE41 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_4x4c2s4__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2537   static void qs8_gemm_4x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2538     GEMMEnd2EndBenchmark(state, model,
2539       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld128,
2540       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld128,
2541       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2542       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2543       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2544       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2545       benchmark::utils::CheckSSE41);
2546   }
// End-to-end QS8 benchmark wrapper for the 2x4c8 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). CheckSSE41 is the isa_check; the
// helper returns early without benchmarking when that check reports false.
qs8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2547   static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2548     GEMMEnd2EndBenchmark(state, model,
2549       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2550       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2551       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2552       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2553       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2554       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2555       benchmark::utils::CheckSSE41);
2556   }
// End-to-end QS8 benchmark wrapper for the 2x4c8 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). CheckSSE41 is the isa_check; the
// helper returns early without benchmarking when that check reports false.
qs8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2557   static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2558     GEMMEnd2EndBenchmark(state, model,
2559       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2560       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2561       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2562       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2563       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2564       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2565       benchmark::utils::CheckSSE41);
2566   }
// End-to-end QS8 benchmark wrapper for the 3x4c8 SSE4.1 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). CheckSSE41 is the isa_check; the
// helper returns early without benchmarking when that check reports false.
qs8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2567   static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2568     GEMMEnd2EndBenchmark(state, model,
2569       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2570       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2571       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2572       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2573       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2574       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2575       benchmark::utils::CheckSSE41);
2576   }
// End-to-end QS8 benchmark wrapper for the 3x4c8 SSE4.1 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). CheckSSE41 is the isa_check; the
// helper returns early without benchmarking when that check reports false.
qs8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2577   static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2578     GEMMEnd2EndBenchmark(state, model,
2579       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2580       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2581       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2582       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2583       xnn_init_qs8_conv_minmax_fp32_sse4_params,
2584       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2585       benchmark::utils::CheckSSE41);
2586   }
2587 
2588 
// End-to-end QS8 benchmark wrapper for the 2x4c8 SSSE3 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1. Note the params initializer is the sse2 one, not an ssse3-specific one.
// CheckSSSE3 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_2x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2589   static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2590     GEMMEnd2EndBenchmark(state, model,
2591       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2592       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2593       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2594       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2595       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2596       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2597       benchmark::utils::CheckSSSE3);
2598   }
// End-to-end QS8 benchmark wrapper for the 2x4c8 SSSE3 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1. Note the params initializer is the sse2 one, not an ssse3-specific one.
// CheckSSSE3 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_2x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2599   static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2600     GEMMEnd2EndBenchmark(state, model,
2601       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2602       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2603       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2604       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2605       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2606       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2607       benchmark::utils::CheckSSSE3);
2608   }
// End-to-end QS8 benchmark wrapper for the 3x4c8 SSSE3 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1. Note the params initializer is the sse2 one, not an ssse3-specific one.
// CheckSSSE3 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_3x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2609   static void qs8_gemm_3x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2610     GEMMEnd2EndBenchmark(state, model,
2611       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2612       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2613       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2614       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2615       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2616       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2617       benchmark::utils::CheckSSSE3);
2618   }
// End-to-end QS8 benchmark wrapper for the 3x4c8 SSSE3 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1. Note the params initializer is the sse2 one, not an ssse3-specific one.
// CheckSSSE3 is the isa_check; the helper returns early when that check reports false.
qs8_gemm_3x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2619   static void qs8_gemm_3x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2620     GEMMEnd2EndBenchmark(state, model,
2621       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2622       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2623       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2624       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2625       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2626       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2627       benchmark::utils::CheckSSSE3);
2628   }
2629 
2630 
// End-to-end QS8 benchmark wrapper for the 2x4c2 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. No isa_check argument is supplied, so the helper's default (nullptr)
// applies and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2631   static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2632     GEMMEnd2EndBenchmark(state, model,
2633       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2634       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2635       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2636       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2637       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2638       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2639   }
// End-to-end QS8 benchmark wrapper for the 2x4c2 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. No isa_check argument is supplied, so the helper's default (nullptr)
// applies and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2640   static void qs8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2641     GEMMEnd2EndBenchmark(state, model,
2642       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2643       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2644       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2645       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2646       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2647       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2648   }
// End-to-end QS8 benchmark wrapper for the 3x4c2 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. No isa_check argument is supplied, so the helper's default (nullptr)
// applies and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2649   static void qs8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2650     GEMMEnd2EndBenchmark(state, model,
2651       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2652       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2653       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2654       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2655       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2656       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2657   }
// End-to-end QS8 benchmark wrapper for the 3x4c2 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. No isa_check argument is supplied, so the helper's default (nullptr)
// applies and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2658   static void qs8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2659     GEMMEnd2EndBenchmark(state, model,
2660       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2661       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2662       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2663       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2664       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2665       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2666   }
// End-to-end QS8 benchmark wrapper for the 4x4c2 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. No isa_check argument is supplied, so the helper's default (nullptr)
// applies and the benchmark is not gated on an ISA check.
qs8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2667   static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2668     GEMMEnd2EndBenchmark(state, model,
2669       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2670       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2671       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2672       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2673       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2674       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2675   }
// End-to-end QS8 benchmark wrapper for the 4x4c2 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. No isa_check argument is supplied, so the helper's default (nullptr)
// applies and the benchmark is not gated on an ISA check.
qs8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2676   static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2677     GEMMEnd2EndBenchmark(state, model,
2678       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2679       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2680       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2681       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2682       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2683       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2684   }
// End-to-end QS8 benchmark wrapper for the 2x4c2s4 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1, with log2_sr set to 2. No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c2s4__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2685   static void qs8_gemm_2x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2686     GEMMEnd2EndBenchmark(state, model,
2687       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64,
2688       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64,
2689       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2690       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2691       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2692       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2693   }
// End-to-end QS8 benchmark wrapper for the 2x4c2s4 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1, with log2_sr set to 2. No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c2s4__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2694   static void qs8_gemm_2x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2695     GEMMEnd2EndBenchmark(state, model,
2696       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
2697       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
2698       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2699       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2700       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2701       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2702   }
// End-to-end QS8 benchmark wrapper for the 3x4c2s4 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1, with log2_sr set to 2. No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c2s4__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2703   static void qs8_gemm_3x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2704     GEMMEnd2EndBenchmark(state, model,
2705       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld64,
2706       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld64,
2707       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2708       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2709       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2710       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2711   }
// End-to-end QS8 benchmark wrapper for the 3x4c2s4 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1, with log2_sr set to 2. No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c2s4__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2712   static void qs8_gemm_3x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2713     GEMMEnd2EndBenchmark(state, model,
2714       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
2715       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
2716       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2717       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2718       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2719       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2720   }
// End-to-end QS8 benchmark wrapper for the 4x4c2s4 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1, with log2_sr set to 2. No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_4x4c2s4__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2721   static void qs8_gemm_4x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2722     GEMMEnd2EndBenchmark(state, model,
2723       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld64,
2724       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld64,
2725       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2726       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2727       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2728       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2729   }
// End-to-end QS8 benchmark wrapper for the 4x4c2s4 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2s4 GEMM/IGEMM kernels as gemm/igemm and the
// 1x4c2s4 kernels as gemm1/igemm1, with log2_sr set to 2. No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_4x4c2s4__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2730   static void qs8_gemm_4x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2731     GEMMEnd2EndBenchmark(state, model,
2732       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld128,
2733       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld128,
2734       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2735       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2736       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2737       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2738   }
// End-to-end QS8 benchmark wrapper for the 2x4c8 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2739   static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2740     GEMMEnd2EndBenchmark(state, model,
2741       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2742       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2743       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2744       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2745       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2746       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2747   }
// End-to-end QS8 benchmark wrapper for the 2x4c8 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2748   static void qs8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2749     GEMMEnd2EndBenchmark(state, model,
2750       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2751       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2752       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2753       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2754       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2755       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2756   }
// End-to-end QS8 benchmark wrapper for the 3x4c8 SSE2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2757   static void qs8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2758     GEMMEnd2EndBenchmark(state, model,
2759       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2760       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2761       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2762       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2763       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2764       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2765   }
// End-to-end QS8 benchmark wrapper for the 3x4c8 SSE2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c8 GEMM/IGEMM kernels as gemm/igemm and the 1x4c8
// kernels as gemm1/igemm1 (log2_kr is 3 for the c8 layout). No isa_check argument is supplied, so
// the helper's default (nullptr) applies and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2766   static void qs8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2767     GEMMEnd2EndBenchmark(state, model,
2768       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2769       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2770       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2771       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2772       xnn_init_qs8_conv_minmax_fp32_sse2_params,
2773       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2774   }
2775 
2776 
// Register the AVX512-SKX c8 wrappers (mr = 2, 3, 4; nr = 16) via BENCHMARK_QS8_END2END.
// NOTE(review): the macro and these wrappers are defined outside this excerpt; presumably the
// macro instantiates each wrapper for the set of end-to-end models - confirm against its definition.
2777   BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__avx512skx);
2778   BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__avx512skx);
2779   BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__avx512skx);
2780 
// Register the AVX2 c8 wrappers (mr = 2, 3; nr = 8) via BENCHMARK_QS8_END2END.
// NOTE(review): the macro and these wrappers are defined outside this excerpt; presumably the
// macro instantiates each wrapper for the set of end-to-end models - confirm against its definition.
2781   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__avx2);
2782   BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__avx2);
2783 
// Register the XOP wrappers via BENCHMARK_QS8_END2END, grouped as c2, c2s4, then c8 variants,
// each in ld64/ld128 flavours.
// NOTE(review): the macro and these wrappers are defined outside this excerpt; presumably the
// macro instantiates each wrapper for the set of end-to-end models - confirm against its definition.
2784   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld64);
2785   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld128);
2786   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld64);
2787   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld128);
2788   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld64);
2789   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld128);
2790 
2791   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__xop_ld64);
2792   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__xop_ld128);
2793   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__xop_ld64);
2794   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__xop_ld128);
2795   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__xop_ld64);
2796   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__xop_ld128);
2797 
2798   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld64);
2799   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld128);
2800   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld64);
2801   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld128);
2802 
// Register the AVX wrappers via BENCHMARK_QS8_END2END, grouped as c2, c2s4, then c8 variants,
// each in ld64/ld128 flavours.
// NOTE(review): the macro is defined outside this excerpt; presumably it instantiates each
// wrapper for the set of end-to-end models - confirm against its definition.
2803   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld64);
2804   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld128);
2805   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld64);
2806   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld128);
2807   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld64);
2808   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld128);
2809 
2810   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__avx_ld64);
2811   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__avx_ld128);
2812   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__avx_ld64);
2813   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__avx_ld128);
2814   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__avx_ld64);
2815   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__avx_ld128);
2816 
2817   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld64);
2818   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld128);
2819   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld64);
2820   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld128);
2821 
// Register the SSE4.1 wrappers via BENCHMARK_QS8_END2END, grouped as c2, c2s4, then c8 variants,
// each in ld64/ld128 flavours.
// NOTE(review): the macro is defined outside this excerpt; presumably it instantiates each
// wrapper for the set of end-to-end models - confirm against its definition.
2822   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld64);
2823   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld128);
2824   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld64);
2825   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld128);
2826   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld64);
2827   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld128);
2828 
2829   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse41_ld64);
2830   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse41_ld128);
2831   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse41_ld64);
2832   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse41_ld128);
2833   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse41_ld64);
2834   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse41_ld128);
2835 
2836   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld64);
2837   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld128);
2838   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld64);
2839   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld128);
2840 
// Register the SSSE3 c8 wrappers (mr = 2, 3; ld64/ld128) via BENCHMARK_QS8_END2END.
// NOTE(review): the macro is defined outside this excerpt; presumably it instantiates each
// wrapper for the set of end-to-end models - confirm against its definition.
2841   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld64);
2842   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld128);
2843   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld64);
2844   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld128);
2845 
// Register the SSE2 wrappers via BENCHMARK_QS8_END2END, grouped as c2, c2s4, then c8 variants,
// each in ld64/ld128 flavours.
// NOTE(review): the macro is defined outside this excerpt; presumably it instantiates each
// wrapper for the set of end-to-end models - confirm against its definition.
2846   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld64);
2847   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld128);
2848   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld64);
2849   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld128);
2850   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld64);
2851   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld128);
2852 
2853   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse2_ld64);
2854   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse2_ld128);
2855   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse2_ld64);
2856   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse2_ld128);
2857   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse2_ld64);
2858   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse2_ld128);
2859 
2860   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld64);
2861   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld128);
2862   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld64);
2863   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld128);
2864 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2865 
2866 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// End-to-end QS8 benchmark wrapper for the 2x4c2 WAsm SIMD dot16x2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. log2_sr and isa_check are omitted, so the helper's defaults
// (0 and nullptr) apply and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2867   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2868     GEMMEnd2EndBenchmark(state, model,
2869       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2870       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2871       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2872       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2873       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2874       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2875   }
// End-to-end QS8 benchmark wrapper for the 2x4c2 WAsm SIMD dot16x2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 2x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. log2_sr and isa_check are omitted, so the helper's defaults
// (0 and nullptr) apply and the benchmark is not gated on an ISA check.
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2876   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2877     GEMMEnd2EndBenchmark(state, model,
2878       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2879       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2880       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2881       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2882       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2883       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2884   }
// End-to-end QS8 benchmark wrapper for the 3x4c2 WAsm SIMD dot16x2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. log2_sr and isa_check are omitted, so the helper's defaults
// (0 and nullptr) apply and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2885   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2886     GEMMEnd2EndBenchmark(state, model,
2887       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2888       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2889       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2890       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2891       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2892       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2893   }
// End-to-end QS8 benchmark wrapper for the 3x4c2 WAsm SIMD dot16x2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 3x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. log2_sr and isa_check are omitted, so the helper's defaults
// (0 and nullptr) apply and the benchmark is not gated on an ISA check.
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2894   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2895     GEMMEnd2EndBenchmark(state, model,
2896       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2897       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2898       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2899       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2900       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2901       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2902   }
// End-to-end QS8 benchmark wrapper for the 4x4c2 WAsm SIMD dot16x2 ld64 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. log2_sr and isa_check are omitted, so the helper's defaults
// (0 and nullptr) apply and the benchmark is not gated on an ISA check.
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2903   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2904     GEMMEnd2EndBenchmark(state, model,
2905       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2906       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2907       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2908       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2909       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2910       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2911   }
// End-to-end QS8 benchmark wrapper for the 4x4c2 WAsm SIMD dot16x2 ld128 minmax_fp32 microkernels.
// Forwards to GEMMEnd2EndBenchmark with the 4x4c2 GEMM/IGEMM kernels as gemm/igemm and the 1x4c2
// kernels as gemm1/igemm1. log2_sr and isa_check are omitted, so the helper's defaults
// (0 and nullptr) apply and the benchmark is not gated on an ISA check.
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2912   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2913     GEMMEnd2EndBenchmark(state, model,
2914       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2915       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2916       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2917       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2918       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2919       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2920   }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2921   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2922     GEMMEnd2EndBenchmark(state, model,
2923       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2924       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2925       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2926       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2927       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2928       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2929   }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2930   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2931     GEMMEnd2EndBenchmark(state, model,
2932       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2933       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2934       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2935       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2936       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2937       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2938   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2939   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2940     GEMMEnd2EndBenchmark(state, model,
2941       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2942       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2943       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2944       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2945       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2946       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2947   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2948   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2949     GEMMEnd2EndBenchmark(state, model,
2950       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2951       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2952       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2953       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2954       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2955       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2956   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2957   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2958     GEMMEnd2EndBenchmark(state, model,
2959       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2960       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2961       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2962       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2963       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2964       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2965   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2966   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2967     GEMMEnd2EndBenchmark(state, model,
2968       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2969       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2970       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2971       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2972       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2973       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2974   }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2975   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2976     GEMMEnd2EndBenchmark(state, model,
2977       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2978       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2979       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2980       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2981       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2982       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2983   }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2984   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2985     GEMMEnd2EndBenchmark(state, model,
2986       xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2987       xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2988       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2989       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2990       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2991       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2992   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2993   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2994     GEMMEnd2EndBenchmark(state, model,
2995       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
2996       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
2997       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2998       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2999       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
3000       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
3001   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)3002   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
3003     GEMMEnd2EndBenchmark(state, model,
3004       xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
3005       xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
3006       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
3007       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
3008       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
3009       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
3010   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)3011   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
3012     GEMMEnd2EndBenchmark(state, model,
3013       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
3014       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
3015       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
3016       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
3017       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
3018       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
3019   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)3020   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
3021     GEMMEnd2EndBenchmark(state, model,
3022       xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
3023       xnn_qs8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
3024       xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
3025       xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
3026       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
3027       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
3028   }
3029 
3030   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128)3031   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
3032   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
3033   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
3034   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
3035   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128)
3036 
3037   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
3038   BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
3039   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
3040   BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
3041   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
3042   BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)
3043 
3044   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
3045   BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
3046   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
3047   BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
3048   BENCHMARK_QS8_END2END(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
3049   BENCHMARK_QS8_END2END(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
3050 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3051 
3052 
3053 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3054   static void qs8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3055     GEMMEnd2EndBenchmark(state, model,
3056       xnn_qs8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
3057       xnn_qs8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
3058       xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
3059       xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
3060       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3061       2 /* mr */, 2 /* nr */);
3062   }
qs8_gemm_3x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3063   static void qs8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3064     GEMMEnd2EndBenchmark(state, model,
3065       xnn_qs8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
3066       xnn_qs8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
3067       xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
3068       xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
3069       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3070       3 /* mr */, 2 /* nr */);
3071   }
qs8_gemm_4x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3072   static void qs8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3073     GEMMEnd2EndBenchmark(state, model,
3074       xnn_qs8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
3075       xnn_qs8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
3076       xnn_qs8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
3077       xnn_qs8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
3078       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3079       4 /* mr */, 2 /* nr */);
3080   }
qs8_gemm_2x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3081   static void qs8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3082     GEMMEnd2EndBenchmark(state, model,
3083       xnn_qs8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
3084       xnn_qs8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
3085       xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
3086       xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
3087       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3088       2 /* mr */, 4 /* nr */);
3089   }
qs8_gemm_3x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3090   static void qs8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3091     GEMMEnd2EndBenchmark(state, model,
3092       xnn_qs8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
3093       xnn_qs8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
3094       xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
3095       xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
3096       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3097       3 /* mr */, 4 /* nr */);
3098   }
qs8_gemm_4x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3099   static void qs8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3100     GEMMEnd2EndBenchmark(state, model,
3101       xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
3102       xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
3103       xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
3104       xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
3105       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3106       4 /* mr */, 4 /* nr */);
3107   }
3108 
3109   BENCHMARK_QS8_END2END(qs8_gemm_2x2__wasm_fmagic)
BENCHMARK_QS8_END2END(qs8_gemm_3x2__wasm_fmagic)3110   BENCHMARK_QS8_END2END(qs8_gemm_3x2__wasm_fmagic)
3111   BENCHMARK_QS8_END2END(qs8_gemm_4x2__wasm_fmagic)
3112   BENCHMARK_QS8_END2END(qs8_gemm_2x4__wasm_fmagic)
3113   BENCHMARK_QS8_END2END(qs8_gemm_3x4__wasm_fmagic)
3114   BENCHMARK_QS8_END2END(qs8_gemm_4x4__wasm_fmagic)
3115 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3116 
3117 
3118 static void qs8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3119   GEMMEnd2EndBenchmark(state, model,
3120     xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
3121     xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
3122     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
3123     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
3124     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3125     2 /* mr */, 2 /* nr */);
3126 }
qs8_gemm_3x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3127 static void qs8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3128   GEMMEnd2EndBenchmark(state, model,
3129     xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
3130     xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
3131     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
3132     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
3133     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3134     3 /* mr */, 2 /* nr */);
3135 }
qs8_gemm_4x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3136 static void qs8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3137   GEMMEnd2EndBenchmark(state, model,
3138     xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
3139     xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
3140     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
3141     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
3142     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3143     4 /* mr */, 2 /* nr */);
3144 }
qs8_gemm_2x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3145 static void qs8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3146   GEMMEnd2EndBenchmark(state, model,
3147     xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
3148     xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
3149     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
3150     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
3151     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3152     2 /* mr */, 4 /* nr */);
3153 }
qs8_gemm_3x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3154 static void qs8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3155   GEMMEnd2EndBenchmark(state, model,
3156     xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
3157     xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
3158     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
3159     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
3160     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3161     3 /* mr */, 4 /* nr */);
3162 }
qs8_gemm_4x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)3163 static void qs8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3164   GEMMEnd2EndBenchmark(state, model,
3165     xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
3166     xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
3167     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
3168     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
3169     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
3170     4 /* mr */, 4 /* nr */);
3171 }
3172 
qs8_gemm_2x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)3173 static void qs8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3174   GEMMEnd2EndBenchmark(state, model,
3175     xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
3176     xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic,
3177     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
3178     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
3179     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
3180     2 /* mr */, 2 /* nr */);
3181 }
qs8_gemm_3x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)3182 static void qs8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3183   GEMMEnd2EndBenchmark(state, model,
3184     xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
3185     xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic,
3186     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
3187     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
3188     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
3189     3 /* mr */, 2 /* nr */);
3190 }
qs8_gemm_4x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)3191 static void qs8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3192   GEMMEnd2EndBenchmark(state, model,
3193     xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
3194     xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic,
3195     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
3196     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
3197     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
3198     4 /* mr */, 2 /* nr */);
3199 }
qs8_gemm_2x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)3200 static void qs8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3201   GEMMEnd2EndBenchmark(state, model,
3202     xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
3203     xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic,
3204     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
3205     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
3206     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
3207     2 /* mr */, 4 /* nr */);
3208 }
qs8_gemm_3x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)3209 static void qs8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3210   GEMMEnd2EndBenchmark(state, model,
3211     xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
3212     xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic,
3213     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
3214     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
3215     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
3216     3 /* mr */, 4 /* nr */);
3217 }
qs8_gemm_4x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)3218 static void qs8_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
3219   GEMMEnd2EndBenchmark(state, model,
3220     xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
3221     xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_imagic,
3222     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
3223     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
3224     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
3225     4 /* mr */, 4 /* nr */);
3226 }
3227 
qs8_gemm_2x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)3228 static void qs8_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
3229   GEMMEnd2EndBenchmark(state, model,
3230     xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
3231     xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
3232     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
3233     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
3234     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
3235     2 /* mr */, 2 /* nr */);
3236 }
qs8_gemm_3x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)3237 static void qs8_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
3238   GEMMEnd2EndBenchmark(state, model,
3239     xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
3240     xnn_qs8_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
3241     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
3242     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
3243     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
3244     3 /* mr */, 2 /* nr */);
3245 }
qs8_gemm_4x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)3246 static void qs8_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
3247   GEMMEnd2EndBenchmark(state, model,
3248     xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
3249     xnn_qs8_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
3250     xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
3251     xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
3252     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
3253     4 /* mr */, 2 /* nr */);
3254 }
qs8_gemm_2x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)3255 static void qs8_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
3256   GEMMEnd2EndBenchmark(state, model,
3257     xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
3258     xnn_qs8_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
3259     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
3260     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
3261     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
3262     2 /* mr */, 4 /* nr */);
3263 }
qs8_gemm_3x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)3264 static void qs8_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
3265   GEMMEnd2EndBenchmark(state, model,
3266     xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
3267     xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
3268     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
3269     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
3270     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
3271     3 /* mr */, 4 /* nr */);
3272 }
qs8_gemm_4x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)3273 static void qs8_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
3274   GEMMEnd2EndBenchmark(state, model,
3275     xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
3276     xnn_qs8_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
3277     xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
3278     xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
3279     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
3280     4 /* mr */, 4 /* nr */);
3281 }
3282 
3283 BENCHMARK_QS8_END2END(qs8_gemm_2x2__scalar_fmagic)
3284 BENCHMARK_QS8_END2END(qs8_gemm_3x2__scalar_fmagic)
3285 BENCHMARK_QS8_END2END(qs8_gemm_4x2__scalar_fmagic)
3286 BENCHMARK_QS8_END2END(qs8_gemm_2x4__scalar_fmagic)
3287 BENCHMARK_QS8_END2END(qs8_gemm_3x4__scalar_fmagic)
3288 BENCHMARK_QS8_END2END(qs8_gemm_4x4__scalar_fmagic)
3289 
3290 BENCHMARK_QS8_END2END(qs8_gemm_2x2__scalar_imagic)
3291 BENCHMARK_QS8_END2END(qs8_gemm_3x2__scalar_imagic)
3292 BENCHMARK_QS8_END2END(qs8_gemm_4x2__scalar_imagic)
3293 BENCHMARK_QS8_END2END(qs8_gemm_2x4__scalar_imagic)
3294 BENCHMARK_QS8_END2END(qs8_gemm_3x4__scalar_imagic)
3295 BENCHMARK_QS8_END2END(qs8_gemm_4x4__scalar_imagic)
3296 
3297 BENCHMARK_QS8_END2END(qs8_gemm_2x2__scalar_lrintf)
3298 BENCHMARK_QS8_END2END(qs8_gemm_3x2__scalar_lrintf)
3299 BENCHMARK_QS8_END2END(qs8_gemm_4x2__scalar_lrintf)
3300 BENCHMARK_QS8_END2END(qs8_gemm_2x4__scalar_lrintf)
3301 BENCHMARK_QS8_END2END(qs8_gemm_3x4__scalar_lrintf)
3302 BENCHMARK_QS8_END2END(qs8_gemm_4x4__scalar_lrintf)
3303 
3304 #ifndef XNNPACK_BENCHMARK_NO_MAIN
3305 BENCHMARK_MAIN();
3306 #endif
3307