xref: /aosp_15_r20/external/XNNPACK/bench/qu8-gemm-e2e.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <cstring>
9 #include <functional>
10 #include <random>
11 #include <vector>
12 
13 #include <xnnpack.h>
14 
15 #include <benchmark/benchmark.h>
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 
20 #include <xnnpack.h>
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/microfnptr.h>
24 #include <xnnpack/microparams-init.h>
25 
26 
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qu8_gemm_minmax_ukernel_function gemm,xnn_qu8_igemm_minmax_ukernel_function igemm,xnn_qu8_gemm_minmax_ukernel_function gemm1,xnn_qu8_igemm_minmax_ukernel_function igemm1,xnn_init_qu8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)27 static void GEMMEnd2EndBenchmark(
28   benchmark::State& state,
29   models::ExecutionPlanFactory model_factory,
30   xnn_qu8_gemm_minmax_ukernel_function gemm,
31   xnn_qu8_igemm_minmax_ukernel_function igemm,
32   xnn_qu8_gemm_minmax_ukernel_function gemm1,
33   xnn_qu8_igemm_minmax_ukernel_function igemm1,
34   xnn_init_qu8_conv_minmax_params_fn init_params,
35   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
36   benchmark::utils::IsaCheckFunction isa_check = nullptr)
37 {
38   if (isa_check && !isa_check(state)) {
39     return;
40   }
41   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
42     state.SkipWithError("failed to initialize XNNPACK");
43     return;
44   }
45 
46   // Override microkernels chosen in xnn_initialize
47   // Note: do not directly assign to xnn_params.qu8.gemm because it breaks older gcc.
48   std::memset(&xnn_params.qu8.gemm, 0, sizeof(xnn_params.qu8.gemm));
49   xnn_params.qu8.gemm.minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
50   xnn_params.qu8.gemm.minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
51   xnn_params.qu8.gemm.minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
52   xnn_params.qu8.gemm.minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
53   xnn_params.qu8.gemm.init.qu8 = init_params;
54   xnn_params.qu8.gemm.mr = mr;
55   xnn_params.qu8.gemm.nr = nr;
56   xnn_params.qu8.gemm.log2_kr = log2_kr;
57   xnn_params.qu8.gemm.log2_sr = log2_sr;
58 
59   auto execution_plan = model_factory(nullptr);
60   if (execution_plan.empty()) {
61     state.SkipWithError("failed to create a model");
62     return;
63   }
64 
65   for (auto _ : state) {
66     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
67       xnn_status status = xnn_run_operator(op.get(), nullptr);
68       if (status != xnn_status_success) {
69         state.SkipWithError("failed to run a model");
70         return;
71       }
72     }
73   }
74 
75   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
76   if (cpu_frequency != 0) {
77     state.counters["cpufreq"] = cpu_frequency;
78   }
79 }
80 
81 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)82   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
83     GEMMEnd2EndBenchmark(state, model,
84       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
85       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
86       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
87       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
88       xnn_init_qu8_conv_minmax_rndnu_neon_params,
89       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
90       benchmark::utils::CheckNEON);
91   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)92   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
93     GEMMEnd2EndBenchmark(state, model,
94       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
95       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
96       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
97       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
98       xnn_init_qu8_conv_minmax_rndnu_neon_params,
99       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
100       benchmark::utils::CheckNEON);
101   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)102   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
103     GEMMEnd2EndBenchmark(state, model,
104       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
105       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
106       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
107       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
108       xnn_init_qu8_conv_minmax_rndnu_neon_params,
109       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
110       benchmark::utils::CheckNEON);
111   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)112   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
113     GEMMEnd2EndBenchmark(state, model,
114       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
115       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
116       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
117       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
118       xnn_init_qu8_conv_minmax_rndnu_neon_params,
119       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
120       benchmark::utils::CheckNEON);
121   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)122   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
123     GEMMEnd2EndBenchmark(state, model,
124       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
125       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
126       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
127       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
128       xnn_init_qu8_conv_minmax_rndnu_neon_params,
129       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
130       benchmark::utils::CheckNEON);
131   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)132   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
133     GEMMEnd2EndBenchmark(state, model,
134       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
135       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
136       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
137       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
138       xnn_init_qu8_conv_minmax_rndnu_neon_params,
139       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
140       benchmark::utils::CheckNEON);
141   }
142   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)143   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
144   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
145   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
146   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
147   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
148 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
149 
150 
151 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
152   static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
153     GEMMEnd2EndBenchmark(state, model,
154       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
155       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
156       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
157       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
158       xnn_init_qu8_conv_minmax_rndnu_neon_params,
159       4 /* mr */, 16  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
160       benchmark::utils::CheckNEONDOT);
161   }
qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)162   static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
163     GEMMEnd2EndBenchmark(state, model,
164       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
165       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
166       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
167       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
168       xnn_init_qu8_conv_minmax_rndnu_neon_params,
169       4 /* mr */, 16  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
170       benchmark::utils::CheckNEONDOT);
171   }
qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)172   static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
173     GEMMEnd2EndBenchmark(state, model,
174       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
175       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
176       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
177       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
178       xnn_init_qu8_conv_minmax_rndnu_neon_params,
179       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
180       benchmark::utils::CheckNEONDOT);
181   }
qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)182   static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
183     GEMMEnd2EndBenchmark(state, model,
184       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
185       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
186       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
187       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
188       xnn_init_qu8_conv_minmax_rndnu_neon_params,
189       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
190       benchmark::utils::CheckNEONDOT);
191   }
192 
193   BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55);
194   BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55);
195   BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_ld128);
196   BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_ld128);
197 #endif  // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
198 
199 
200 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)201   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
202     GEMMEnd2EndBenchmark(state, model,
203       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
204       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
205       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
206       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
207       xnn_init_qu8_conv_minmax_rndnu_neon_params,
208       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
209       benchmark::utils::CheckNEON);
210   }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)211   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
212     GEMMEnd2EndBenchmark(state, model,
213       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
214       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
215       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
216       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
217       xnn_init_qu8_conv_minmax_rndnu_neon_params,
218       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
219       benchmark::utils::CheckNEON);
220   }
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)221   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
222     GEMMEnd2EndBenchmark(state, model,
223       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
224       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
225       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
226       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
227       xnn_init_qu8_conv_minmax_rndnu_neon_params,
228       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
229       benchmark::utils::CheckNEON);
230   }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)231   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
232     GEMMEnd2EndBenchmark(state, model,
233       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
234       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
235       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
236       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
237       xnn_init_qu8_conv_minmax_rndnu_neon_params,
238       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
239       benchmark::utils::CheckNEON);
240   }
qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)241   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
242     GEMMEnd2EndBenchmark(state, model,
243       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
244       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
245       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
246       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
247       xnn_init_qu8_conv_minmax_rndnu_neon_params,
248       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
249       benchmark::utils::CheckNEON);
250   }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)251   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
252     GEMMEnd2EndBenchmark(state, model,
253       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
254       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
255       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
256       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
257       xnn_init_qu8_conv_minmax_rndnu_neon_params,
258       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
259       benchmark::utils::CheckNEON);
260   }
261 
262   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75);
263   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
264   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53);
265   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
266   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64);
267   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64);
268 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
269 
270 
271 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
qu8_gemm_1x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)272   static void qu8_gemm_1x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
273     GEMMEnd2EndBenchmark(state, model,
274       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
275       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
276       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
277       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
278       xnn_init_qu8_conv_minmax_rndnu_neon_params,
279       1 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
280       benchmark::utils::CheckNEONDOT);
281   }
qu8_gemm_2x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)282   static void qu8_gemm_2x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
283     GEMMEnd2EndBenchmark(state, model,
284       xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
285       xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c4__neondot,
286       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
287       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
288       xnn_init_qu8_conv_minmax_rndnu_neon_params,
289       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
290       benchmark::utils::CheckNEONDOT);
291   }
qu8_gemm_3x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)292   static void qu8_gemm_3x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
293     GEMMEnd2EndBenchmark(state, model,
294       xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
295       xnn_qu8_igemm_minmax_rndnu_ukernel_3x8c4__neondot,
296       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
297       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
298       xnn_init_qu8_conv_minmax_rndnu_neon_params,
299       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
300       benchmark::utils::CheckNEONDOT);
301   }
qu8_gemm_4x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)302   static void qu8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
303     GEMMEnd2EndBenchmark(state, model,
304       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
305       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
306       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
307       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
308       xnn_init_qu8_conv_minmax_rndnu_neon_params,
309       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
310       benchmark::utils::CheckNEONDOT);
311   }
qu8_gemm_5x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)312   static void qu8_gemm_5x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
313     GEMMEnd2EndBenchmark(state, model,
314       xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
315       xnn_qu8_igemm_minmax_rndnu_ukernel_5x8c4__neondot,
316       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
317       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
318       xnn_init_qu8_conv_minmax_rndnu_neon_params,
319       5 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
320       benchmark::utils::CheckNEONDOT);
321   }
qu8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)322   static void qu8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
323     GEMMEnd2EndBenchmark(state, model,
324       xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
325       xnn_qu8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
326       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
327       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
328       xnn_init_qu8_conv_minmax_rndnu_neon_params,
329       6 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
330       benchmark::utils::CheckNEONDOT);
331   }
qu8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)332   static void qu8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
333     GEMMEnd2EndBenchmark(state, model,
334       xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
335       xnn_qu8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
336       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
337       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
338       xnn_init_qu8_conv_minmax_rndnu_neon_params,
339       8 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
340       benchmark::utils::CheckNEONDOT);
341   }
qu8_gemm_1x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)342   static void qu8_gemm_1x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
343     GEMMEnd2EndBenchmark(state, model,
344       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
345       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
346       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
347       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
348       xnn_init_qu8_conv_minmax_rndnu_neon_params,
349       1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
350       benchmark::utils::CheckNEONDOT);
351   }
qu8_gemm_2x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)352   static void qu8_gemm_2x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
353     GEMMEnd2EndBenchmark(state, model,
354       xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
355       xnn_qu8_igemm_minmax_rndnu_ukernel_2x16c4__neondot,
356       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
357       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
358       xnn_init_qu8_conv_minmax_rndnu_neon_params,
359       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
360       benchmark::utils::CheckNEONDOT);
361   }
qu8_gemm_3x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)362   static void qu8_gemm_3x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
363     GEMMEnd2EndBenchmark(state, model,
364       xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
365       xnn_qu8_igemm_minmax_rndnu_ukernel_3x16c4__neondot,
366       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
367       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
368       xnn_init_qu8_conv_minmax_rndnu_neon_params,
369       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
370       benchmark::utils::CheckNEONDOT);
371   }
qu8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)372   static void qu8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
373     GEMMEnd2EndBenchmark(state, model,
374       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
375       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
376       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
377       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
378       xnn_init_qu8_conv_minmax_rndnu_neon_params,
379       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
380       benchmark::utils::CheckNEONDOT);
381   }
qu8_gemm_5x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)382   static void qu8_gemm_5x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
383     GEMMEnd2EndBenchmark(state, model,
384       xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
385       xnn_qu8_igemm_minmax_rndnu_ukernel_5x16c4__neondot,
386       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
387       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
388       xnn_init_qu8_conv_minmax_rndnu_neon_params,
389       5 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
390       benchmark::utils::CheckNEONDOT);
391   }
qu8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)392   static void qu8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
393     GEMMEnd2EndBenchmark(state, model,
394       xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
395       xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
396       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
397       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
398       xnn_init_qu8_conv_minmax_rndnu_neon_params,
399       6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
400       benchmark::utils::CheckNEONDOT);
401   }
qu8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)402   static void qu8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
403     GEMMEnd2EndBenchmark(state, model,
404       xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
405       xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
406       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
407       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
408       xnn_init_qu8_conv_minmax_rndnu_neon_params,
409       8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
410       benchmark::utils::CheckNEONDOT);
411   }
qu8_gemm_2x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)412   static void qu8_gemm_2x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
413     GEMMEnd2EndBenchmark(state, model,
414       xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
415       xnn_qu8_igemm_minmax_rndnu_ukernel_2x32c4__neondot,
416       xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
417       xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
418       xnn_init_qu8_conv_minmax_rndnu_neon_params,
419       2 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
420       benchmark::utils::CheckNEONDOT);
421   }
qu8_gemm_3x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)422   static void qu8_gemm_3x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
423     GEMMEnd2EndBenchmark(state, model,
424       xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
425       xnn_qu8_igemm_minmax_rndnu_ukernel_3x32c4__neondot,
426       xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
427       xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
428       xnn_init_qu8_conv_minmax_rndnu_neon_params,
429       3 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
430       benchmark::utils::CheckNEONDOT);
431   }
432 
433   BENCHMARK_QU8_END2END(qu8_gemm_1x8c4__neondot);
434   BENCHMARK_QU8_END2END(qu8_gemm_2x8c4__neondot);
435   BENCHMARK_QU8_END2END(qu8_gemm_3x8c4__neondot);
436   BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__neondot);
437   BENCHMARK_QU8_END2END(qu8_gemm_5x8c4__neondot);
438   BENCHMARK_QU8_END2END(qu8_gemm_6x8c4__neondot);
439   BENCHMARK_QU8_END2END(qu8_gemm_8x8c4__neondot);
440   BENCHMARK_QU8_END2END(qu8_gemm_1x16c4__neondot);
441   BENCHMARK_QU8_END2END(qu8_gemm_2x16c4__neondot);
442   BENCHMARK_QU8_END2END(qu8_gemm_3x16c4__neondot);
443   BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__neondot);
444   BENCHMARK_QU8_END2END(qu8_gemm_5x16c4__neondot);
445   BENCHMARK_QU8_END2END(qu8_gemm_6x16c4__neondot);
446   BENCHMARK_QU8_END2END(qu8_gemm_8x16c4__neondot);
447   BENCHMARK_QU8_END2END(qu8_gemm_2x32c4__neondot);
448   BENCHMARK_QU8_END2END(qu8_gemm_3x32c4__neondot);
449 #endif  // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
450 
451 
452 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qu8_gemm_2x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)453   static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
454     GEMMEnd2EndBenchmark(state, model,
455       xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
456       xnn_qu8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
457       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
458       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
459       xnn_init_qu8_conv_minmax_rndnu_neon_params,
460       2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
461       benchmark::utils::CheckNEON);
462   }
qu8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)463   static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
464     GEMMEnd2EndBenchmark(state, model,
465       xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
466       xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
467       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
468       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
469       xnn_init_qu8_conv_minmax_rndnu_neon_params,
470       3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
471       benchmark::utils::CheckNEON);
472   }
qu8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)473   static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
474     GEMMEnd2EndBenchmark(state, model,
475       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
476       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
477       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
478       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
479       xnn_init_qu8_conv_minmax_rndnu_neon_params,
480       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
481       benchmark::utils::CheckNEON);
482   }
qu8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)483   static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
484     GEMMEnd2EndBenchmark(state, model,
485       xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
486       xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
487       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
488       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
489       xnn_init_qu8_conv_minmax_rndnu_neon_params,
490       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
491       benchmark::utils::CheckNEON);
492   }
qu8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)493   static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
494     GEMMEnd2EndBenchmark(state, model,
495       xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
496       xnn_qu8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
497       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
498       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
499       xnn_init_qu8_conv_minmax_rndnu_neon_params,
500       2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
501       benchmark::utils::CheckNEON);
502   }
qu8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)503   static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
504     GEMMEnd2EndBenchmark(state, model,
505       xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
506       xnn_qu8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
507       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
508       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
509       xnn_init_qu8_conv_minmax_rndnu_neon_params,
510       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
511       benchmark::utils::CheckNEON);
512   }
qu8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)513   static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
514     GEMMEnd2EndBenchmark(state, model,
515       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
516       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
517       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
518       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
519       xnn_init_qu8_conv_minmax_rndnu_neon_params,
520       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
521       benchmark::utils::CheckNEON);
522   }
qu8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)523   static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
524     GEMMEnd2EndBenchmark(state, model,
525       xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
526       xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
527       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
528       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
529       xnn_init_qu8_conv_minmax_rndnu_neon_params,
530       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
531       benchmark::utils::CheckNEON);
532   }
533 
// Pass each NEON mlal_lane wrapper to BENCHMARK_QU8_END2END. The macro is
// defined outside this excerpt; presumably it registers the wrapper against
// the end-to-end models -- confirm at its definition.
534   BENCHMARK_QU8_END2END(qu8_gemm_2x8__neon_mlal_lane);
535   BENCHMARK_QU8_END2END(qu8_gemm_3x8__neon_mlal_lane);
536   BENCHMARK_QU8_END2END(qu8_gemm_4x8__neon_mlal_lane);
537   BENCHMARK_QU8_END2END(qu8_gemm_6x8__neon_mlal_lane);
538   BENCHMARK_QU8_END2END(qu8_gemm_2x16__neon_mlal_lane);
539   BENCHMARK_QU8_END2END(qu8_gemm_3x16__neon_mlal_lane);
540   BENCHMARK_QU8_END2END(qu8_gemm_4x16__neon_mlal_lane);
541   BENCHMARK_QU8_END2END(qu8_gemm_6x16__neon_mlal_lane);
542 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
543 
544 
545 #if XNN_ARCH_ARM
qu8_gemm_1x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)546   static void qu8_gemm_1x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
547     GEMMEnd2EndBenchmark(state, model,
548       xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
549       xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
550       xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
551       xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
552       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
553       1 /* mr */, 1  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
554       benchmark::utils::CheckARMV6);
555   }
qu8_gemm_2x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)556   static void qu8_gemm_2x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
557     GEMMEnd2EndBenchmark(state, model,
558       xnn_qu8_gemm_minmax_fp32_ukernel_2x1c4__armsimd32,
559       xnn_qu8_igemm_minmax_fp32_ukernel_2x1c4__armsimd32,
560       xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
561       xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
562       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
563       2 /* mr */, 1  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
564       benchmark::utils::CheckARMV6);
565   }
qu8_gemm_1x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)566   static void qu8_gemm_1x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
567     GEMMEnd2EndBenchmark(state, model,
568       xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
569       xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
570       xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
571       xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
572       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
573       1 /* mr */, 2  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
574       benchmark::utils::CheckARMV6);
575   }
qu8_gemm_2x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)576   static void qu8_gemm_2x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
577     GEMMEnd2EndBenchmark(state, model,
578       xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32,
579       xnn_qu8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32,
580       xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
581       xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
582       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
583       2 /* mr */, 2  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
584       benchmark::utils::CheckARMV6);
585   }
586 
// Pass each armsimd32 wrapper to BENCHMARK_QU8_END2END. The macro is defined
// outside this excerpt; presumably it registers the wrapper against the
// end-to-end models -- confirm at its definition.
587   BENCHMARK_QU8_END2END(qu8_gemm_1x1c4__armsimd32);
588   BENCHMARK_QU8_END2END(qu8_gemm_2x1c4__armsimd32);
589   BENCHMARK_QU8_END2END(qu8_gemm_1x2c4__armsimd32);
590   BENCHMARK_QU8_END2END(qu8_gemm_2x2c4__armsimd32);
591 #endif  // XNN_ARCH_ARM
592 
593 
594 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qu8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)595   static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
596     GEMMEnd2EndBenchmark(state, model,
597       xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
598       xnn_qu8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
599       xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
600       xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
601       xnn_init_qu8_conv_minmax_fp32_avx512_params,
602       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
603       benchmark::utils::CheckAVX512F);
604   }
605 
qu8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)606   static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
607     GEMMEnd2EndBenchmark(state, model,
608       xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
609       xnn_qu8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
610       xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
611       xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
612       xnn_init_qu8_conv_minmax_fp32_avx512_params,
613       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
614       benchmark::utils::CheckAVX512F);
615   }
616 
qu8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)617   static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
618     GEMMEnd2EndBenchmark(state, model,
619       xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
620       xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
621       xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
622       xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
623       xnn_init_qu8_conv_minmax_fp32_avx512_params,
624       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
625       benchmark::utils::CheckAVX512F);
626   }
627 
qu8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)628   static void qu8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
629     GEMMEnd2EndBenchmark(state, model,
630       xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
631       xnn_qu8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
632       xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
633       xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
634       xnn_init_qu8_conv_minmax_fp32_avx2_params,
635       2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
636       benchmark::utils::CheckAVX2);
637   }
qu8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)638   static void qu8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
639     GEMMEnd2EndBenchmark(state, model,
640       xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
641       xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
642       xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
643       xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
644       xnn_init_qu8_conv_minmax_fp32_avx2_params,
645       3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
646       benchmark::utils::CheckAVX2);
647   }
648 
qu8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)649   static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
650     GEMMEnd2EndBenchmark(state, model,
651       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
652       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
653       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
654       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
655       xnn_init_qu8_conv_minmax_fp32_sse2_params,
656       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
657       benchmark::utils::CheckXOP);
658   }
qu8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)659   static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
660     GEMMEnd2EndBenchmark(state, model,
661       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
662       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
663       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
664       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
665       xnn_init_qu8_conv_minmax_fp32_sse2_params,
666       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
667       benchmark::utils::CheckXOP);
668   }
qu8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)669   static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
670     GEMMEnd2EndBenchmark(state, model,
671       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
672       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
673       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
674       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
675       xnn_init_qu8_conv_minmax_fp32_sse2_params,
676       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
677       benchmark::utils::CheckXOP);
678   }
qu8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)679   static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
680     GEMMEnd2EndBenchmark(state, model,
681       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
682       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
683       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
684       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
685       xnn_init_qu8_conv_minmax_fp32_sse2_params,
686       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
687       benchmark::utils::CheckXOP);
688   }
qu8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)689   static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
690     GEMMEnd2EndBenchmark(state, model,
691       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
692       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
693       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
694       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
695       xnn_init_qu8_conv_minmax_fp32_sse2_params,
696       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
697       benchmark::utils::CheckXOP);
698   }
qu8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)699   static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
700     GEMMEnd2EndBenchmark(state, model,
701       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
702       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
703       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
704       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
705       xnn_init_qu8_conv_minmax_fp32_sse2_params,
706       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
707       benchmark::utils::CheckXOP);
708   }
709 
qu8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)710   static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
711     GEMMEnd2EndBenchmark(state, model,
712       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
713       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
714       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
715       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
716       xnn_init_qu8_conv_minmax_fp32_sse2_params,
717       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
718       benchmark::utils::CheckXOP);
719   }
qu8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)720   static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
721     GEMMEnd2EndBenchmark(state, model,
722       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
723       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
724       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
725       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
726       xnn_init_qu8_conv_minmax_fp32_sse2_params,
727       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
728       benchmark::utils::CheckXOP);
729   }
730 
qu8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)731   static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
732     GEMMEnd2EndBenchmark(state, model,
733       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
734       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
735       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
736       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
737       xnn_init_qu8_conv_minmax_fp32_sse2_params,
738       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
739       benchmark::utils::CheckXOP);
740   }
qu8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)741   static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
742     GEMMEnd2EndBenchmark(state, model,
743       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
744       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
745       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
746       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
747       xnn_init_qu8_conv_minmax_fp32_sse2_params,
748       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
749       benchmark::utils::CheckXOP);
750   }
751 
qu8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)752   static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
753     GEMMEnd2EndBenchmark(state, model,
754       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
755       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
756       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
757       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
758       xnn_init_qu8_conv_minmax_fp32_sse2_params,
759       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
760       benchmark::utils::CheckAVX);
761   }
qu8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)762   static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
763     GEMMEnd2EndBenchmark(state, model,
764       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
765       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
766       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
767       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
768       xnn_init_qu8_conv_minmax_fp32_sse2_params,
769       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
770       benchmark::utils::CheckAVX);
771   }
qu8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)772   static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
773     GEMMEnd2EndBenchmark(state, model,
774       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
775       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
776       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
777       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
778       xnn_init_qu8_conv_minmax_fp32_sse2_params,
779       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
780       benchmark::utils::CheckAVX);
781   }
qu8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)782   static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
783     GEMMEnd2EndBenchmark(state, model,
784       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
785       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
786       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
787       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
788       xnn_init_qu8_conv_minmax_fp32_sse2_params,
789       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
790       benchmark::utils::CheckAVX);
791   }
qu8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)792   static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
793     GEMMEnd2EndBenchmark(state, model,
794       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
795       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
796       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
797       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
798       xnn_init_qu8_conv_minmax_fp32_sse2_params,
799       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
800       benchmark::utils::CheckAVX);
801   }
qu8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)802   static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
803     GEMMEnd2EndBenchmark(state, model,
804       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
805       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
806       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
807       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
808       xnn_init_qu8_conv_minmax_fp32_sse2_params,
809       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
810       benchmark::utils::CheckAVX);
811   }
812 
813 
qu8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)814   static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
815     GEMMEnd2EndBenchmark(state, model,
816       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
817       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
818       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
819       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
820       xnn_init_qu8_conv_minmax_fp32_sse2_params,
821       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
822       benchmark::utils::CheckAVX);
823   }
qu8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)824   static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
825     GEMMEnd2EndBenchmark(state, model,
826       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
827       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
828       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
829       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
830       xnn_init_qu8_conv_minmax_fp32_sse2_params,
831       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
832       benchmark::utils::CheckAVX);
833   }
qu8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)834   static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
835     GEMMEnd2EndBenchmark(state, model,
836       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
837       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
838       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
839       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
840       xnn_init_qu8_conv_minmax_fp32_sse2_params,
841       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
842       benchmark::utils::CheckAVX);
843   }
qu8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)844   static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
845     GEMMEnd2EndBenchmark(state, model,
846       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
847       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
848       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
849       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
850       xnn_init_qu8_conv_minmax_fp32_sse2_params,
851       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
852       benchmark::utils::CheckAVX);
853   }
854 
qu8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)855   static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
856     GEMMEnd2EndBenchmark(state, model,
857       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
858       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
859       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
860       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
861       xnn_init_qu8_conv_minmax_fp32_sse2_params,
862       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
863       benchmark::utils::CheckSSE41);
864   }
qu8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)865   static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
866     GEMMEnd2EndBenchmark(state, model,
867       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
868       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
869       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
870       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
871       xnn_init_qu8_conv_minmax_fp32_sse2_params,
872       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
873       benchmark::utils::CheckSSE41);
874   }
qu8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)875   static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
876     GEMMEnd2EndBenchmark(state, model,
877       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
878       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
879       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
880       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
881       xnn_init_qu8_conv_minmax_fp32_sse2_params,
882       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
883       benchmark::utils::CheckSSE41);
884   }
qu8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)885   static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
886     GEMMEnd2EndBenchmark(state, model,
887       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
888       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
889       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
890       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
891       xnn_init_qu8_conv_minmax_fp32_sse2_params,
892       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
893       benchmark::utils::CheckSSE41);
894   }
qu8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)895   static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
896     GEMMEnd2EndBenchmark(state, model,
897       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
898       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
899       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
900       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
901       xnn_init_qu8_conv_minmax_fp32_sse2_params,
902       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
903       benchmark::utils::CheckSSE41);
904   }
qu8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)905   static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
906     GEMMEnd2EndBenchmark(state, model,
907       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
908       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
909       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
910       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
911       xnn_init_qu8_conv_minmax_fp32_sse2_params,
912       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
913       benchmark::utils::CheckSSE41);
914   }
915 
qu8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)916   static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
917     GEMMEnd2EndBenchmark(state, model,
918       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
919       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
920       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
921       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
922       xnn_init_qu8_conv_minmax_fp32_sse2_params,
923       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
924       benchmark::utils::CheckSSE41);
925   }
qu8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)926   static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
927     GEMMEnd2EndBenchmark(state, model,
928       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
929       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
930       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
931       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
932       xnn_init_qu8_conv_minmax_fp32_sse2_params,
933       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
934       benchmark::utils::CheckSSE41);
935   }
qu8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)936   static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
937     GEMMEnd2EndBenchmark(state, model,
938       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
939       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
940       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
941       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
942       xnn_init_qu8_conv_minmax_fp32_sse2_params,
943       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
944       benchmark::utils::CheckSSE41);
945   }
qu8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)946   static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
947     GEMMEnd2EndBenchmark(state, model,
948       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
949       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
950       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
951       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
952       xnn_init_qu8_conv_minmax_fp32_sse2_params,
953       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
954       benchmark::utils::CheckSSE41);
955   }
956 
qu8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)957   static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
958     GEMMEnd2EndBenchmark(state, model,
959       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
960       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
961       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
962       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
963       xnn_init_qu8_conv_minmax_fp32_sse2_params,
964       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
965   }
qu8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)966   static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
967     GEMMEnd2EndBenchmark(state, model,
968       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
969       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
970       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
971       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
972       xnn_init_qu8_conv_minmax_fp32_sse2_params,
973       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
974   }
qu8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)975   static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
976     GEMMEnd2EndBenchmark(state, model,
977       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
978       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
979       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
980       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
981       xnn_init_qu8_conv_minmax_fp32_sse2_params,
982       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
983   }
qu8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)984   static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
985     GEMMEnd2EndBenchmark(state, model,
986       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
987       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
988       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
989       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
990       xnn_init_qu8_conv_minmax_fp32_sse2_params,
991       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
992   }
qu8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)993   static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
994     GEMMEnd2EndBenchmark(state, model,
995       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
996       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
997       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
998       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
999       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1000       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
1001   }
qu8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1002   static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1003     GEMMEnd2EndBenchmark(state, model,
1004       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
1005       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
1006       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
1007       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
1008       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1009       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
1010   }
1011 
qu8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1012   static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1013     GEMMEnd2EndBenchmark(state, model,
1014       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1015       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1016       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1017       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1018       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1019       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1020   }
qu8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1021   static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1022     GEMMEnd2EndBenchmark(state, model,
1023       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1024       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1025       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1026       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1027       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1028       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1029   }
qu8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1030   static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1031     GEMMEnd2EndBenchmark(state, model,
1032       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1033       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1034       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1035       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1036       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1037       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1038   }
qu8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1039   static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1040     GEMMEnd2EndBenchmark(state, model,
1041       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1042       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1043       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1044       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1045       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1046       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1047   }
1048 
1049 
1050   BENCHMARK_QU8_END2END(qu8_gemm_2x16c8__avx512skx);
1051   BENCHMARK_QU8_END2END(qu8_gemm_3x16c8__avx512skx);
1052   BENCHMARK_QU8_END2END(qu8_gemm_4x16c8__avx512skx);
1053 
1054   BENCHMARK_QU8_END2END(qu8_gemm_2x8c8__avx2);
1055   BENCHMARK_QU8_END2END(qu8_gemm_3x8c8__avx2);
1056 
1057   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld64);
1058   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld128);
1059   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld64);
1060   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld128);
1061   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld64);
1062   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld128);
1063 
1064   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld64);
1065   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld128);
1066   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld64);
1067   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld128);
1068 
1069   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld64);
1070   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld128);
1071   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld64);
1072   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld128);
1073   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld64);
1074   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld128);
1075 
1076   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld64);
1077   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld128);
1078   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld64);
1079   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld128);
1080 
1081   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld64);
1082   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld128);
1083   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld64);
1084   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld128);
1085   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld64);
1086   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld128);
1087 
1088   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld64);
1089   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld128);
1090   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld64);
1091   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld128);
1092 
1093   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld64);
1094   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld128);
1095   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld64);
1096   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld128);
1097   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld64);
1098   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld128);
1099 
1100   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld64);
1101   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld128);
1102   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld64);
1103   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld128);
1104 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1105 
1106 
1107 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1108   static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1109     GEMMEnd2EndBenchmark(state, model,
1110       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1111       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1112       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1113       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1114       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1115       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1116   }
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1117   static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1118     GEMMEnd2EndBenchmark(state, model,
1119       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1120       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1121       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1122       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1123       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1124       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1125   }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1126   static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1127     GEMMEnd2EndBenchmark(state, model,
1128       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1129       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1130       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1131       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1132       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1133       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1134   }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1135   static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1136     GEMMEnd2EndBenchmark(state, model,
1137       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1138       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1139       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1140       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1141       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1142       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1143   }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1144   static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1145     GEMMEnd2EndBenchmark(state, model,
1146       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1147       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1148       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1149       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1150       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1151       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1152   }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1153   static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1154     GEMMEnd2EndBenchmark(state, model,
1155       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1156       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1157       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1158       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1159       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1160       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1161   }
1162 
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1163   static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1164     GEMMEnd2EndBenchmark(state, model,
1165       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1166       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1167       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1168       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1169       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1170       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1171   }
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1172   static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1173     GEMMEnd2EndBenchmark(state, model,
1174       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1175       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1176       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1177       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1178       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1179       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1180   }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1181   static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1182     GEMMEnd2EndBenchmark(state, model,
1183       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1184       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1185       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1186       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1187       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1188       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1189   }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1190   static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1191     GEMMEnd2EndBenchmark(state, model,
1192       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1193       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1194       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1195       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1196       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1197       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1198   }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1199   static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1200     GEMMEnd2EndBenchmark(state, model,
1201       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1202       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1203       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1204       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1205       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1206       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1207   }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1208   static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1209     GEMMEnd2EndBenchmark(state, model,
1210       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1211       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1212       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1213       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1214       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1215       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1216   }
1217 
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1218   static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1219     GEMMEnd2EndBenchmark(state, model,
1220       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1221       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1222       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1223       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1224       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1225       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1226   }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1227   static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1228     GEMMEnd2EndBenchmark(state, model,
1229       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1230       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1231       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1232       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1233       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1234       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1235   }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1236   static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1237     GEMMEnd2EndBenchmark(state, model,
1238       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1239       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1240       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1241       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1242       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1243       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1244   }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1245   static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1246     GEMMEnd2EndBenchmark(state, model,
1247       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1248       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1249       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1250       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1251       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1252       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1253   }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1254   static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1255     GEMMEnd2EndBenchmark(state, model,
1256       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1257       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1258       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1259       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1260       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1261       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1262   }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1263   static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1264     GEMMEnd2EndBenchmark(state, model,
1265       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1266       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1267       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1268       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1269       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1270       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1271   }
1272 
1273   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)1274   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
1275   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
1276   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
1277   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
1278   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)
1279 
1280   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
1281   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
1282   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
1283   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
1284   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
1285   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)
1286 
1287   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
1288   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
1289   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
1290   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
1291   BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
1292   BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
1293 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1294 
1295 
1296 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1297   static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1298     GEMMEnd2EndBenchmark(state, model,
1299       xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1300       xnn_qu8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1301       xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1302       xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1303       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1304       2 /* mr */, 2 /* nr */);
1305   }
qu8_gemm_3x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1306   static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1307     GEMMEnd2EndBenchmark(state, model,
1308       xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1309       xnn_qu8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1310       xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1311       xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1312       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1313       3 /* mr */, 2 /* nr */);
1314   }
qu8_gemm_4x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1315   static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1316     GEMMEnd2EndBenchmark(state, model,
1317       xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1318       xnn_qu8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1319       xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1320       xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1321       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1322       4 /* mr */, 2 /* nr */);
1323   }
qu8_gemm_2x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1324   static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1325     GEMMEnd2EndBenchmark(state, model,
1326       xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1327       xnn_qu8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1328       xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1329       xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1330       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1331       2 /* mr */, 4 /* nr */);
1332   }
qu8_gemm_3x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1333   static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1334     GEMMEnd2EndBenchmark(state, model,
1335       xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1336       xnn_qu8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1337       xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1338       xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1339       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1340       3 /* mr */, 4 /* nr */);
1341   }
qu8_gemm_4x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1342   static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1343     GEMMEnd2EndBenchmark(state, model,
1344       xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1345       xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1346       xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1347       xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1348       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1349       4 /* mr */, 4 /* nr */);
1350   }
1351 
1352   BENCHMARK_QU8_END2END(qu8_gemm_2x2__wasm_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x2__wasm_fmagic)1353   BENCHMARK_QU8_END2END(qu8_gemm_3x2__wasm_fmagic)
1354   BENCHMARK_QU8_END2END(qu8_gemm_4x2__wasm_fmagic)
1355   BENCHMARK_QU8_END2END(qu8_gemm_2x4__wasm_fmagic)
1356   BENCHMARK_QU8_END2END(qu8_gemm_3x4__wasm_fmagic)
1357   BENCHMARK_QU8_END2END(qu8_gemm_4x4__wasm_fmagic)
1358 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1359 
1360 
1361 static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1362   GEMMEnd2EndBenchmark(state, model,
1363     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1364     xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1365     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1366     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1367     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1368     2 /* mr */, 2 /* nr */);
1369 }
qu8_gemm_3x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1370 static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1371   GEMMEnd2EndBenchmark(state, model,
1372     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1373     xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1374     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1375     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1376     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1377     3 /* mr */, 2 /* nr */);
1378 }
qu8_gemm_4x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1379 static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1380   GEMMEnd2EndBenchmark(state, model,
1381     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1382     xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1383     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1384     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1385     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1386     4 /* mr */, 2 /* nr */);
1387 }
qu8_gemm_2x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1388 static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1389   GEMMEnd2EndBenchmark(state, model,
1390     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1391     xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1392     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1393     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1394     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1395     2 /* mr */, 4 /* nr */);
1396 }
qu8_gemm_3x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1397 static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1398   GEMMEnd2EndBenchmark(state, model,
1399     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1400     xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1401     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1402     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1403     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1404     3 /* mr */, 4 /* nr */);
1405 }
qu8_gemm_4x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1406 static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1407   GEMMEnd2EndBenchmark(state, model,
1408     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1409     xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1410     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1411     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1412     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1413     4 /* mr */, 4 /* nr */);
1414 }
1415 
qu8_gemm_2x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1416 static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1417   GEMMEnd2EndBenchmark(state, model,
1418     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1419     xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1420     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1421     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1422     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1423     2 /* mr */, 2 /* nr */);
1424 }
qu8_gemm_3x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1425 static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1426   GEMMEnd2EndBenchmark(state, model,
1427     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1428     xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1429     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1430     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1431     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1432     3 /* mr */, 2 /* nr */);
1433 }
qu8_gemm_4x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1434 static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1435   GEMMEnd2EndBenchmark(state, model,
1436     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1437     xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1438     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1439     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1440     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1441     4 /* mr */, 2 /* nr */);
1442 }
qu8_gemm_2x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1443 static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1444   GEMMEnd2EndBenchmark(state, model,
1445     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1446     xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1447     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1448     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1449     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1450     2 /* mr */, 4 /* nr */);
1451 }
qu8_gemm_3x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1452 static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1453   GEMMEnd2EndBenchmark(state, model,
1454     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1455     xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1456     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1457     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1458     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1459     3 /* mr */, 4 /* nr */);
1460 }
qu8_gemm_4x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1461 static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1462   GEMMEnd2EndBenchmark(state, model,
1463     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1464     xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1465     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1466     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1467     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1468     4 /* mr */, 4 /* nr */);
1469 }
1470 
qu8_gemm_2x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1471 static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1472   GEMMEnd2EndBenchmark(state, model,
1473     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1474     xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1475     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1476     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1477     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1478     2 /* mr */, 2 /* nr */);
1479 }
qu8_gemm_3x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1480 static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1481   GEMMEnd2EndBenchmark(state, model,
1482     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1483     xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1484     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1485     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1486     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1487     3 /* mr */, 2 /* nr */);
1488 }
qu8_gemm_4x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1489 static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1490   GEMMEnd2EndBenchmark(state, model,
1491     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1492     xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1493     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1494     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1495     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1496     4 /* mr */, 2 /* nr */);
1497 }
qu8_gemm_2x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1498 static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1499   GEMMEnd2EndBenchmark(state, model,
1500     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1501     xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1502     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1503     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1504     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1505     2 /* mr */, 4 /* nr */);
1506 }
qu8_gemm_3x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1507 static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1508   GEMMEnd2EndBenchmark(state, model,
1509     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1510     xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1511     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1512     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1513     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1514     3 /* mr */, 4 /* nr */);
1515 }
qu8_gemm_4x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1516 static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1517   GEMMEnd2EndBenchmark(state, model,
1518     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1519     xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1520     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1521     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1522     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1523     4 /* mr */, 4 /* nr */);
1524 }
1525 
// Hand the scalar "fmagic" wrappers to BENCHMARK_QU8_END2END, one invocation
// per MRxNR tile (2x2, 3x2, 4x2, 2x4, 3x4, 4x4).
// NOTE(review): neither the macro nor these wrappers are visible in this part
// of the file; presumably the macro registers each wrapper with the benchmark
// framework -- confirm against its definition.
1526 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_fmagic)
1527 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_fmagic)
1528 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_fmagic)
1529 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_fmagic)
1530 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_fmagic)
1531 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_fmagic)
1532 
// Hand the scalar "imagic" wrappers to BENCHMARK_QU8_END2END, one invocation
// per MRxNR tile (2x2, 3x2, 4x2, 2x4, 3x4, 4x4).
// NOTE(review): neither the macro nor these wrappers are visible in this part
// of the file; presumably the macro registers each wrapper with the benchmark
// framework -- confirm against its definition.
1533 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_imagic)
1534 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_imagic)
1535 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_imagic)
1536 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_imagic)
1537 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_imagic)
1538 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_imagic)
1539 
// Hand the scalar "lrintf" wrappers defined just above to
// BENCHMARK_QU8_END2END, one invocation per MRxNR tile
// (2x2, 3x2, 4x2, 2x4, 3x4, 4x4).
// NOTE(review): the macro definition is not visible in this part of the file;
// presumably it registers each wrapper with the benchmark framework -- confirm.
1540 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_lrintf)
1541 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_lrintf)
1542 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_lrintf)
1543 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_lrintf)
1544 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_lrintf)
1545 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_lrintf)
1546 
1547 
// Expand BENCHMARK_MAIN() only when XNNPACK_BENCHMARK_NO_MAIN is not defined.
// NOTE(review): presumably the define lets a build that supplies its own
// main() link this file without a duplicate entry point -- confirm against
// the build configuration.
1548 #ifndef XNNPACK_BENCHMARK_NO_MAIN
1549 BENCHMARK_MAIN();
1550 #endif
1551