1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <cstring>
9 #include <functional>
10 #include <random>
11 #include <vector>
12
13 #include <xnnpack.h>
14
15 #include <benchmark/benchmark.h>
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19
20 #include <xnnpack.h>
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/microfnptr.h>
24 #include <xnnpack/microparams-init.h>
25
26
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qs8_gemm_minmax_ukernel_function gemm,xnn_qs8_igemm_minmax_ukernel_function igemm,xnn_qs8_gemm_minmax_ukernel_function gemm1,xnn_qs8_igemm_minmax_ukernel_function igemm1,xnn_init_qs8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)27 static void GEMMEnd2EndBenchmark(
28 benchmark::State& state,
29 models::ExecutionPlanFactory model_factory,
30 xnn_qs8_gemm_minmax_ukernel_function gemm,
31 xnn_qs8_igemm_minmax_ukernel_function igemm,
32 xnn_qs8_gemm_minmax_ukernel_function gemm1,
33 xnn_qs8_igemm_minmax_ukernel_function igemm1,
34 xnn_init_qs8_conv_minmax_params_fn init_params,
35 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
36 benchmark::utils::IsaCheckFunction isa_check = nullptr)
37 {
38 if (isa_check && !isa_check(state)) {
39 return;
40 }
41 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
42 state.SkipWithError("failed to initialize XNNPACK");
43 return;
44 }
45
46 // Override microkernels chosen in xnn_initialize
47 // Note: do not directly assign to xnn_params.qs8.gemm because it breaks older gcc.
48 std::memset(&xnn_params.qs8.gemm, 0, sizeof(xnn_params.qs8.gemm));
49 xnn_params.qs8.gemm.minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
50 xnn_params.qs8.gemm.minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
51 xnn_params.qs8.gemm.minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
52 xnn_params.qs8.gemm.minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
53 xnn_params.qs8.gemm.init.qs8 = init_params;
54 xnn_params.qs8.gemm.mr = mr;
55 xnn_params.qs8.gemm.nr = nr;
56 xnn_params.qs8.gemm.log2_kr = log2_kr;
57 xnn_params.qs8.gemm.log2_sr = log2_sr;
58
59 auto execution_plan = model_factory(nullptr);
60 if (execution_plan.empty()) {
61 state.SkipWithError("failed to create a model");
62 return;
63 }
64
65 for (auto _ : state) {
66 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
67 xnn_status status = xnn_run_operator(op.get(), nullptr);
68 if (status != xnn_status_success) {
69 state.SkipWithError("failed to run a model");
70 return;
71 }
72 }
73 }
74
75 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
76 if (cpu_frequency != 0) {
77 state.counters["cpufreq"] = cpu_frequency;
78 }
79 }
80
81
82 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)83 static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
84 GEMMEnd2EndBenchmark(state, model,
85 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
86 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55,
87 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
88 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
89 xnn_init_qs8_conv_minmax_rndnu_neon_params,
90 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
91 benchmark::utils::CheckNEONDOT);
92 }
qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)93 static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
94 GEMMEnd2EndBenchmark(state, model,
95 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
96 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64,
97 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
98 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
99 xnn_init_qs8_conv_minmax_rndnu_neon_params,
100 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
101 benchmark::utils::CheckNEONDOT);
102 }
103
104 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)105 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__aarch32_neondot_ld64)
106 #endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
107
108
109 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
110 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
111 GEMMEnd2EndBenchmark(state, model,
112 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
113 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
114 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
115 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
116 xnn_init_qs8_conv_minmax_rndnu_neon_params,
117 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
118 benchmark::utils::CheckNEON);
119 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)120 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
121 GEMMEnd2EndBenchmark(state, model,
122 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
123 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
124 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
125 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
126 xnn_init_qs8_conv_minmax_rndnu_neon_params,
127 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
128 benchmark::utils::CheckNEON);
129 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)130 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
131 GEMMEnd2EndBenchmark(state, model,
132 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
133 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
134 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
135 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
136 xnn_init_qs8_conv_minmax_rndnu_neon_params,
137 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
138 benchmark::utils::CheckNEON);
139 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)140 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
141 GEMMEnd2EndBenchmark(state, model,
142 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
143 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
144 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
145 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
146 xnn_init_qs8_conv_minmax_rndnu_neon_params,
147 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
148 benchmark::utils::CheckNEON);
149 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)150 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
151 GEMMEnd2EndBenchmark(state, model,
152 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
153 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
154 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
155 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
156 xnn_init_qs8_conv_minmax_rndnu_neon_params,
157 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
158 benchmark::utils::CheckNEON);
159 }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)160 static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
161 GEMMEnd2EndBenchmark(state, model,
162 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
163 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
164 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
165 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
166 xnn_init_qs8_conv_minmax_rndnu_neon_params,
167 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
168 benchmark::utils::CheckNEON);
169 }
170
171 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)172 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
173 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
174 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
175 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
176 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
177 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
178
179
180 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
181 static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
182 GEMMEnd2EndBenchmark(state, model,
183 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
184 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
185 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
186 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
187 xnn_init_qs8_conv_minmax_rndnu_neon_params,
188 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
189 benchmark::utils::CheckNEONDOT);
190 }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,models::ExecutionPlanFactory model)191 static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, models::ExecutionPlanFactory model) {
192 GEMMEnd2EndBenchmark(state, model,
193 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32,
194 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
195 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32,
196 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
197 xnn_init_qs8_conv_minmax_rndnu_neon_params,
198 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
199 benchmark::utils::CheckNEONDOT);
200 }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,models::ExecutionPlanFactory model)201 static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
202 GEMMEnd2EndBenchmark(state, model,
203 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
204 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64,
205 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
206 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
207 xnn_init_qs8_conv_minmax_rndnu_neon_params,
208 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
209 benchmark::utils::CheckNEONDOT);
210 }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)211 static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
212 GEMMEnd2EndBenchmark(state, model,
213 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
214 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
215 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64,
216 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
217 xnn_init_qs8_conv_minmax_rndnu_neon_params,
218 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
219 benchmark::utils::CheckNEONDOT);
220 }
221
222 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)223 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)
224 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld64)
225 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
226 #endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
227
228
229 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
230 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
231 GEMMEnd2EndBenchmark(state, model,
232 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
233 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
234 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
235 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
236 xnn_init_qs8_conv_minmax_rndnu_neon_params,
237 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
238 benchmark::utils::CheckNEON);
239 }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)240 static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
241 GEMMEnd2EndBenchmark(state, model,
242 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
243 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
244 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
245 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
246 xnn_init_qs8_conv_minmax_rndnu_neon_params,
247 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
248 benchmark::utils::CheckNEON);
249 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)250 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
251 GEMMEnd2EndBenchmark(state, model,
252 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
253 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
254 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
255 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
256 xnn_init_qs8_conv_minmax_rndnu_neon_params,
257 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
258 benchmark::utils::CheckNEON);
259 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)260 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
261 GEMMEnd2EndBenchmark(state, model,
262 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
263 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
264 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
265 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
266 xnn_init_qs8_conv_minmax_rndnu_neon_params,
267 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
268 benchmark::utils::CheckNEON);
269 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)270 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
271 GEMMEnd2EndBenchmark(state, model,
272 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
273 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
274 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
275 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
276 xnn_init_qs8_conv_minmax_rndnu_neon_params,
277 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
278 benchmark::utils::CheckNEON);
279 }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)280 static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
281 GEMMEnd2EndBenchmark(state, model,
282 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
283 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
284 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
285 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
286 xnn_init_qs8_conv_minmax_rndnu_neon_params,
287 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
288 benchmark::utils::CheckNEON);
289 }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)290 static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
291 GEMMEnd2EndBenchmark(state, model,
292 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
293 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
294 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
295 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
296 xnn_init_qs8_conv_minmax_rndnu_neon_params,
297 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
298 benchmark::utils::CheckNEON);
299 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,models::ExecutionPlanFactory model)300 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
301 GEMMEnd2EndBenchmark(state, model,
302 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
303 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm,
304 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
305 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm,
306 xnn_init_qs8_conv_minmax_rndnu_neon_params,
307 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
308 benchmark::utils::CheckNEON);
309 }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)310 static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
311 GEMMEnd2EndBenchmark(state, model,
312 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53,
313 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
314 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
315 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal,
316 xnn_init_qs8_conv_minmax_rndnu_neon_params,
317 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
318 benchmark::utils::CheckNEON);
319 }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)320 static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
321 GEMMEnd2EndBenchmark(state, model,
322 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
323 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53,
324 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
325 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53,
326 xnn_init_qs8_conv_minmax_rndnu_neon_params,
327 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
328 benchmark::utils::CheckNEON);
329 }
330
331 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)332 BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
333 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
334 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
335 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
336 BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
337 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
338 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
339 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
340 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal)
341 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
342
343
344 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
345 static void qs8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
346 GEMMEnd2EndBenchmark(state, model,
347 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
348 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
349 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
350 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
351 xnn_init_qs8_conv_minmax_rndnu_neon_params,
352 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
353 benchmark::utils::CheckNEONDOT);
354 }
qs8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)355 static void qs8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
356 GEMMEnd2EndBenchmark(state, model,
357 xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
358 xnn_qs8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
359 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
360 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
361 xnn_init_qs8_conv_minmax_rndnu_neon_params,
362 6 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
363 benchmark::utils::CheckNEONDOT);
364 }
qs8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)365 static void qs8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
366 GEMMEnd2EndBenchmark(state, model,
367 xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
368 xnn_qs8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
369 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
370 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
371 xnn_init_qs8_conv_minmax_rndnu_neon_params,
372 8 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
373 benchmark::utils::CheckNEONDOT);
374 }
qs8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)375 static void qs8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
376 GEMMEnd2EndBenchmark(state, model,
377 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
378 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
379 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
380 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
381 xnn_init_qs8_conv_minmax_rndnu_neon_params,
382 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
383 benchmark::utils::CheckNEONDOT);
384 }
qs8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)385 static void qs8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
386 GEMMEnd2EndBenchmark(state, model,
387 xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
388 xnn_qs8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
389 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
390 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
391 xnn_init_qs8_conv_minmax_rndnu_neon_params,
392 6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
393 benchmark::utils::CheckNEONDOT);
394 }
qs8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)395 static void qs8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
396 GEMMEnd2EndBenchmark(state, model,
397 xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
398 xnn_qs8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
399 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
400 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
401 xnn_init_qs8_conv_minmax_rndnu_neon_params,
402 8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
403 benchmark::utils::CheckNEONDOT);
404 }
405
406 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neondot);
407 BENCHMARK_QS8_END2END(qs8_gemm_6x8c4__neondot);
408 BENCHMARK_QS8_END2END(qs8_gemm_8x8c4__neondot);
409 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neondot);
410 BENCHMARK_QS8_END2END(qs8_gemm_6x16c4__neondot);
411 BENCHMARK_QS8_END2END(qs8_gemm_8x16c4__neondot);
412 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
413
414
415 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qs8_gemm_2x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)416 static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
417 GEMMEnd2EndBenchmark(state, model,
418 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
419 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
420 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
421 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
422 xnn_init_qs8_conv_minmax_rndnu_neon_params,
423 2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
424 benchmark::utils::CheckNEON);
425 }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)426 static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
427 GEMMEnd2EndBenchmark(state, model,
428 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
429 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
430 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
431 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
432 xnn_init_qs8_conv_minmax_rndnu_neon_params,
433 2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
434 benchmark::utils::CheckNEON);
435 }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)436 static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
437 GEMMEnd2EndBenchmark(state, model,
438 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
439 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
440 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
441 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
442 xnn_init_qs8_conv_minmax_rndnu_neon_params,
443 3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
444 benchmark::utils::CheckNEON);
445 }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)446 static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
447 GEMMEnd2EndBenchmark(state, model,
448 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
449 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
450 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
451 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
452 xnn_init_qs8_conv_minmax_rndnu_neon_params,
453 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
454 benchmark::utils::CheckNEON);
455 }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)456 static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
457 GEMMEnd2EndBenchmark(state, model,
458 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
459 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
460 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
461 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
462 xnn_init_qs8_conv_minmax_rndnu_neon_params,
463 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
464 benchmark::utils::CheckNEON);
465 }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)466 static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
467 GEMMEnd2EndBenchmark(state, model,
468 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
469 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
470 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
471 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
472 xnn_init_qs8_conv_minmax_rndnu_neon_params,
473 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
474 benchmark::utils::CheckNEON);
475 }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)476 static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
477 GEMMEnd2EndBenchmark(state, model,
478 xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
479 xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
480 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
481 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
482 xnn_init_qs8_conv_minmax_rndnu_neon_params,
483 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
484 benchmark::utils::CheckNEON);
485 }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)486 static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
487 GEMMEnd2EndBenchmark(state, model,
488 xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
489 xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
490 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
491 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
492 xnn_init_qs8_conv_minmax_rndnu_neon_params,
493 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
494 benchmark::utils::CheckNEON);
495 }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)496 static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
497 GEMMEnd2EndBenchmark(state, model,
498 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
499 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm,
500 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
501 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
502 xnn_init_qs8_conv_minmax_rndnu_neon_params,
503 2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
504 benchmark::utils::CheckNEON);
505 }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)506 static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
507 GEMMEnd2EndBenchmark(state, model,
508 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
509 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm,
510 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
511 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
512 xnn_init_qs8_conv_minmax_rndnu_neon_params,
513 2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
514 benchmark::utils::CheckNEON);
515 }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)516 static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
517 GEMMEnd2EndBenchmark(state, model,
518 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
519 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm,
520 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
521 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
522 xnn_init_qs8_conv_minmax_rndnu_neon_params,
523 3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
524 benchmark::utils::CheckNEON);
525 }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)526 static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
527 GEMMEnd2EndBenchmark(state, model,
528 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
529 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm,
530 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
531 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
532 xnn_init_qs8_conv_minmax_rndnu_neon_params,
533 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
534 benchmark::utils::CheckNEON);
535 }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)536 static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
537 GEMMEnd2EndBenchmark(state, model,
538 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
539 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm,
540 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
541 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
542 xnn_init_qs8_conv_minmax_rndnu_neon_params,
543 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
544 benchmark::utils::CheckNEON);
545 }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)546 static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
547 GEMMEnd2EndBenchmark(state, model,
548 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
549 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm,
550 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
551 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
552 xnn_init_qs8_conv_minmax_rndnu_neon_params,
553 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
554 benchmark::utils::CheckNEON);
555 }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)556 static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
557 GEMMEnd2EndBenchmark(state, model,
558 xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
559 xnn_qs8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm,
560 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
561 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm,
562 xnn_init_qs8_conv_minmax_rndnu_neon_params,
563 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
564 benchmark::utils::CheckNEON);
565 }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,models::ExecutionPlanFactory model)566 static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, models::ExecutionPlanFactory model) {
567 GEMMEnd2EndBenchmark(state, model,
568 xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
569 xnn_qs8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm,
570 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
571 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm,
572 xnn_init_qs8_conv_minmax_rndnu_neon_params,
573 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
574 benchmark::utils::CheckNEON);
575 }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)576 static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
577 GEMMEnd2EndBenchmark(state, model,
578 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
579 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup,
580 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
581 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
582 xnn_init_qs8_conv_minmax_rndnu_neon_params,
583 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
584 benchmark::utils::CheckNEON);
585 }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)586 static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
587 GEMMEnd2EndBenchmark(state, model,
588 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
589 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup,
590 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
591 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
592 xnn_init_qs8_conv_minmax_rndnu_neon_params,
593 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
594 benchmark::utils::CheckNEON);
595 }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)596 static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
597 GEMMEnd2EndBenchmark(state, model,
598 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
599 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup,
600 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
601 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
602 xnn_init_qs8_conv_minmax_rndnu_neon_params,
603 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
604 benchmark::utils::CheckNEON);
605 }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)606 static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
607 GEMMEnd2EndBenchmark(state, model,
608 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
609 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup,
610 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
611 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
612 xnn_init_qs8_conv_minmax_rndnu_neon_params,
613 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
614 benchmark::utils::CheckNEON);
615 }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)616 static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
617 GEMMEnd2EndBenchmark(state, model,
618 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
619 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup,
620 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
621 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup,
622 xnn_init_qs8_conv_minmax_rndnu_neon_params,
623 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
624 benchmark::utils::CheckNEON);
625 }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)626 static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
627 GEMMEnd2EndBenchmark(state, model,
628 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
629 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup,
630 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
631 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup,
632 xnn_init_qs8_conv_minmax_rndnu_neon_params,
633 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
634 benchmark::utils::CheckNEON);
635 }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)636 static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
637 GEMMEnd2EndBenchmark(state, model,
638 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
639 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r,
640 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
641 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
642 xnn_init_qs8_conv_minmax_rndnu_neon_params,
643 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
644 benchmark::utils::CheckNEON);
645 }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)646 static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
647 GEMMEnd2EndBenchmark(state, model,
648 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
649 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r,
650 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
651 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
652 xnn_init_qs8_conv_minmax_rndnu_neon_params,
653 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
654 benchmark::utils::CheckNEON);
655 }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)656 static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
657 GEMMEnd2EndBenchmark(state, model,
658 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
659 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r,
660 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
661 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
662 xnn_init_qs8_conv_minmax_rndnu_neon_params,
663 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
664 benchmark::utils::CheckNEON);
665 }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)666 static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
667 GEMMEnd2EndBenchmark(state, model,
668 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
669 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r,
670 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
671 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
672 xnn_init_qs8_conv_minmax_rndnu_neon_params,
673 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
674 benchmark::utils::CheckNEON);
675 }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)676 static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
677 GEMMEnd2EndBenchmark(state, model,
678 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
679 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r,
680 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
681 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r,
682 xnn_init_qs8_conv_minmax_rndnu_neon_params,
683 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
684 benchmark::utils::CheckNEON);
685 }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)686 static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
687 GEMMEnd2EndBenchmark(state, model,
688 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
689 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r,
690 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
691 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r,
692 xnn_init_qs8_conv_minmax_rndnu_neon_params,
693 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
694 benchmark::utils::CheckNEON);
695 }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)696 static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
697 GEMMEnd2EndBenchmark(state, model,
698 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
699 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r,
700 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
701 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
702 xnn_init_qs8_conv_minmax_rndnu_neon_params,
703 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
704 benchmark::utils::CheckNEON);
705 }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)706 static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
707 GEMMEnd2EndBenchmark(state, model,
708 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
709 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r,
710 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
711 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
712 xnn_init_qs8_conv_minmax_rndnu_neon_params,
713 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
714 benchmark::utils::CheckNEON);
715 }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)716 static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
717 GEMMEnd2EndBenchmark(state, model,
718 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
719 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r,
720 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
721 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
722 xnn_init_qs8_conv_minmax_rndnu_neon_params,
723 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
724 benchmark::utils::CheckNEON);
725 }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)726 static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
727 GEMMEnd2EndBenchmark(state, model,
728 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
729 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r,
730 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
731 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
732 xnn_init_qs8_conv_minmax_rndnu_neon_params,
733 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
734 benchmark::utils::CheckNEON);
735 }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)736 static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
737 GEMMEnd2EndBenchmark(state, model,
738 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
739 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r,
740 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
741 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r,
742 xnn_init_qs8_conv_minmax_rndnu_neon_params,
743 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
744 benchmark::utils::CheckNEON);
745 }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)746 static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
747 GEMMEnd2EndBenchmark(state, model,
748 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
749 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r,
750 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
751 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r,
752 xnn_init_qs8_conv_minmax_rndnu_neon_params,
753 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
754 benchmark::utils::CheckNEON);
755 }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)756 static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
757 GEMMEnd2EndBenchmark(state, model,
758 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
759 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r,
760 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
761 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
762 xnn_init_qs8_conv_minmax_rndnu_neon_params,
763 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
764 benchmark::utils::CheckNEON);
765 }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)766 static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
767 GEMMEnd2EndBenchmark(state, model,
768 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
769 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r,
770 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
771 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
772 xnn_init_qs8_conv_minmax_rndnu_neon_params,
773 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
774 benchmark::utils::CheckNEON);
775 }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)776 static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
777 GEMMEnd2EndBenchmark(state, model,
778 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
779 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r,
780 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
781 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
782 xnn_init_qs8_conv_minmax_rndnu_neon_params,
783 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
784 benchmark::utils::CheckNEON);
785 }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)786 static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
787 GEMMEnd2EndBenchmark(state, model,
788 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
789 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r,
790 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
791 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
792 xnn_init_qs8_conv_minmax_rndnu_neon_params,
793 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
794 benchmark::utils::CheckNEON);
795 }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)796 static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
797 GEMMEnd2EndBenchmark(state, model,
798 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
799 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r,
800 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
801 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r,
802 xnn_init_qs8_conv_minmax_rndnu_neon_params,
803 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
804 benchmark::utils::CheckNEON);
805 }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)806 static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
807 GEMMEnd2EndBenchmark(state, model,
808 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
809 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r,
810 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
811 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r,
812 xnn_init_qs8_conv_minmax_rndnu_neon_params,
813 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
814 benchmark::utils::CheckNEON);
815 }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)816 static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
817 GEMMEnd2EndBenchmark(state, model,
818 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
819 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal,
820 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
821 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
822 xnn_init_qs8_conv_minmax_rndnu_neon_params,
823 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
824 benchmark::utils::CheckNEON);
825 }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)826 static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
827 GEMMEnd2EndBenchmark(state, model,
828 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
829 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal,
830 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
831 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
832 xnn_init_qs8_conv_minmax_rndnu_neon_params,
833 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
834 benchmark::utils::CheckNEON);
835 }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)836 static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
837 GEMMEnd2EndBenchmark(state, model,
838 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
839 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal,
840 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
841 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
842 xnn_init_qs8_conv_minmax_rndnu_neon_params,
843 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
844 benchmark::utils::CheckNEON);
845 }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)846 static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
847 GEMMEnd2EndBenchmark(state, model,
848 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
849 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal,
850 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
851 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
852 xnn_init_qs8_conv_minmax_rndnu_neon_params,
853 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
854 benchmark::utils::CheckNEON);
855 }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)856 static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
857 GEMMEnd2EndBenchmark(state, model,
858 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
859 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal,
860 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
861 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal,
862 xnn_init_qs8_conv_minmax_rndnu_neon_params,
863 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
864 benchmark::utils::CheckNEON);
865 }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)866 static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
867 GEMMEnd2EndBenchmark(state, model,
868 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
869 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal,
870 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
871 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal,
872 xnn_init_qs8_conv_minmax_rndnu_neon_params,
873 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
874 benchmark::utils::CheckNEON);
875 }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)876 static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
877 GEMMEnd2EndBenchmark(state, model,
878 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
879 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup,
880 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
881 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
882 xnn_init_qs8_conv_minmax_rndnu_neon_params,
883 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
884 benchmark::utils::CheckNEON);
885 }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)886 static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
887 GEMMEnd2EndBenchmark(state, model,
888 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
889 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup,
890 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
891 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
892 xnn_init_qs8_conv_minmax_rndnu_neon_params,
893 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
894 benchmark::utils::CheckNEON);
895 }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)896 static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
897 GEMMEnd2EndBenchmark(state, model,
898 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
899 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup,
900 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
901 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
902 xnn_init_qs8_conv_minmax_rndnu_neon_params,
903 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
904 benchmark::utils::CheckNEON);
905 }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)906 static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
907 GEMMEnd2EndBenchmark(state, model,
908 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
909 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup,
910 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
911 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
912 xnn_init_qs8_conv_minmax_rndnu_neon_params,
913 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
914 benchmark::utils::CheckNEON);
915 }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)916 static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
917 GEMMEnd2EndBenchmark(state, model,
918 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
919 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup,
920 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
921 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup,
922 xnn_init_qs8_conv_minmax_rndnu_neon_params,
923 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
924 benchmark::utils::CheckNEON);
925 }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,models::ExecutionPlanFactory model)926 static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
927 GEMMEnd2EndBenchmark(state, model,
928 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
929 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup,
930 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
931 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup,
932 xnn_init_qs8_conv_minmax_rndnu_neon_params,
933 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
934 benchmark::utils::CheckNEON);
935 }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)936 static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
937 GEMMEnd2EndBenchmark(state, model,
938 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
939 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r,
940 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
941 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
942 xnn_init_qs8_conv_minmax_rndnu_neon_params,
943 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
944 benchmark::utils::CheckNEON);
945 }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)946 static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
947 GEMMEnd2EndBenchmark(state, model,
948 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
949 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r,
950 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
951 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
952 xnn_init_qs8_conv_minmax_rndnu_neon_params,
953 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
954 benchmark::utils::CheckNEON);
955 }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)956 static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
957 GEMMEnd2EndBenchmark(state, model,
958 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
959 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r,
960 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
961 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
962 xnn_init_qs8_conv_minmax_rndnu_neon_params,
963 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
964 benchmark::utils::CheckNEON);
965 }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)966 static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
967 GEMMEnd2EndBenchmark(state, model,
968 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
969 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r,
970 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
971 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
972 xnn_init_qs8_conv_minmax_rndnu_neon_params,
973 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
974 benchmark::utils::CheckNEON);
975 }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)976 static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
977 GEMMEnd2EndBenchmark(state, model,
978 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
979 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r,
980 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
981 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r,
982 xnn_init_qs8_conv_minmax_rndnu_neon_params,
983 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
984 benchmark::utils::CheckNEON);
985 }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)986 static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
987 GEMMEnd2EndBenchmark(state, model,
988 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
989 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r,
990 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
991 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r,
992 xnn_init_qs8_conv_minmax_rndnu_neon_params,
993 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
994 benchmark::utils::CheckNEON);
995 }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)996 static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
997 GEMMEnd2EndBenchmark(state, model,
998 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
999 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r,
1000 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1001 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1002 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1003 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1004 benchmark::utils::CheckNEON);
1005 }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1006 static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1007 GEMMEnd2EndBenchmark(state, model,
1008 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
1009 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r,
1010 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1011 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1012 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1013 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1014 benchmark::utils::CheckNEON);
1015 }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1016 static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1017 GEMMEnd2EndBenchmark(state, model,
1018 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
1019 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r,
1020 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1021 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1022 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1023 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1024 benchmark::utils::CheckNEON);
1025 }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1026 static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1027 GEMMEnd2EndBenchmark(state, model,
1028 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
1029 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r,
1030 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1031 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1032 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1033 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1034 benchmark::utils::CheckNEON);
1035 }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1036 static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1037 GEMMEnd2EndBenchmark(state, model,
1038 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
1039 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r,
1040 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1041 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r,
1042 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1043 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1044 benchmark::utils::CheckNEON);
1045 }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1046 static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1047 GEMMEnd2EndBenchmark(state, model,
1048 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
1049 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r,
1050 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1051 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r,
1052 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1053 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1054 benchmark::utils::CheckNEON);
1055 }
qs8_gemm_2x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1056 static void qs8_gemm_2x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1057 GEMMEnd2EndBenchmark(state, model,
1058 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
1059 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mlal,
1060 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1061 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1062 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1063 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1064 benchmark::utils::CheckNEON);
1065 }
qs8_gemm_2x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1066 static void qs8_gemm_2x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1067 GEMMEnd2EndBenchmark(state, model,
1068 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
1069 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mlal,
1070 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1071 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1072 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1073 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1074 benchmark::utils::CheckNEON);
1075 }
qs8_gemm_3x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1076 static void qs8_gemm_3x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1077 GEMMEnd2EndBenchmark(state, model,
1078 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
1079 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal,
1080 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1081 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1082 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1083 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1084 benchmark::utils::CheckNEON);
1085 }
qs8_gemm_3x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1086 static void qs8_gemm_3x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1087 GEMMEnd2EndBenchmark(state, model,
1088 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1089 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal,
1090 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1091 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1092 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1093 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1094 benchmark::utils::CheckNEON);
1095 }
qs8_gemm_4x8c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1096 static void qs8_gemm_4x8c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1097 GEMMEnd2EndBenchmark(state, model,
1098 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1099 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal,
1100 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1101 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mlal,
1102 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1103 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1104 benchmark::utils::CheckNEON);
1105 }
qs8_gemm_4x16c4s2__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1106 static void qs8_gemm_4x16c4s2__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1107 GEMMEnd2EndBenchmark(state, model,
1108 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1109 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal,
1110 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1111 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal,
1112 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1113 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1114 benchmark::utils::CheckNEON);
1115 }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1116 static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1117 GEMMEnd2EndBenchmark(state, model,
1118 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1119 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup,
1120 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1121 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1122 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1123 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1124 benchmark::utils::CheckNEON);
1125 }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1126 static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1127 GEMMEnd2EndBenchmark(state, model,
1128 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1129 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup,
1130 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1131 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1132 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1133 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1134 benchmark::utils::CheckNEON);
1135 }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1136 static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1137 GEMMEnd2EndBenchmark(state, model,
1138 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1139 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup,
1140 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1141 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1142 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1143 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1144 benchmark::utils::CheckNEON);
1145 }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1146 static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1147 GEMMEnd2EndBenchmark(state, model,
1148 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1149 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup,
1150 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1151 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1152 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1153 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1154 benchmark::utils::CheckNEON);
1155 }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1156 static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1157 GEMMEnd2EndBenchmark(state, model,
1158 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1159 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup,
1160 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1161 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup,
1162 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1163 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1164 benchmark::utils::CheckNEON);
1165 }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1166 static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1167 GEMMEnd2EndBenchmark(state, model,
1168 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1169 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup,
1170 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1171 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup,
1172 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1173 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1174 benchmark::utils::CheckNEON);
1175 }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1176 static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1177 GEMMEnd2EndBenchmark(state, model,
1178 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1179 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r,
1180 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1181 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1182 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1183 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1184 benchmark::utils::CheckNEON);
1185 }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1186 static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1187 GEMMEnd2EndBenchmark(state, model,
1188 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1189 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r,
1190 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1191 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1192 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1193 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1194 benchmark::utils::CheckNEON);
1195 }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1196 static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1197 GEMMEnd2EndBenchmark(state, model,
1198 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1199 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r,
1200 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1201 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1202 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1203 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1204 benchmark::utils::CheckNEON);
1205 }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1206 static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1207 GEMMEnd2EndBenchmark(state, model,
1208 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1209 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r,
1210 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1211 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1212 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1213 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1214 benchmark::utils::CheckNEON);
1215 }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1216 static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1217 GEMMEnd2EndBenchmark(state, model,
1218 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1219 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r,
1220 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1221 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r,
1222 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1223 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1224 benchmark::utils::CheckNEON);
1225 }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1226 static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1227 GEMMEnd2EndBenchmark(state, model,
1228 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1229 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r,
1230 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1231 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r,
1232 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1233 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1234 benchmark::utils::CheckNEON);
1235 }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1236 static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1237 GEMMEnd2EndBenchmark(state, model,
1238 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1239 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r,
1240 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1241 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1242 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1243 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1244 benchmark::utils::CheckNEON);
1245 }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1246 static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1247 GEMMEnd2EndBenchmark(state, model,
1248 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1249 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r,
1250 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1251 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1252 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1253 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1254 benchmark::utils::CheckNEON);
1255 }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1256 static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1257 GEMMEnd2EndBenchmark(state, model,
1258 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1259 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r,
1260 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1261 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1262 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1263 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1264 benchmark::utils::CheckNEON);
1265 }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1266 static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1267 GEMMEnd2EndBenchmark(state, model,
1268 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1269 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r,
1270 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1271 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1272 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1273 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1274 benchmark::utils::CheckNEON);
1275 }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1276 static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1277 GEMMEnd2EndBenchmark(state, model,
1278 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1279 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r,
1280 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1281 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r,
1282 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1283 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1284 benchmark::utils::CheckNEON);
1285 }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1286 static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1287 GEMMEnd2EndBenchmark(state, model,
1288 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1289 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r,
1290 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1291 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r,
1292 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1293 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1294 benchmark::utils::CheckNEON);
1295 }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1296 static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1297 GEMMEnd2EndBenchmark(state, model,
1298 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1299 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r,
1300 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1301 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1302 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1303 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1304 benchmark::utils::CheckNEON);
1305 }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1306 static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1307 GEMMEnd2EndBenchmark(state, model,
1308 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1309 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r,
1310 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1311 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1312 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1313 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1314 benchmark::utils::CheckNEON);
1315 }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1316 static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1317 GEMMEnd2EndBenchmark(state, model,
1318 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1319 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r,
1320 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1321 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1322 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1323 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1324 benchmark::utils::CheckNEON);
1325 }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1326 static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1327 GEMMEnd2EndBenchmark(state, model,
1328 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1329 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r,
1330 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1331 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1332 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1333 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1334 benchmark::utils::CheckNEON);
1335 }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1336 static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1337 GEMMEnd2EndBenchmark(state, model,
1338 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1339 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r,
1340 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1341 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r,
1342 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1343 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1344 benchmark::utils::CheckNEON);
1345 }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,models::ExecutionPlanFactory model)1346 static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, models::ExecutionPlanFactory model) {
1347 GEMMEnd2EndBenchmark(state, model,
1348 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1349 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r,
1350 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1351 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r,
1352 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1353 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
1354 benchmark::utils::CheckNEON);
1355 }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1356 static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1357 GEMMEnd2EndBenchmark(state, model,
1358 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1359 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull,
1360 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1361 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1362 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1363 2 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1364 benchmark::utils::CheckNEON);
1365 }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1366 static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1367 GEMMEnd2EndBenchmark(state, model,
1368 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1369 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull,
1370 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1371 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1372 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1373 2 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1374 benchmark::utils::CheckNEON);
1375 }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1376 static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1377 GEMMEnd2EndBenchmark(state, model,
1378 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1379 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull,
1380 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1381 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1382 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1383 3 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1384 benchmark::utils::CheckNEON);
1385 }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1386 static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1387 GEMMEnd2EndBenchmark(state, model,
1388 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1389 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull,
1390 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1391 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1392 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1393 3 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1394 benchmark::utils::CheckNEON);
1395 }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1396 static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1397 GEMMEnd2EndBenchmark(state, model,
1398 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1399 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull,
1400 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1401 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull,
1402 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1403 4 /* mr */, 8 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1404 benchmark::utils::CheckNEON);
1405 }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1406 static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1407 GEMMEnd2EndBenchmark(state, model,
1408 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1409 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull,
1410 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1411 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull,
1412 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1413 4 /* mr */, 16 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
1414 benchmark::utils::CheckNEON);
1415 }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1416 static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1417 GEMMEnd2EndBenchmark(state, model,
1418 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1419 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup,
1420 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1421 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1422 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1423 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1424 benchmark::utils::CheckNEON);
1425 }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1426 static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1427 GEMMEnd2EndBenchmark(state, model,
1428 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1429 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup,
1430 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1431 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1432 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1433 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1434 benchmark::utils::CheckNEON);
1435 }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1436 static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1437 GEMMEnd2EndBenchmark(state, model,
1438 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1439 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup,
1440 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1441 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1442 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1443 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1444 benchmark::utils::CheckNEON);
1445 }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1446 static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1447 GEMMEnd2EndBenchmark(state, model,
1448 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1449 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup,
1450 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1451 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1452 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1453 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1454 benchmark::utils::CheckNEON);
1455 }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1456 static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1457 GEMMEnd2EndBenchmark(state, model,
1458 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1459 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup,
1460 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1461 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup,
1462 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1463 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1464 benchmark::utils::CheckNEON);
1465 }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,models::ExecutionPlanFactory model)1466 static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, models::ExecutionPlanFactory model) {
1467 GEMMEnd2EndBenchmark(state, model,
1468 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1469 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup,
1470 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1471 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup,
1472 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1473 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1474 benchmark::utils::CheckNEON);
1475 }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1476 static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1477 GEMMEnd2EndBenchmark(state, model,
1478 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1479 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r,
1480 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1481 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1482 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1483 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1484 benchmark::utils::CheckNEON);
1485 }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1486 static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1487 GEMMEnd2EndBenchmark(state, model,
1488 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1489 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r,
1490 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1491 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1492 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1493 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1494 benchmark::utils::CheckNEON);
1495 }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1496 static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1497 GEMMEnd2EndBenchmark(state, model,
1498 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1499 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r,
1500 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1501 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1502 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1503 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1504 benchmark::utils::CheckNEON);
1505 }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1506 static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1507 GEMMEnd2EndBenchmark(state, model,
1508 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1509 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r,
1510 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1511 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1512 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1513 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1514 benchmark::utils::CheckNEON);
1515 }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1516 static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1517 GEMMEnd2EndBenchmark(state, model,
1518 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1519 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r,
1520 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1521 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r,
1522 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1523 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1524 benchmark::utils::CheckNEON);
1525 }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,models::ExecutionPlanFactory model)1526 static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, models::ExecutionPlanFactory model) {
1527 GEMMEnd2EndBenchmark(state, model,
1528 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1529 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r,
1530 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1531 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r,
1532 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1533 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1534 benchmark::utils::CheckNEON);
1535 }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1536 static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1537 GEMMEnd2EndBenchmark(state, model,
1538 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1539 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r,
1540 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1541 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1542 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1543 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1544 benchmark::utils::CheckNEON);
1545 }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1546 static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1547 GEMMEnd2EndBenchmark(state, model,
1548 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1549 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r,
1550 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1551 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1552 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1553 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1554 benchmark::utils::CheckNEON);
1555 }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1556 static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1557 GEMMEnd2EndBenchmark(state, model,
1558 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1559 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r,
1560 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1561 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1562 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1563 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1564 benchmark::utils::CheckNEON);
1565 }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1566 static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1567 GEMMEnd2EndBenchmark(state, model,
1568 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1569 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r,
1570 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1571 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1572 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1573 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1574 benchmark::utils::CheckNEON);
1575 }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1576 static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1577 GEMMEnd2EndBenchmark(state, model,
1578 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1579 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r,
1580 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1581 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r,
1582 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1583 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1584 benchmark::utils::CheckNEON);
1585 }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,models::ExecutionPlanFactory model)1586 static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, models::ExecutionPlanFactory model) {
1587 GEMMEnd2EndBenchmark(state, model,
1588 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1589 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r,
1590 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1591 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r,
1592 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1593 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
1594 benchmark::utils::CheckNEON);
1595 }
qs8_gemm_2x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1596 static void qs8_gemm_2x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1597 GEMMEnd2EndBenchmark(state, model,
1598 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1599 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c4s2__neon_mull,
1600 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1601 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1602 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1603 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1604 benchmark::utils::CheckNEON);
1605 }
qs8_gemm_2x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1606 static void qs8_gemm_2x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1607 GEMMEnd2EndBenchmark(state, model,
1608 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1609 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4s2__neon_mull,
1610 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1611 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1612 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1613 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1614 benchmark::utils::CheckNEON);
1615 }
qs8_gemm_3x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1616 static void qs8_gemm_3x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1617 GEMMEnd2EndBenchmark(state, model,
1618 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1619 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mull,
1620 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1621 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1622 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1623 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1624 benchmark::utils::CheckNEON);
1625 }
qs8_gemm_3x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1626 static void qs8_gemm_3x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1627 GEMMEnd2EndBenchmark(state, model,
1628 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1629 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mull,
1630 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1631 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1632 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1633 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1634 benchmark::utils::CheckNEON);
1635 }
qs8_gemm_4x8c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1636 static void qs8_gemm_4x8c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1637 GEMMEnd2EndBenchmark(state, model,
1638 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1639 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull,
1640 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1641 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4s2__neon_mull,
1642 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1643 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1644 benchmark::utils::CheckNEON);
1645 }
qs8_gemm_4x16c4s2__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1646 static void qs8_gemm_4x16c4s2__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1647 GEMMEnd2EndBenchmark(state, model,
1648 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1649 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull,
1650 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1651 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4s2__neon_mull,
1652 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1653 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 1 /* log2_sr */,
1654 benchmark::utils::CheckNEON);
1655 }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1656 static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1657 GEMMEnd2EndBenchmark(state, model,
1658 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1659 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mull,
1660 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1661 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1662 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1663 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1664 benchmark::utils::CheckNEON);
1665 }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1666 static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1667 GEMMEnd2EndBenchmark(state, model,
1668 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1669 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull,
1670 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1671 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1672 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1673 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1674 benchmark::utils::CheckNEON);
1675 }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1676 static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1677 GEMMEnd2EndBenchmark(state, model,
1678 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1679 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mull,
1680 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1681 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1682 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1683 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1684 benchmark::utils::CheckNEON);
1685 }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1686 static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1687 GEMMEnd2EndBenchmark(state, model,
1688 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1689 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull,
1690 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1691 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1692 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1693 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1694 benchmark::utils::CheckNEON);
1695 }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1696 static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1697 GEMMEnd2EndBenchmark(state, model,
1698 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1699 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mull,
1700 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1701 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mull,
1702 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1703 4 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1704 benchmark::utils::CheckNEON);
1705 }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,models::ExecutionPlanFactory model)1706 static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, models::ExecutionPlanFactory model) {
1707 GEMMEnd2EndBenchmark(state, model,
1708 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1709 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull,
1710 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1711 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull,
1712 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1713 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1714 benchmark::utils::CheckNEON);
1715 }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1716 static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1717 GEMMEnd2EndBenchmark(state, model,
1718 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1719 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c16__neon_mlal,
1720 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1721 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1722 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1723 2 /* mr */, 8 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1724 benchmark::utils::CheckNEON);
1725 }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1726 static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1727 GEMMEnd2EndBenchmark(state, model,
1728 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1729 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal,
1730 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1731 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1732 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1733 2 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1734 benchmark::utils::CheckNEON);
1735 }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1736 static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1737 GEMMEnd2EndBenchmark(state, model,
1738 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1739 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c16__neon_mlal,
1740 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1741 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1742 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1743 4 /* mr */, 8 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1744 benchmark::utils::CheckNEON);
1745 }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1746 static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1747 GEMMEnd2EndBenchmark(state, model,
1748 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1749 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal,
1750 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1751 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1752 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1753 4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1754 benchmark::utils::CheckNEON);
1755 }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1756 static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1757 GEMMEnd2EndBenchmark(state, model,
1758 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1759 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c16__neon_mlal,
1760 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1761 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c16__neon_mlal,
1762 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1763 4 /* mr */, 8 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1764 benchmark::utils::CheckNEON);
1765 }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1766 static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1767 GEMMEnd2EndBenchmark(state, model,
1768 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1769 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal,
1770 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1771 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal,
1772 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1773 4 /* mr */, 16 /* nr */, 4 /* log2_kr */, 0 /* log2_sr */,
1774 benchmark::utils::CheckNEON);
1775 }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1776 static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1777 GEMMEnd2EndBenchmark(state, model,
1778 xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1779 xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__neon_mlal,
1780 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1781 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1782 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1783 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1784 benchmark::utils::CheckNEON);
1785 }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1786 static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1787 GEMMEnd2EndBenchmark(state, model,
1788 xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1789 xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal,
1790 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1791 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1792 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1793 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1794 benchmark::utils::CheckNEON);
1795 }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1796 static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1797 GEMMEnd2EndBenchmark(state, model,
1798 xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1799 xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal,
1800 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1801 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1802 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1803 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1804 benchmark::utils::CheckNEON);
1805 }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1806 static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1807 GEMMEnd2EndBenchmark(state, model,
1808 xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1809 xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal,
1810 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1811 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1812 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1813 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1814 benchmark::utils::CheckNEON);
1815 }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1816 static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1817 GEMMEnd2EndBenchmark(state, model,
1818 xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1819 xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal,
1820 xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1821 xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__neon_mlal,
1822 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1823 4 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1824 benchmark::utils::CheckNEON);
1825 }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,models::ExecutionPlanFactory model)1826 static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
1827 GEMMEnd2EndBenchmark(state, model,
1828 xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1829 xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal,
1830 xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1831 xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal,
1832 xnn_init_qs8_conv_minmax_rndnu_neon_params,
1833 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
1834 benchmark::utils::CheckNEON);
1835 }
1836
// Registration list for the ARM/ARM64 NEON QS8 GEMM wrappers. Each line hands
// one wrapper function to the BENCHMARK_QS8_END2END macro, which is defined
// outside this part of the file. Several of the wrappers named below are
// defined earlier in the file. Groups are ordered by kernel family.

// c8, `neon_mlal`
1837 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mlal);
1838 BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mlal);
1839 BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mlal);
1840 BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mlal);
1841 BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mlal);
1842 BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mlal);
1843
// c8, `neon_mull`
1844 BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__neon_mull);
1845 BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__neon_mull);
1846 BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__neon_mull);
1847 BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__neon_mull);
1848 BENCHMARK_QS8_END2END(qs8_gemm_4x8c8__neon_mull);
1849 BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__neon_mull);
1850
// c16, `neon_mlal`
1851 BENCHMARK_QS8_END2END(qs8_gemm_2x8c16__neon_mlal);
1852 BENCHMARK_QS8_END2END(qs8_gemm_2x16c16__neon_mlal);
1853 BENCHMARK_QS8_END2END(qs8_gemm_3x8c16__neon_mlal);
1854 BENCHMARK_QS8_END2END(qs8_gemm_3x16c16__neon_mlal);
1855 BENCHMARK_QS8_END2END(qs8_gemm_4x8c16__neon_mlal);
1856 BENCHMARK_QS8_END2END(qs8_gemm_4x16c16__neon_mlal);
1857
// c4, `neon_mlal_dup`
1858 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_dup);
1859 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_dup);
1860 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_dup);
1861 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_dup);
1862 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_dup);
1863 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_dup);
1864
// c4, `neon_mull_dup`
1865 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_dup);
1866 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_dup);
1867 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_dup);
1868 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_dup);
1869 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_dup);
1870 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_dup);
1871
// c4, `neon_mlal_ld1r`
1872 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld1r);
1873 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld1r);
1874 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld1r);
1875 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld1r);
1876 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld1r);
1877 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld1r);
1878
// c4, `neon_mull_ld1r`
1879 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld1r);
1880 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld1r);
1881 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld1r);
1882 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld1r);
1883 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld1r);
1884 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld1r);
1885
// c4, `neon_mlal_ld2r`
1886 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mlal_ld2r);
1887 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mlal_ld2r);
1888 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mlal_ld2r);
1889 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mlal_ld2r);
1890 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mlal_ld2r);
1891 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mlal_ld2r);
1892
// c4, `neon_mull_ld2r`
1893 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4__neon_mull_ld2r);
1894 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4__neon_mull_ld2r);
1895 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4__neon_mull_ld2r);
1896 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4__neon_mull_ld2r);
1897 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4__neon_mull_ld2r);
1898 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__neon_mull_ld2r);
1899
// c4s2, `neon_mlal`
1900 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mlal);
1901 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mlal);
1902 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mlal);
1903 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mlal);
1904 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mlal);
1905 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mlal);
1906
// c4s2, `neon_mull`
1907 BENCHMARK_QS8_END2END(qs8_gemm_2x8c4s2__neon_mull);
1908 BENCHMARK_QS8_END2END(qs8_gemm_2x16c4s2__neon_mull);
1909 BENCHMARK_QS8_END2END(qs8_gemm_3x8c4s2__neon_mull);
1910 BENCHMARK_QS8_END2END(qs8_gemm_3x16c4s2__neon_mull);
1911 BENCHMARK_QS8_END2END(qs8_gemm_4x8c4s2__neon_mull);
1912 BENCHMARK_QS8_END2END(qs8_gemm_4x16c4s2__neon_mull);
1913
// c2, `neon_mlal_dup`
1914 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_dup);
1915 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_dup);
1916 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_dup);
1917 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_dup);
1918 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_dup);
1919 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_dup);
1920
// c2, `neon_mull_dup`
1921 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_dup);
1922 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_dup);
1923 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_dup);
1924 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_dup);
1925 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_dup);
1926 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_dup);
1927
// c2, `neon_mlal_ld1r`
1928 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld1r);
1929 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld1r);
1930 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld1r);
1931 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld1r);
1932 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld1r);
1933 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld1r);
1934
// c2, `neon_mull_ld1r`
1935 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld1r);
1936 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld1r);
1937 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld1r);
1938 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld1r);
1939 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld1r);
1940 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld1r);
1941
// c2, `neon_mlal_ld2r`
1942 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld2r);
1943 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld2r);
1944 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld2r);
1945 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld2r);
1946 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld2r);
1947 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld2r);
1948
// c2, `neon_mull_ld2r`
1949 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld2r);
1950 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld2r);
1951 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld2r);
1952 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld2r);
1953 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld2r);
1954 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld2r);
1955
// c2, `neon_mlal_ld4r`
1956 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mlal_ld4r);
1957 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mlal_ld4r);
1958 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mlal_ld4r);
1959 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mlal_ld4r);
1960 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mlal_ld4r);
1961 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mlal_ld4r);
1962
// c2, `neon_mull_ld4r`
1963 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2__neon_mull_ld4r);
1964 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2__neon_mull_ld4r);
1965 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2__neon_mull_ld4r);
1966 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2__neon_mull_ld4r);
1967 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2__neon_mull_ld4r);
1968 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2__neon_mull_ld4r);
1969
// c2s4, `neon_mlal`
1970 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mlal);
1971 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mlal);
1972 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mlal);
1973 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mlal);
1974 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mlal);
1975 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mlal);
1976
// c2s4, `neon_mull`
1977 BENCHMARK_QS8_END2END(qs8_gemm_2x8c2s4__neon_mull);
1978 BENCHMARK_QS8_END2END(qs8_gemm_2x16c2s4__neon_mull);
1979 BENCHMARK_QS8_END2END(qs8_gemm_3x8c2s4__neon_mull);
1980 BENCHMARK_QS8_END2END(qs8_gemm_3x16c2s4__neon_mull);
1981 BENCHMARK_QS8_END2END(qs8_gemm_4x8c2s4__neon_mull);
1982 BENCHMARK_QS8_END2END(qs8_gemm_4x16c2s4__neon_mull);
1983
// `neon_mlal_lane`
1984 BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane);
1985 BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane);
1986 BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane);
1987 BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane);
1988 BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane);
1989 BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane);
1990 BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane);
1991 BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane);
1992
// `neon_mlal_lane_prfm`
1993 BENCHMARK_QS8_END2END(qs8_gemm_2x8__neon_mlal_lane_prfm);
1994 BENCHMARK_QS8_END2END(qs8_gemm_2x16__neon_mlal_lane_prfm);
1995 BENCHMARK_QS8_END2END(qs8_gemm_3x8__neon_mlal_lane_prfm);
1996 BENCHMARK_QS8_END2END(qs8_gemm_3x16__neon_mlal_lane_prfm);
1997 BENCHMARK_QS8_END2END(qs8_gemm_4x8__neon_mlal_lane_prfm);
1998 BENCHMARK_QS8_END2END(qs8_gemm_4x16__neon_mlal_lane_prfm);
1999 BENCHMARK_QS8_END2END(qs8_gemm_6x8__neon_mlal_lane_prfm);
2000 BENCHMARK_QS8_END2END(qs8_gemm_6x16__neon_mlal_lane_prfm);
2001 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2002
2003
2004 #if XNN_ARCH_ARM
qs8_gemm_1x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2005 static void qs8_gemm_1x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2006 GEMMEnd2EndBenchmark(state, model,
2007 xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2008 xnn_qs8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2009 xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2010 xnn_qs8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2011 xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2012 1 /* mr */, 1 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2013 benchmark::utils::CheckARMV6);
2014 }
qs8_gemm_2x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2015 static void qs8_gemm_2x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2016 GEMMEnd2EndBenchmark(state, model,
2017 xnn_qs8_gemm_minmax_fp32_ukernel_2x1c4__armsimd32,
2018 xnn_qs8_igemm_minmax_fp32_ukernel_2x1c4__armsimd32,
2019 xnn_qs8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2020 xnn_qs8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
2021 xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2022 2 /* mr */, 1 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2023 benchmark::utils::CheckARMV6);
2024 }
qs8_gemm_1x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2025 static void qs8_gemm_1x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2026 GEMMEnd2EndBenchmark(state, model,
2027 xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2028 xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2029 xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2030 xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2031 xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2032 1 /* mr */, 2 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2033 benchmark::utils::CheckARMV6);
2034 }
qs8_gemm_2x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)2035 static void qs8_gemm_2x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
2036 GEMMEnd2EndBenchmark(state, model,
2037 xnn_qs8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32,
2038 xnn_qs8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32,
2039 xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2040 xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
2041 xnn_init_qs8_conv_minmax_fp32_armsimd32_params,
2042 2 /* mr */, 2 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
2043 benchmark::utils::CheckARMV6);
2044 }
2045
// Registration list for the 32-bit ARM `armsimd32` QS8 GEMM wrappers. Each
// line hands one wrapper to the BENCHMARK_QS8_END2END macro, which is defined
// outside this part of the file.
2046 BENCHMARK_QS8_END2END(qs8_gemm_1x1c4__armsimd32);
2047 BENCHMARK_QS8_END2END(qs8_gemm_2x1c4__armsimd32);
2048 BENCHMARK_QS8_END2END(qs8_gemm_1x2c4__armsimd32);
2049 BENCHMARK_QS8_END2END(qs8_gemm_2x2c4__armsimd32);
2050 #endif // XNN_ARCH_ARM
2051
2052 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2053 static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2054 GEMMEnd2EndBenchmark(state, model,
2055 xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
2056 xnn_qs8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
2057 xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2058 xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2059 xnn_init_qs8_conv_minmax_fp32_avx512_params,
2060 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2061 benchmark::utils::CheckAVX512F);
2062 }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2063 static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2064 GEMMEnd2EndBenchmark(state, model,
2065 xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
2066 xnn_qs8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
2067 xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2068 xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2069 xnn_init_qs8_conv_minmax_fp32_avx512_params,
2070 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2071 benchmark::utils::CheckAVX512F);
2072 }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)2073 static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
2074 GEMMEnd2EndBenchmark(state, model,
2075 xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2076 xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
2077 xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2078 xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
2079 xnn_init_qs8_conv_minmax_fp32_avx512_params,
2080 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2081 benchmark::utils::CheckAVX512F);
2082 }
qs8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2083 static void qs8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2084 GEMMEnd2EndBenchmark(state, model,
2085 xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
2086 xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
2087 xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2088 xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2089 xnn_init_qs8_conv_minmax_fp32_avx2_params,
2090 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2091 benchmark::utils::CheckAVX2);
2092 }
qs8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)2093 static void qs8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
2094 GEMMEnd2EndBenchmark(state, model,
2095 xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
2096 xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
2097 xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
2098 xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
2099 xnn_init_qs8_conv_minmax_fp32_avx2_params,
2100 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2101 benchmark::utils::CheckAVX2);
2102 }
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2103 static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2104 GEMMEnd2EndBenchmark(state, model,
2105 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2106 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
2107 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2108 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2109 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2110 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2111 benchmark::utils::CheckXOP);
2112 }
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2113 static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2114 GEMMEnd2EndBenchmark(state, model,
2115 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2116 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
2117 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2118 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2119 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2120 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2121 benchmark::utils::CheckXOP);
2122 }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2123 static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2124 GEMMEnd2EndBenchmark(state, model,
2125 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2126 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
2127 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2128 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2129 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2130 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2131 benchmark::utils::CheckXOP);
2132 }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2133 static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2134 GEMMEnd2EndBenchmark(state, model,
2135 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2136 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
2137 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2138 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2139 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2140 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2141 benchmark::utils::CheckXOP);
2142 }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2143 static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2144 GEMMEnd2EndBenchmark(state, model,
2145 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2146 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
2147 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2148 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
2149 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2150 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2151 benchmark::utils::CheckXOP);
2152 }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2153 static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2154 GEMMEnd2EndBenchmark(state, model,
2155 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2156 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
2157 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2158 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
2159 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2160 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2161 benchmark::utils::CheckXOP);
2162 }
qs8_gemm_2x4c2s4__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2163 static void qs8_gemm_2x4c2s4__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2164 GEMMEnd2EndBenchmark(state, model,
2165 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
2166 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
2167 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2168 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2169 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2170 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2171 benchmark::utils::CheckXOP);
2172 }
qs8_gemm_2x4c2s4__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2173 static void qs8_gemm_2x4c2s4__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2174 GEMMEnd2EndBenchmark(state, model,
2175 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld128,
2176 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__xop_ld128,
2177 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2178 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2179 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2180 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2181 benchmark::utils::CheckXOP);
2182 }
qs8_gemm_3x4c2s4__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2183 static void qs8_gemm_3x4c2s4__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2184 GEMMEnd2EndBenchmark(state, model,
2185 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld64,
2186 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__xop_ld64,
2187 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2188 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2189 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2190 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2191 benchmark::utils::CheckXOP);
2192 }
qs8_gemm_3x4c2s4__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2193 static void qs8_gemm_3x4c2s4__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2194 GEMMEnd2EndBenchmark(state, model,
2195 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
2196 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
2197 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2198 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2199 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2200 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2201 benchmark::utils::CheckXOP);
2202 }
qs8_gemm_4x4c2s4__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2203 static void qs8_gemm_4x4c2s4__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2204 GEMMEnd2EndBenchmark(state, model,
2205 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
2206 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
2207 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2208 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
2209 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2210 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2211 benchmark::utils::CheckXOP);
2212 }
qs8_gemm_4x4c2s4__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2213 static void qs8_gemm_4x4c2s4__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2214 GEMMEnd2EndBenchmark(state, model,
2215 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128,
2216 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128,
2217 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2218 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__xop_ld128,
2219 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2220 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2221 benchmark::utils::CheckXOP);
2222 }
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2223 static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2224 GEMMEnd2EndBenchmark(state, model,
2225 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2226 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
2227 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2228 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2229 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2230 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2231 benchmark::utils::CheckXOP);
2232 }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2233 static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2234 GEMMEnd2EndBenchmark(state, model,
2235 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2236 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
2237 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2238 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
2239 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2240 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2241 benchmark::utils::CheckXOP);
2242 }
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2243 static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2244 GEMMEnd2EndBenchmark(state, model,
2245 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2246 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
2247 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2248 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2249 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2250 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2251 benchmark::utils::CheckXOP);
2252 }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2253 static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2254 GEMMEnd2EndBenchmark(state, model,
2255 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2256 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
2257 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2258 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
2259 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2260 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2261 benchmark::utils::CheckXOP);
2262 }
2263
2264
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2265 static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2266 GEMMEnd2EndBenchmark(state, model,
2267 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2268 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
2269 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2270 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2271 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2272 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2273 benchmark::utils::CheckAVX);
2274 }
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2275 static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2276 GEMMEnd2EndBenchmark(state, model,
2277 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2278 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
2279 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2280 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2281 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2282 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2283 benchmark::utils::CheckAVX);
2284 }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2285 static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2286 GEMMEnd2EndBenchmark(state, model,
2287 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2288 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
2289 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2290 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2291 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2292 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2293 benchmark::utils::CheckAVX);
2294 }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2295 static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2296 GEMMEnd2EndBenchmark(state, model,
2297 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2298 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
2299 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2300 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2301 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2302 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2303 benchmark::utils::CheckAVX);
2304 }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2305 static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2306 GEMMEnd2EndBenchmark(state, model,
2307 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2308 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
2309 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2310 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
2311 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2312 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2313 benchmark::utils::CheckAVX);
2314 }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2315 static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2316 GEMMEnd2EndBenchmark(state, model,
2317 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2318 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
2319 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2320 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
2321 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2322 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2323 benchmark::utils::CheckAVX);
2324 }
qs8_gemm_2x4c2s4__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2325 static void qs8_gemm_2x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2326 GEMMEnd2EndBenchmark(state, model,
2327 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
2328 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
2329 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2330 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2331 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2332 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2333 benchmark::utils::CheckAVX);
2334 }
qs8_gemm_2x4c2s4__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2335 static void qs8_gemm_2x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2336 GEMMEnd2EndBenchmark(state, model,
2337 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
2338 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
2339 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2340 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2341 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2342 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2343 benchmark::utils::CheckAVX);
2344 }
qs8_gemm_3x4c2s4__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2345 static void qs8_gemm_3x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2346 GEMMEnd2EndBenchmark(state, model,
2347 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64,
2348 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__avx_ld64,
2349 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2350 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2351 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2352 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2353 benchmark::utils::CheckAVX);
2354 }
qs8_gemm_3x4c2s4__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2355 static void qs8_gemm_3x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2356 GEMMEnd2EndBenchmark(state, model,
2357 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128,
2358 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__avx_ld128,
2359 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2360 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2361 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2362 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2363 benchmark::utils::CheckAVX);
2364 }
qs8_gemm_4x4c2s4__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2365 static void qs8_gemm_4x4c2s4__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2366 GEMMEnd2EndBenchmark(state, model,
2367 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
2368 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
2369 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2370 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
2371 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2372 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2373 benchmark::utils::CheckAVX);
2374 }
qs8_gemm_4x4c2s4__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2375 static void qs8_gemm_4x4c2s4__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2376 GEMMEnd2EndBenchmark(state, model,
2377 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128,
2378 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__avx_ld128,
2379 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2380 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
2381 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2382 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2383 benchmark::utils::CheckAVX);
2384 }
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2385 static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2386 GEMMEnd2EndBenchmark(state, model,
2387 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2388 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
2389 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2390 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2391 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2392 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2393 benchmark::utils::CheckAVX);
2394 }
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2395 static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2396 GEMMEnd2EndBenchmark(state, model,
2397 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2398 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
2399 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2400 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2401 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2402 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2403 benchmark::utils::CheckAVX);
2404 }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2405 static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2406 GEMMEnd2EndBenchmark(state, model,
2407 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2408 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
2409 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2410 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
2411 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2412 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2413 benchmark::utils::CheckAVX);
2414 }
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2415 static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2416 GEMMEnd2EndBenchmark(state, model,
2417 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2418 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
2419 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2420 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
2421 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2422 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2423 benchmark::utils::CheckAVX);
2424 }
2425
2426
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2427 static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2428 GEMMEnd2EndBenchmark(state, model,
2429 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2430 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
2431 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2432 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2433 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2434 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2435 benchmark::utils::CheckSSE41);
2436 }
qs8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2437 static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2438 GEMMEnd2EndBenchmark(state, model,
2439 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2440 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
2441 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2442 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2443 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2444 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2445 benchmark::utils::CheckSSE41);
2446 }
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2447 static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2448 GEMMEnd2EndBenchmark(state, model,
2449 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2450 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
2451 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2452 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2453 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2454 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2455 benchmark::utils::CheckSSE41);
2456 }
qs8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2457 static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2458 GEMMEnd2EndBenchmark(state, model,
2459 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2460 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
2461 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2462 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2463 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2464 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2465 benchmark::utils::CheckSSE41);
2466 }
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2467 static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2468 GEMMEnd2EndBenchmark(state, model,
2469 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2470 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
2471 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2472 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
2473 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2474 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2475 benchmark::utils::CheckSSE41);
2476 }
qs8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2477 static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2478 GEMMEnd2EndBenchmark(state, model,
2479 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2480 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
2481 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2482 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
2483 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2484 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
2485 benchmark::utils::CheckSSE41);
2486 }
qs8_gemm_2x4c2s4__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2487 static void qs8_gemm_2x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2488 GEMMEnd2EndBenchmark(state, model,
2489 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld64,
2490 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld64,
2491 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2492 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2493 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2494 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2495 benchmark::utils::CheckSSE41);
2496 }
qs8_gemm_2x4c2s4__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2497 static void qs8_gemm_2x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2498 GEMMEnd2EndBenchmark(state, model,
2499 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
2500 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
2501 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2502 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2503 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2504 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2505 benchmark::utils::CheckSSE41);
2506 }
qs8_gemm_3x4c2s4__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2507 static void qs8_gemm_3x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2508 GEMMEnd2EndBenchmark(state, model,
2509 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld64,
2510 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld64,
2511 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2512 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2513 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2514 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2515 benchmark::utils::CheckSSE41);
2516 }
qs8_gemm_3x4c2s4__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2517 static void qs8_gemm_3x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2518 GEMMEnd2EndBenchmark(state, model,
2519 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
2520 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
2521 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2522 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2523 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2524 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2525 benchmark::utils::CheckSSE41);
2526 }
qs8_gemm_4x4c2s4__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2527 static void qs8_gemm_4x4c2s4__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2528 GEMMEnd2EndBenchmark(state, model,
2529 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
2530 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
2531 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2532 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
2533 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2534 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2535 benchmark::utils::CheckSSE41);
2536 }
qs8_gemm_4x4c2s4__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2537 static void qs8_gemm_4x4c2s4__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2538 GEMMEnd2EndBenchmark(state, model,
2539 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld128,
2540 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld128,
2541 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2542 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128,
2543 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2544 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */,
2545 benchmark::utils::CheckSSE41);
2546 }
qs8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2547 static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2548 GEMMEnd2EndBenchmark(state, model,
2549 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2550 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
2551 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2552 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2553 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2554 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2555 benchmark::utils::CheckSSE41);
2556 }
qs8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2557 static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2558 GEMMEnd2EndBenchmark(state, model,
2559 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2560 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
2561 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2562 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2563 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2564 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2565 benchmark::utils::CheckSSE41);
2566 }
qs8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2567 static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2568 GEMMEnd2EndBenchmark(state, model,
2569 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2570 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
2571 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2572 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
2573 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2574 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2575 benchmark::utils::CheckSSE41);
2576 }
qs8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2577 static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2578 GEMMEnd2EndBenchmark(state, model,
2579 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2580 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
2581 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2582 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
2583 xnn_init_qs8_conv_minmax_fp32_sse4_params,
2584 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2585 benchmark::utils::CheckSSE41);
2586 }
2587
2588
qs8_gemm_2x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2589 static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2590 GEMMEnd2EndBenchmark(state, model,
2591 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2592 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64,
2593 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2594 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2595 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2596 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2597 benchmark::utils::CheckSSSE3);
2598 }
qs8_gemm_2x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2599 static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2600 GEMMEnd2EndBenchmark(state, model,
2601 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2602 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128,
2603 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2604 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2605 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2606 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2607 benchmark::utils::CheckSSSE3);
2608 }
qs8_gemm_3x4c8__ssse3_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2609 static void qs8_gemm_3x4c8__ssse3_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2610 GEMMEnd2EndBenchmark(state, model,
2611 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2612 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64,
2613 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2614 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld64,
2615 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2616 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2617 benchmark::utils::CheckSSSE3);
2618 }
qs8_gemm_3x4c8__ssse3_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2619 static void qs8_gemm_3x4c8__ssse3_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2620 GEMMEnd2EndBenchmark(state, model,
2621 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2622 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128,
2623 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2624 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__ssse3_ld128,
2625 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2626 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
2627 benchmark::utils::CheckSSSE3);
2628 }
2629
2630
qs8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2631 static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2632 GEMMEnd2EndBenchmark(state, model,
2633 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2634 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
2635 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2636 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2637 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2638 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2639 }
qs8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2640 static void qs8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2641 GEMMEnd2EndBenchmark(state, model,
2642 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2643 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
2644 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2645 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2646 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2647 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2648 }
qs8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2649 static void qs8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2650 GEMMEnd2EndBenchmark(state, model,
2651 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2652 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
2653 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2654 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2655 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2656 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2657 }
qs8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2658 static void qs8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2659 GEMMEnd2EndBenchmark(state, model,
2660 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2661 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
2662 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2663 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2664 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2665 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2666 }
qs8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2667 static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2668 GEMMEnd2EndBenchmark(state, model,
2669 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2670 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
2671 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2672 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
2673 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2674 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2675 }
qs8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2676 static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2677 GEMMEnd2EndBenchmark(state, model,
2678 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2679 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
2680 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2681 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
2682 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2683 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
2684 }
qs8_gemm_2x4c2s4__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2685 static void qs8_gemm_2x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2686 GEMMEnd2EndBenchmark(state, model,
2687 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64,
2688 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64,
2689 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2690 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2691 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2692 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2693 }
qs8_gemm_2x4c2s4__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2694 static void qs8_gemm_2x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2695 GEMMEnd2EndBenchmark(state, model,
2696 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
2697 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
2698 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2699 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2700 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2701 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2702 }
qs8_gemm_3x4c2s4__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2703 static void qs8_gemm_3x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2704 GEMMEnd2EndBenchmark(state, model,
2705 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld64,
2706 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld64,
2707 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2708 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2709 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2710 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2711 }
qs8_gemm_3x4c2s4__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2712 static void qs8_gemm_3x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2713 GEMMEnd2EndBenchmark(state, model,
2714 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
2715 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
2716 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2717 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2718 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2719 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2720 }
qs8_gemm_4x4c2s4__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2721 static void qs8_gemm_4x4c2s4__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2722 GEMMEnd2EndBenchmark(state, model,
2723 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld64,
2724 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld64,
2725 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2726 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
2727 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2728 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2729 }
qs8_gemm_4x4c2s4__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2730 static void qs8_gemm_4x4c2s4__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2731 GEMMEnd2EndBenchmark(state, model,
2732 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld128,
2733 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__sse2_ld128,
2734 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2735 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld128,
2736 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2737 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2738 }
qs8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2739 static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2740 GEMMEnd2EndBenchmark(state, model,
2741 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2742 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
2743 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2744 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2745 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2746 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2747 }
qs8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2748 static void qs8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2749 GEMMEnd2EndBenchmark(state, model,
2750 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2751 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
2752 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2753 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2754 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2755 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2756 }
qs8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2757 static void qs8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2758 GEMMEnd2EndBenchmark(state, model,
2759 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2760 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
2761 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2762 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
2763 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2764 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2765 }
qs8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2766 static void qs8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2767 GEMMEnd2EndBenchmark(state, model,
2768 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2769 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
2770 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2771 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
2772 xnn_init_qs8_conv_minmax_fp32_sse2_params,
2773 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
2774 }
2775
2776
// Benchmark registrations for the x86 QS8 GEMM end-to-end wrappers, grouped by
// instruction set and kernel layout. BENCHMARK_QS8_END2END is defined outside
// this section; presumably it registers the named wrapper against the QS8
// model set -- confirm against the macro definition.

// avx512skx, c8 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x16c8__avx512skx);
BENCHMARK_QS8_END2END(qs8_gemm_3x16c8__avx512skx);
BENCHMARK_QS8_END2END(qs8_gemm_4x16c8__avx512skx);

// avx2, c8 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__avx2);
BENCHMARK_QS8_END2END(qs8_gemm_3x8c8__avx2);

// xop, c2 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__xop_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__xop_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__xop_ld128);

// xop, c2s4 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__xop_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__xop_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__xop_ld128);

// xop, c8 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__xop_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__xop_ld128);

// avx, c2 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__avx_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__avx_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__avx_ld128);

// avx, c2s4 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__avx_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__avx_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__avx_ld128);

// avx, c8 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__avx_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__avx_ld128);

// sse41, c2 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse41_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse41_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse41_ld128);

// sse41, c2s4 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse41_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse41_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse41_ld128);

// sse41, c8 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse41_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse41_ld128);

// ssse3, c8 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__ssse3_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__ssse3_ld128);

// sse2, c2 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2__sse2_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2__sse2_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2__sse2_ld128);

// sse2, c2s4 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c2s4__sse2_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c2s4__sse2_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_4x4c2s4__sse2_ld128);

// sse2, c8 kernels
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_2x4c8__sse2_ld128);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld64);
BENCHMARK_QS8_END2END(qs8_gemm_3x4c8__sse2_ld128);
2864 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2865
2866 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2867 static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2868 GEMMEnd2EndBenchmark(state, model,
2869 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2870 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
2871 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2872 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2873 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2874 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2875 }
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2876 static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2877 GEMMEnd2EndBenchmark(state, model,
2878 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2879 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
2880 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2881 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2882 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2883 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2884 }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2885 static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2886 GEMMEnd2EndBenchmark(state, model,
2887 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2888 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
2889 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2890 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2891 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2892 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2893 }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2894 static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2895 GEMMEnd2EndBenchmark(state, model,
2896 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2897 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
2898 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2899 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2900 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2901 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2902 }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2903 static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2904 GEMMEnd2EndBenchmark(state, model,
2905 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2906 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
2907 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2908 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
2909 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2910 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2911 }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2912 static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2913 GEMMEnd2EndBenchmark(state, model,
2914 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2915 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
2916 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2917 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
2918 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2919 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
2920 }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2921 static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2922 GEMMEnd2EndBenchmark(state, model,
2923 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2924 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
2925 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2926 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2927 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2928 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2929 }
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2930 static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2931 GEMMEnd2EndBenchmark(state, model,
2932 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2933 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
2934 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2935 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2936 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2937 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2938 }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2939 static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2940 GEMMEnd2EndBenchmark(state, model,
2941 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2942 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
2943 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2944 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2945 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2946 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2947 }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2948 static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2949 GEMMEnd2EndBenchmark(state, model,
2950 xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2951 xnn_qs8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
2952 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2953 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2954 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2955 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2956 }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2957 static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2958 GEMMEnd2EndBenchmark(state, model,
2959 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2960 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
2961 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2962 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
2963 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2964 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2965 }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2966 static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2967 GEMMEnd2EndBenchmark(state, model,
2968 xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2969 xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
2970 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2971 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
2972 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2973 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
2974 }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2975 static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
2976 GEMMEnd2EndBenchmark(state, model,
2977 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2978 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
2979 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2980 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
2981 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2982 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2983 }
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)2984 static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
2985 GEMMEnd2EndBenchmark(state, model,
2986 xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2987 xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
2988 xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2989 xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
2990 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
2991 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
2992 }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)2993