1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cmath>
8 #include <cstring>
9 #include <functional>
10 #include <random>
11 #include <vector>
12
13 #include <xnnpack.h>
14
15 #include <benchmark/benchmark.h>
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19
20 #include <xnnpack.h>
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/microfnptr.h>
24 #include <xnnpack/microparams-init.h>
25
26
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qu8_gemm_minmax_ukernel_function gemm,xnn_qu8_igemm_minmax_ukernel_function igemm,xnn_qu8_gemm_minmax_ukernel_function gemm1,xnn_qu8_igemm_minmax_ukernel_function igemm1,xnn_init_qu8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)27 static void GEMMEnd2EndBenchmark(
28 benchmark::State& state,
29 models::ExecutionPlanFactory model_factory,
30 xnn_qu8_gemm_minmax_ukernel_function gemm,
31 xnn_qu8_igemm_minmax_ukernel_function igemm,
32 xnn_qu8_gemm_minmax_ukernel_function gemm1,
33 xnn_qu8_igemm_minmax_ukernel_function igemm1,
34 xnn_init_qu8_conv_minmax_params_fn init_params,
35 uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
36 benchmark::utils::IsaCheckFunction isa_check = nullptr)
37 {
38 if (isa_check && !isa_check(state)) {
39 return;
40 }
41 if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
42 state.SkipWithError("failed to initialize XNNPACK");
43 return;
44 }
45
46 // Override microkernels chosen in xnn_initialize
47 // Note: do not directly assign to xnn_params.qu8.gemm because it breaks older gcc.
48 std::memset(&xnn_params.qu8.gemm, 0, sizeof(xnn_params.qu8.gemm));
49 xnn_params.qu8.gemm.minmax.gemm[mr-1] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
50 xnn_params.qu8.gemm.minmax.igemm[mr-1] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
51 xnn_params.qu8.gemm.minmax.gemm[0] = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
52 xnn_params.qu8.gemm.minmax.igemm[0] = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
53 xnn_params.qu8.gemm.init.qu8 = init_params;
54 xnn_params.qu8.gemm.mr = mr;
55 xnn_params.qu8.gemm.nr = nr;
56 xnn_params.qu8.gemm.log2_kr = log2_kr;
57 xnn_params.qu8.gemm.log2_sr = log2_sr;
58
59 auto execution_plan = model_factory(nullptr);
60 if (execution_plan.empty()) {
61 state.SkipWithError("failed to create a model");
62 return;
63 }
64
65 for (auto _ : state) {
66 for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
67 xnn_status status = xnn_run_operator(op.get(), nullptr);
68 if (status != xnn_status_success) {
69 state.SkipWithError("failed to run a model");
70 return;
71 }
72 }
73 }
74
75 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
76 if (cpu_frequency != 0) {
77 state.counters["cpufreq"] = cpu_frequency;
78 }
79 }
80
81 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)82 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
83 GEMMEnd2EndBenchmark(state, model,
84 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
85 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
86 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
87 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
88 xnn_init_qu8_conv_minmax_rndnu_neon_params,
89 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
90 benchmark::utils::CheckNEON);
91 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)92 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
93 GEMMEnd2EndBenchmark(state, model,
94 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
95 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
96 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
97 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
98 xnn_init_qu8_conv_minmax_rndnu_neon_params,
99 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
100 benchmark::utils::CheckNEON);
101 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)102 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
103 GEMMEnd2EndBenchmark(state, model,
104 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
105 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
106 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
107 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
108 xnn_init_qu8_conv_minmax_rndnu_neon_params,
109 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
110 benchmark::utils::CheckNEON);
111 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)112 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
113 GEMMEnd2EndBenchmark(state, model,
114 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
115 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
116 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
117 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
118 xnn_init_qu8_conv_minmax_rndnu_neon_params,
119 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
120 benchmark::utils::CheckNEON);
121 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)122 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
123 GEMMEnd2EndBenchmark(state, model,
124 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
125 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
126 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
127 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
128 xnn_init_qu8_conv_minmax_rndnu_neon_params,
129 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
130 benchmark::utils::CheckNEON);
131 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)132 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
133 GEMMEnd2EndBenchmark(state, model,
134 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
135 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
136 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
137 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
138 xnn_init_qu8_conv_minmax_rndnu_neon_params,
139 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
140 benchmark::utils::CheckNEON);
141 }
// Register the AArch32 NEON MLAL-lane assembly benchmarks through the
// BENCHMARK_QU8_END2END macro (not defined in the visible part of this file).
// Each prfm variant is registered ahead of its non-prfm counterpart.
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
148 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
149
150
151 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
152 static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
153 GEMMEnd2EndBenchmark(state, model,
154 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
155 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
156 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
157 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
158 xnn_init_qu8_conv_minmax_rndnu_neon_params,
159 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
160 benchmark::utils::CheckNEONDOT);
161 }
qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)162 static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
163 GEMMEnd2EndBenchmark(state, model,
164 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
165 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
166 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
167 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
168 xnn_init_qu8_conv_minmax_rndnu_neon_params,
169 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
170 benchmark::utils::CheckNEONDOT);
171 }
qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)172 static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
173 GEMMEnd2EndBenchmark(state, model,
174 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
175 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
176 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
177 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
178 xnn_init_qu8_conv_minmax_rndnu_neon_params,
179 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
180 benchmark::utils::CheckNEONDOT);
181 }
qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)182 static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
183 GEMMEnd2EndBenchmark(state, model,
184 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
185 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
186 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
187 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
188 xnn_init_qu8_conv_minmax_rndnu_neon_params,
189 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
190 benchmark::utils::CheckNEONDOT);
191 }
192
// Register the AArch64 NEONDOT assembly benchmarks through the
// BENCHMARK_QU8_END2END macro (not defined in the visible part of this file).
BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55);
BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55);
BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_ld128);
197 #endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
198
199
200 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)201 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
202 GEMMEnd2EndBenchmark(state, model,
203 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
204 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
205 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
206 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
207 xnn_init_qu8_conv_minmax_rndnu_neon_params,
208 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
209 benchmark::utils::CheckNEON);
210 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)211 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
212 GEMMEnd2EndBenchmark(state, model,
213 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
214 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
215 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
216 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
217 xnn_init_qu8_conv_minmax_rndnu_neon_params,
218 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
219 benchmark::utils::CheckNEON);
220 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)221 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
222 GEMMEnd2EndBenchmark(state, model,
223 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
224 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
225 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
226 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
227 xnn_init_qu8_conv_minmax_rndnu_neon_params,
228 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
229 benchmark::utils::CheckNEON);
230 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)231 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
232 GEMMEnd2EndBenchmark(state, model,
233 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
234 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
235 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
236 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
237 xnn_init_qu8_conv_minmax_rndnu_neon_params,
238 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
239 benchmark::utils::CheckNEON);
240 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)241 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
242 GEMMEnd2EndBenchmark(state, model,
243 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
244 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
245 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
246 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
247 xnn_init_qu8_conv_minmax_rndnu_neon_params,
248 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
249 benchmark::utils::CheckNEON);
250 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)251 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
252 GEMMEnd2EndBenchmark(state, model,
253 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
254 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
255 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
256 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
257 xnn_init_qu8_conv_minmax_rndnu_neon_params,
258 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
259 benchmark::utils::CheckNEON);
260 }
261
// Register the AArch64 NEON MLAL-lane assembly benchmarks through the
// BENCHMARK_QU8_END2END macro (not defined in the visible part of this file).
BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75);
BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53);
BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64);
268 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
269
270
271 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
qu8_gemm_1x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)272 static void qu8_gemm_1x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
273 GEMMEnd2EndBenchmark(state, model,
274 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
275 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
276 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
277 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
278 xnn_init_qu8_conv_minmax_rndnu_neon_params,
279 1 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
280 benchmark::utils::CheckNEONDOT);
281 }
qu8_gemm_2x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)282 static void qu8_gemm_2x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
283 GEMMEnd2EndBenchmark(state, model,
284 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
285 xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c4__neondot,
286 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
287 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
288 xnn_init_qu8_conv_minmax_rndnu_neon_params,
289 2 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
290 benchmark::utils::CheckNEONDOT);
291 }
qu8_gemm_3x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)292 static void qu8_gemm_3x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
293 GEMMEnd2EndBenchmark(state, model,
294 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
295 xnn_qu8_igemm_minmax_rndnu_ukernel_3x8c4__neondot,
296 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
297 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
298 xnn_init_qu8_conv_minmax_rndnu_neon_params,
299 3 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
300 benchmark::utils::CheckNEONDOT);
301 }
qu8_gemm_4x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)302 static void qu8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
303 GEMMEnd2EndBenchmark(state, model,
304 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
305 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
306 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
307 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
308 xnn_init_qu8_conv_minmax_rndnu_neon_params,
309 4 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
310 benchmark::utils::CheckNEONDOT);
311 }
qu8_gemm_5x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)312 static void qu8_gemm_5x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
313 GEMMEnd2EndBenchmark(state, model,
314 xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
315 xnn_qu8_igemm_minmax_rndnu_ukernel_5x8c4__neondot,
316 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
317 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
318 xnn_init_qu8_conv_minmax_rndnu_neon_params,
319 5 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
320 benchmark::utils::CheckNEONDOT);
321 }
qu8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)322 static void qu8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
323 GEMMEnd2EndBenchmark(state, model,
324 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
325 xnn_qu8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
326 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
327 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
328 xnn_init_qu8_conv_minmax_rndnu_neon_params,
329 6 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
330 benchmark::utils::CheckNEONDOT);
331 }
qu8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)332 static void qu8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
333 GEMMEnd2EndBenchmark(state, model,
334 xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
335 xnn_qu8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
336 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
337 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
338 xnn_init_qu8_conv_minmax_rndnu_neon_params,
339 8 /* mr */, 8 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
340 benchmark::utils::CheckNEONDOT);
341 }
qu8_gemm_1x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)342 static void qu8_gemm_1x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
343 GEMMEnd2EndBenchmark(state, model,
344 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
345 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
346 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
347 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
348 xnn_init_qu8_conv_minmax_rndnu_neon_params,
349 1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
350 benchmark::utils::CheckNEONDOT);
351 }
qu8_gemm_2x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)352 static void qu8_gemm_2x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
353 GEMMEnd2EndBenchmark(state, model,
354 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
355 xnn_qu8_igemm_minmax_rndnu_ukernel_2x16c4__neondot,
356 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
357 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
358 xnn_init_qu8_conv_minmax_rndnu_neon_params,
359 2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
360 benchmark::utils::CheckNEONDOT);
361 }
qu8_gemm_3x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)362 static void qu8_gemm_3x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
363 GEMMEnd2EndBenchmark(state, model,
364 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
365 xnn_qu8_igemm_minmax_rndnu_ukernel_3x16c4__neondot,
366 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
367 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
368 xnn_init_qu8_conv_minmax_rndnu_neon_params,
369 3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
370 benchmark::utils::CheckNEONDOT);
371 }
qu8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)372 static void qu8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
373 GEMMEnd2EndBenchmark(state, model,
374 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
375 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
376 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
377 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
378 xnn_init_qu8_conv_minmax_rndnu_neon_params,
379 4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
380 benchmark::utils::CheckNEONDOT);
381 }
qu8_gemm_5x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)382 static void qu8_gemm_5x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
383 GEMMEnd2EndBenchmark(state, model,
384 xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
385 xnn_qu8_igemm_minmax_rndnu_ukernel_5x16c4__neondot,
386 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
387 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
388 xnn_init_qu8_conv_minmax_rndnu_neon_params,
389 5 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
390 benchmark::utils::CheckNEONDOT);
391 }
qu8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)392 static void qu8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
393 GEMMEnd2EndBenchmark(state, model,
394 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
395 xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
396 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
397 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
398 xnn_init_qu8_conv_minmax_rndnu_neon_params,
399 6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
400 benchmark::utils::CheckNEONDOT);
401 }
qu8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)402 static void qu8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
403 GEMMEnd2EndBenchmark(state, model,
404 xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
405 xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
406 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
407 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
408 xnn_init_qu8_conv_minmax_rndnu_neon_params,
409 8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
410 benchmark::utils::CheckNEONDOT);
411 }
qu8_gemm_2x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)412 static void qu8_gemm_2x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
413 GEMMEnd2EndBenchmark(state, model,
414 xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
415 xnn_qu8_igemm_minmax_rndnu_ukernel_2x32c4__neondot,
416 xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
417 xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
418 xnn_init_qu8_conv_minmax_rndnu_neon_params,
419 2 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
420 benchmark::utils::CheckNEONDOT);
421 }
qu8_gemm_3x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)422 static void qu8_gemm_3x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
423 GEMMEnd2EndBenchmark(state, model,
424 xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
425 xnn_qu8_igemm_minmax_rndnu_ukernel_3x32c4__neondot,
426 xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
427 xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
428 xnn_init_qu8_conv_minmax_rndnu_neon_params,
429 3 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
430 benchmark::utils::CheckNEONDOT);
431 }
432
// Register the NEONDOT benchmarks through the BENCHMARK_QU8_END2END macro
// (not defined in the visible part of this file), grouped by nr (8, 16, 32)
// and ordered by mr within each group.
BENCHMARK_QU8_END2END(qu8_gemm_1x8c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_2x8c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_3x8c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_5x8c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_6x8c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_8x8c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_1x16c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_2x16c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_3x16c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_5x16c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_6x16c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_8x16c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_2x32c4__neondot);
BENCHMARK_QU8_END2END(qu8_gemm_3x32c4__neondot);
449 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
450
451
452 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qu8_gemm_2x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)453 static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
454 GEMMEnd2EndBenchmark(state, model,
455 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
456 xnn_qu8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
457 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
458 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
459 xnn_init_qu8_conv_minmax_rndnu_neon_params,
460 2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
461 benchmark::utils::CheckNEON);
462 }
qu8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)463 static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
464 GEMMEnd2EndBenchmark(state, model,
465 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
466 xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
467 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
468 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
469 xnn_init_qu8_conv_minmax_rndnu_neon_params,
470 3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
471 benchmark::utils::CheckNEON);
472 }
qu8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)473 static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
474 GEMMEnd2EndBenchmark(state, model,
475 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
476 xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
477 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
478 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
479 xnn_init_qu8_conv_minmax_rndnu_neon_params,
480 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
481 benchmark::utils::CheckNEON);
482 }
qu8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)483 static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
484 GEMMEnd2EndBenchmark(state, model,
485 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
486 xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
487 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
488 xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
489 xnn_init_qu8_conv_minmax_rndnu_neon_params,
490 6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
491 benchmark::utils::CheckNEON);
492 }
qu8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)493 static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
494 GEMMEnd2EndBenchmark(state, model,
495 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
496 xnn_qu8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
497 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
498 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
499 xnn_init_qu8_conv_minmax_rndnu_neon_params,
500 2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
501 benchmark::utils::CheckNEON);
502 }
qu8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)503 static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
504 GEMMEnd2EndBenchmark(state, model,
505 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
506 xnn_qu8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
507 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
508 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
509 xnn_init_qu8_conv_minmax_rndnu_neon_params,
510 3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
511 benchmark::utils::CheckNEON);
512 }
qu8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)513 static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
514 GEMMEnd2EndBenchmark(state, model,
515 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
516 xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
517 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
518 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
519 xnn_init_qu8_conv_minmax_rndnu_neon_params,
520 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
521 benchmark::utils::CheckNEON);
522 }
qu8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)523 static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
524 GEMMEnd2EndBenchmark(state, model,
525 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
526 xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
527 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
528 xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
529 xnn_init_qu8_conv_minmax_rndnu_neon_params,
530 6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
531 benchmark::utils::CheckNEON);
532 }
533
// Register the NEON MLAL-lane benchmarks through the BENCHMARK_QU8_END2END
// macro (not defined in the visible part of this file): nr == 8 variants
// first, then nr == 16.
BENCHMARK_QU8_END2END(qu8_gemm_2x8__neon_mlal_lane);
BENCHMARK_QU8_END2END(qu8_gemm_3x8__neon_mlal_lane);
BENCHMARK_QU8_END2END(qu8_gemm_4x8__neon_mlal_lane);
BENCHMARK_QU8_END2END(qu8_gemm_6x8__neon_mlal_lane);
BENCHMARK_QU8_END2END(qu8_gemm_2x16__neon_mlal_lane);
BENCHMARK_QU8_END2END(qu8_gemm_3x16__neon_mlal_lane);
BENCHMARK_QU8_END2END(qu8_gemm_4x16__neon_mlal_lane);
BENCHMARK_QU8_END2END(qu8_gemm_6x16__neon_mlal_lane);
542 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
543
544
545 #if XNN_ARCH_ARM
qu8_gemm_1x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)546 static void qu8_gemm_1x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
547 GEMMEnd2EndBenchmark(state, model,
548 xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
549 xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
550 xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
551 xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
552 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
553 1 /* mr */, 1 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
554 benchmark::utils::CheckARMV6);
555 }
qu8_gemm_2x1c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)556 static void qu8_gemm_2x1c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
557 GEMMEnd2EndBenchmark(state, model,
558 xnn_qu8_gemm_minmax_fp32_ukernel_2x1c4__armsimd32,
559 xnn_qu8_igemm_minmax_fp32_ukernel_2x1c4__armsimd32,
560 xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
561 xnn_qu8_igemm_minmax_fp32_ukernel_1x1c4__armsimd32,
562 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
563 2 /* mr */, 1 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
564 benchmark::utils::CheckARMV6);
565 }
qu8_gemm_1x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)566 static void qu8_gemm_1x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
567 GEMMEnd2EndBenchmark(state, model,
568 xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
569 xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
570 xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
571 xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
572 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
573 1 /* mr */, 2 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
574 benchmark::utils::CheckARMV6);
575 }
qu8_gemm_2x2c4__armsimd32(benchmark::State & state,models::ExecutionPlanFactory model)576 static void qu8_gemm_2x2c4__armsimd32(benchmark::State& state, models::ExecutionPlanFactory model) {
577 GEMMEnd2EndBenchmark(state, model,
578 xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32,
579 xnn_qu8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32,
580 xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
581 xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32,
582 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
583 2 /* mr */, 2 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
584 benchmark::utils::CheckARMV6);
585 }
586
// Hand the four `armsimd32` wrappers defined in this XNN_ARCH_ARM section to
// BENCHMARK_QU8_END2END.
// NOTE(review): the macro is defined outside this chunk; presumably it registers each
// wrapper with the benchmark framework for the QU8 models - confirm at its definition.
BENCHMARK_QU8_END2END(qu8_gemm_1x1c4__armsimd32);
BENCHMARK_QU8_END2END(qu8_gemm_2x1c4__armsimd32);
BENCHMARK_QU8_END2END(qu8_gemm_1x2c4__armsimd32);
BENCHMARK_QU8_END2END(qu8_gemm_2x2c4__armsimd32);
591 #endif // XNN_ARCH_ARM
592
593
594 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qu8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)595 static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
596 GEMMEnd2EndBenchmark(state, model,
597 xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
598 xnn_qu8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
599 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
600 xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
601 xnn_init_qu8_conv_minmax_fp32_avx512_params,
602 2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
603 benchmark::utils::CheckAVX512F);
604 }
605
qu8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)606 static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
607 GEMMEnd2EndBenchmark(state, model,
608 xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
609 xnn_qu8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
610 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
611 xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
612 xnn_init_qu8_conv_minmax_fp32_avx512_params,
613 3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
614 benchmark::utils::CheckAVX512F);
615 }
616
qu8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)617 static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
618 GEMMEnd2EndBenchmark(state, model,
619 xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
620 xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
621 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
622 xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
623 xnn_init_qu8_conv_minmax_fp32_avx512_params,
624 4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
625 benchmark::utils::CheckAVX512F);
626 }
627
qu8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)628 static void qu8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
629 GEMMEnd2EndBenchmark(state, model,
630 xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
631 xnn_qu8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
632 xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
633 xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
634 xnn_init_qu8_conv_minmax_fp32_avx2_params,
635 2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
636 benchmark::utils::CheckAVX2);
637 }
qu8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)638 static void qu8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
639 GEMMEnd2EndBenchmark(state, model,
640 xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
641 xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
642 xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
643 xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
644 xnn_init_qu8_conv_minmax_fp32_avx2_params,
645 3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
646 benchmark::utils::CheckAVX2);
647 }
648
qu8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)649 static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
650 GEMMEnd2EndBenchmark(state, model,
651 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
652 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
653 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
654 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
655 xnn_init_qu8_conv_minmax_fp32_sse2_params,
656 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
657 benchmark::utils::CheckXOP);
658 }
qu8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)659 static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
660 GEMMEnd2EndBenchmark(state, model,
661 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
662 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
663 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
664 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
665 xnn_init_qu8_conv_minmax_fp32_sse2_params,
666 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
667 benchmark::utils::CheckXOP);
668 }
qu8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)669 static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
670 GEMMEnd2EndBenchmark(state, model,
671 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
672 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
673 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
674 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
675 xnn_init_qu8_conv_minmax_fp32_sse2_params,
676 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
677 benchmark::utils::CheckXOP);
678 }
qu8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)679 static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
680 GEMMEnd2EndBenchmark(state, model,
681 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
682 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
683 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
684 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
685 xnn_init_qu8_conv_minmax_fp32_sse2_params,
686 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
687 benchmark::utils::CheckXOP);
688 }
qu8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)689 static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
690 GEMMEnd2EndBenchmark(state, model,
691 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
692 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
693 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
694 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
695 xnn_init_qu8_conv_minmax_fp32_sse2_params,
696 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
697 benchmark::utils::CheckXOP);
698 }
qu8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)699 static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
700 GEMMEnd2EndBenchmark(state, model,
701 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
702 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
703 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
704 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
705 xnn_init_qu8_conv_minmax_fp32_sse2_params,
706 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
707 benchmark::utils::CheckXOP);
708 }
709
qu8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)710 static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
711 GEMMEnd2EndBenchmark(state, model,
712 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
713 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
714 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
715 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
716 xnn_init_qu8_conv_minmax_fp32_sse2_params,
717 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
718 benchmark::utils::CheckXOP);
719 }
qu8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)720 static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
721 GEMMEnd2EndBenchmark(state, model,
722 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
723 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
724 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
725 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
726 xnn_init_qu8_conv_minmax_fp32_sse2_params,
727 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
728 benchmark::utils::CheckXOP);
729 }
730
qu8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)731 static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
732 GEMMEnd2EndBenchmark(state, model,
733 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
734 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
735 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
736 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
737 xnn_init_qu8_conv_minmax_fp32_sse2_params,
738 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
739 benchmark::utils::CheckXOP);
740 }
qu8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)741 static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
742 GEMMEnd2EndBenchmark(state, model,
743 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
744 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
745 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
746 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
747 xnn_init_qu8_conv_minmax_fp32_sse2_params,
748 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
749 benchmark::utils::CheckXOP);
750 }
751
qu8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)752 static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
753 GEMMEnd2EndBenchmark(state, model,
754 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
755 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
756 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
757 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
758 xnn_init_qu8_conv_minmax_fp32_sse2_params,
759 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
760 benchmark::utils::CheckAVX);
761 }
qu8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)762 static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
763 GEMMEnd2EndBenchmark(state, model,
764 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
765 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
766 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
767 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
768 xnn_init_qu8_conv_minmax_fp32_sse2_params,
769 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
770 benchmark::utils::CheckAVX);
771 }
qu8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)772 static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
773 GEMMEnd2EndBenchmark(state, model,
774 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
775 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
776 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
777 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
778 xnn_init_qu8_conv_minmax_fp32_sse2_params,
779 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
780 benchmark::utils::CheckAVX);
781 }
qu8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)782 static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
783 GEMMEnd2EndBenchmark(state, model,
784 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
785 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
786 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
787 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
788 xnn_init_qu8_conv_minmax_fp32_sse2_params,
789 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
790 benchmark::utils::CheckAVX);
791 }
qu8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)792 static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
793 GEMMEnd2EndBenchmark(state, model,
794 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
795 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
796 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
797 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
798 xnn_init_qu8_conv_minmax_fp32_sse2_params,
799 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
800 benchmark::utils::CheckAVX);
801 }
qu8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)802 static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
803 GEMMEnd2EndBenchmark(state, model,
804 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
805 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
806 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
807 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
808 xnn_init_qu8_conv_minmax_fp32_sse2_params,
809 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
810 benchmark::utils::CheckAVX);
811 }
812
813
qu8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)814 static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
815 GEMMEnd2EndBenchmark(state, model,
816 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
817 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
818 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
819 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
820 xnn_init_qu8_conv_minmax_fp32_sse2_params,
821 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
822 benchmark::utils::CheckAVX);
823 }
qu8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)824 static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
825 GEMMEnd2EndBenchmark(state, model,
826 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
827 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
828 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
829 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
830 xnn_init_qu8_conv_minmax_fp32_sse2_params,
831 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
832 benchmark::utils::CheckAVX);
833 }
qu8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)834 static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
835 GEMMEnd2EndBenchmark(state, model,
836 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
837 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
838 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
839 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
840 xnn_init_qu8_conv_minmax_fp32_sse2_params,
841 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
842 benchmark::utils::CheckAVX);
843 }
qu8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)844 static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
845 GEMMEnd2EndBenchmark(state, model,
846 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
847 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
848 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
849 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
850 xnn_init_qu8_conv_minmax_fp32_sse2_params,
851 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
852 benchmark::utils::CheckAVX);
853 }
854
qu8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)855 static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
856 GEMMEnd2EndBenchmark(state, model,
857 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
858 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
859 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
860 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
861 xnn_init_qu8_conv_minmax_fp32_sse2_params,
862 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
863 benchmark::utils::CheckSSE41);
864 }
qu8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)865 static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
866 GEMMEnd2EndBenchmark(state, model,
867 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
868 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
869 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
870 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
871 xnn_init_qu8_conv_minmax_fp32_sse2_params,
872 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
873 benchmark::utils::CheckSSE41);
874 }
qu8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)875 static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
876 GEMMEnd2EndBenchmark(state, model,
877 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
878 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
879 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
880 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
881 xnn_init_qu8_conv_minmax_fp32_sse2_params,
882 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
883 benchmark::utils::CheckSSE41);
884 }
qu8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)885 static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
886 GEMMEnd2EndBenchmark(state, model,
887 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
888 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
889 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
890 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
891 xnn_init_qu8_conv_minmax_fp32_sse2_params,
892 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
893 benchmark::utils::CheckSSE41);
894 }
qu8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)895 static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
896 GEMMEnd2EndBenchmark(state, model,
897 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
898 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
899 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
900 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
901 xnn_init_qu8_conv_minmax_fp32_sse2_params,
902 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
903 benchmark::utils::CheckSSE41);
904 }
qu8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)905 static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
906 GEMMEnd2EndBenchmark(state, model,
907 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
908 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
909 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
910 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
911 xnn_init_qu8_conv_minmax_fp32_sse2_params,
912 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
913 benchmark::utils::CheckSSE41);
914 }
915
qu8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)916 static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
917 GEMMEnd2EndBenchmark(state, model,
918 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
919 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
920 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
921 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
922 xnn_init_qu8_conv_minmax_fp32_sse2_params,
923 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
924 benchmark::utils::CheckSSE41);
925 }
qu8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)926 static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
927 GEMMEnd2EndBenchmark(state, model,
928 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
929 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
930 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
931 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
932 xnn_init_qu8_conv_minmax_fp32_sse2_params,
933 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
934 benchmark::utils::CheckSSE41);
935 }
qu8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)936 static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
937 GEMMEnd2EndBenchmark(state, model,
938 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
939 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
940 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
941 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
942 xnn_init_qu8_conv_minmax_fp32_sse2_params,
943 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
944 benchmark::utils::CheckSSE41);
945 }
qu8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)946 static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
947 GEMMEnd2EndBenchmark(state, model,
948 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
949 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
950 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
951 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
952 xnn_init_qu8_conv_minmax_fp32_sse2_params,
953 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
954 benchmark::utils::CheckSSE41);
955 }
956
qu8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)957 static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
958 GEMMEnd2EndBenchmark(state, model,
959 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
960 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
961 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
962 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
963 xnn_init_qu8_conv_minmax_fp32_sse2_params,
964 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
965 }
qu8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)966 static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
967 GEMMEnd2EndBenchmark(state, model,
968 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
969 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
970 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
971 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
972 xnn_init_qu8_conv_minmax_fp32_sse2_params,
973 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
974 }
qu8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)975 static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
976 GEMMEnd2EndBenchmark(state, model,
977 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
978 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
979 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
980 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
981 xnn_init_qu8_conv_minmax_fp32_sse2_params,
982 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
983 }
qu8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)984 static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
985 GEMMEnd2EndBenchmark(state, model,
986 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
987 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
988 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
989 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
990 xnn_init_qu8_conv_minmax_fp32_sse2_params,
991 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
992 }
qu8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)993 static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
994 GEMMEnd2EndBenchmark(state, model,
995 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
996 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
997 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
998 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
999 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1000 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
1001 }
qu8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1002 static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1003 GEMMEnd2EndBenchmark(state, model,
1004 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
1005 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
1006 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
1007 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
1008 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1009 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
1010 }
1011
qu8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1012 static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1013 GEMMEnd2EndBenchmark(state, model,
1014 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1015 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1016 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1017 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1018 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1019 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1020 }
qu8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1021 static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1022 GEMMEnd2EndBenchmark(state, model,
1023 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1024 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1025 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1026 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1027 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1028 2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1029 }
qu8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1030 static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1031 GEMMEnd2EndBenchmark(state, model,
1032 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1033 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1034 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1035 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1036 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1037 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1038 }
qu8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1039 static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1040 GEMMEnd2EndBenchmark(state, model,
1041 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1042 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1043 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1044 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1045 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1046 3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
1047 }
1048
1049
// Hand every x86 wrapper defined in this section to BENCHMARK_QU8_END2END, grouped by
// instruction set in the same order as the definitions (avx512skx, avx2, xop, avx,
// sse41, sse2), with the c2 tiles listed before the c8 tiles inside each group.
// NOTE(review): the macro is defined outside this chunk; presumably it registers each
// wrapper with the benchmark framework for the QU8 models - confirm at its definition.
BENCHMARK_QU8_END2END(qu8_gemm_2x16c8__avx512skx);
BENCHMARK_QU8_END2END(qu8_gemm_3x16c8__avx512skx);
BENCHMARK_QU8_END2END(qu8_gemm_4x16c8__avx512skx);

BENCHMARK_QU8_END2END(qu8_gemm_2x8c8__avx2);
BENCHMARK_QU8_END2END(qu8_gemm_3x8c8__avx2);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld128);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld128);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld128);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld128);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld128);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld128);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld128);

BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld128);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld64);
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld128);
1104 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1105
1106
1107 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1108 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1109 GEMMEnd2EndBenchmark(state, model,
1110 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1111 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1112 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1113 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1114 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1115 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1116 }
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1117 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1118 GEMMEnd2EndBenchmark(state, model,
1119 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1120 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1121 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1122 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1123 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1124 2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1125 }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1126 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1127 GEMMEnd2EndBenchmark(state, model,
1128 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1129 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1130 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1131 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1132 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1133 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1134 }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1135 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1136 GEMMEnd2EndBenchmark(state, model,
1137 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1138 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1139 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1140 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1141 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1142 3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1143 }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1144 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1145 GEMMEnd2EndBenchmark(state, model,
1146 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1147 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1148 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1149 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1150 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1151 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1152 }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1153 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1154 GEMMEnd2EndBenchmark(state, model,
1155 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1156 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1157 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1158 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1159 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1160 4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1161 }
1162
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1163 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1164 GEMMEnd2EndBenchmark(state, model,
1165 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1166 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1167 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1168 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1169 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1170 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1171 }
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1172 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1173 GEMMEnd2EndBenchmark(state, model,
1174 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1175 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1176 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1177 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1178 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1179 2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1180 }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1181 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1182 GEMMEnd2EndBenchmark(state, model,
1183 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1184 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1185 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1186 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1187 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1188 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1189 }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1190 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1191 GEMMEnd2EndBenchmark(state, model,
1192 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1193 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1194 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1195 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1196 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1197 3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1198 }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1199 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1200 GEMMEnd2EndBenchmark(state, model,
1201 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1202 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1203 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1204 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1205 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1206 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1207 }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1208 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1209 GEMMEnd2EndBenchmark(state, model,
1210 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1211 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1212 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1213 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1214 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1215 4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1216 }
1217
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1218 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1219 GEMMEnd2EndBenchmark(state, model,
1220 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1221 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1222 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1223 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1224 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1225 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1226 }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1227 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1228 GEMMEnd2EndBenchmark(state, model,
1229 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1230 xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1231 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1232 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1233 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1234 2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1235 }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1236 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1237 GEMMEnd2EndBenchmark(state, model,
1238 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1239 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1240 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1241 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1242 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1243 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1244 }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1245 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1246 GEMMEnd2EndBenchmark(state, model,
1247 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1248 xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1249 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1250 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1251 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1252 3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1253 }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1254 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1255 GEMMEnd2EndBenchmark(state, model,
1256 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1257 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1258 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1259 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1260 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1261 4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1262 }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1263 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1264 GEMMEnd2EndBenchmark(state, model,
1265 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1266 xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1267 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1268 xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1269 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1270 4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1271 }
1272
// Registrations of the wasmsimd dot16x2 wrappers defined above, through the
// BENCHMARK_QU8_END2END macro (defined outside this part of the file).

// c2 family.
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

// c2s4 family.
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

// c8 family.
BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
1293 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1294
1295
1296 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1297 static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1298 GEMMEnd2EndBenchmark(state, model,
1299 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1300 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1301 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1302 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1303 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1304 2 /* mr */, 2 /* nr */);
1305 }
qu8_gemm_3x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1306 static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1307 GEMMEnd2EndBenchmark(state, model,
1308 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1309 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1310 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1311 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1312 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1313 3 /* mr */, 2 /* nr */);
1314 }
qu8_gemm_4x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1315 static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1316 GEMMEnd2EndBenchmark(state, model,
1317 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1318 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1319 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1320 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1321 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1322 4 /* mr */, 2 /* nr */);
1323 }
qu8_gemm_2x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1324 static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1325 GEMMEnd2EndBenchmark(state, model,
1326 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1327 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1328 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1329 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1330 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1331 2 /* mr */, 4 /* nr */);
1332 }
qu8_gemm_3x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1333 static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1334 GEMMEnd2EndBenchmark(state, model,
1335 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1336 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1337 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1338 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1339 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1340 3 /* mr */, 4 /* nr */);
1341 }
qu8_gemm_4x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1342 static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1343 GEMMEnd2EndBenchmark(state, model,
1344 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1345 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1346 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1347 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1348 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1349 4 /* mr */, 4 /* nr */);
1350 }
1351
// Registrations of the wasm fmagic wrappers defined above, through the
// BENCHMARK_QU8_END2END macro (defined outside this part of the file).
BENCHMARK_QU8_END2END(qu8_gemm_2x2__wasm_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x2__wasm_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_4x2__wasm_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_2x4__wasm_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x4__wasm_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_4x4__wasm_fmagic)
1358 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1359
1360
1361 static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1362 GEMMEnd2EndBenchmark(state, model,
1363 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1364 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1365 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1366 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1367 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1368 2 /* mr */, 2 /* nr */);
1369 }
qu8_gemm_3x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1370 static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1371 GEMMEnd2EndBenchmark(state, model,
1372 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1373 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1374 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1375 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1376 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1377 3 /* mr */, 2 /* nr */);
1378 }
qu8_gemm_4x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1379 static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1380 GEMMEnd2EndBenchmark(state, model,
1381 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1382 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1383 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1384 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1385 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1386 4 /* mr */, 2 /* nr */);
1387 }
qu8_gemm_2x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1388 static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1389 GEMMEnd2EndBenchmark(state, model,
1390 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1391 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1392 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1393 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1394 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1395 2 /* mr */, 4 /* nr */);
1396 }
qu8_gemm_3x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1397 static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1398 GEMMEnd2EndBenchmark(state, model,
1399 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1400 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1401 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1402 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1403 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1404 3 /* mr */, 4 /* nr */);
1405 }
qu8_gemm_4x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1406 static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1407 GEMMEnd2EndBenchmark(state, model,
1408 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1409 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1410 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1411 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1412 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1413 4 /* mr */, 4 /* nr */);
1414 }
1415
qu8_gemm_2x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1416 static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1417 GEMMEnd2EndBenchmark(state, model,
1418 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1419 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1420 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1421 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1422 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1423 2 /* mr */, 2 /* nr */);
1424 }
qu8_gemm_3x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1425 static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1426 GEMMEnd2EndBenchmark(state, model,
1427 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1428 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1429 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1430 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1431 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1432 3 /* mr */, 2 /* nr */);
1433 }
qu8_gemm_4x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1434 static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1435 GEMMEnd2EndBenchmark(state, model,
1436 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1437 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1438 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1439 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1440 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1441 4 /* mr */, 2 /* nr */);
1442 }
qu8_gemm_2x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1443 static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1444 GEMMEnd2EndBenchmark(state, model,
1445 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1446 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1447 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1448 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1449 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1450 2 /* mr */, 4 /* nr */);
1451 }
qu8_gemm_3x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1452 static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1453 GEMMEnd2EndBenchmark(state, model,
1454 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1455 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1456 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1457 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1458 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1459 3 /* mr */, 4 /* nr */);
1460 }
qu8_gemm_4x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1461 static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1462 GEMMEnd2EndBenchmark(state, model,
1463 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1464 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1465 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1466 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1467 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1468 4 /* mr */, 4 /* nr */);
1469 }
1470
qu8_gemm_2x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1471 static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1472 GEMMEnd2EndBenchmark(state, model,
1473 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1474 xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1475 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1476 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1477 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1478 2 /* mr */, 2 /* nr */);
1479 }
qu8_gemm_3x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1480 static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1481 GEMMEnd2EndBenchmark(state, model,
1482 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1483 xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1484 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1485 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1486 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1487 3 /* mr */, 2 /* nr */);
1488 }
qu8_gemm_4x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1489 static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1490 GEMMEnd2EndBenchmark(state, model,
1491 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1492 xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1493 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1494 xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1495 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1496 4 /* mr */, 2 /* nr */);
1497 }
qu8_gemm_2x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1498 static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1499 GEMMEnd2EndBenchmark(state, model,
1500 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1501 xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1502 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1503 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1504 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1505 2 /* mr */, 4 /* nr */);
1506 }
qu8_gemm_3x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1507 static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1508 GEMMEnd2EndBenchmark(state, model,
1509 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1510 xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1511 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1512 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1513 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1514 3 /* mr */, 4 /* nr */);
1515 }
qu8_gemm_4x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1516 static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1517 GEMMEnd2EndBenchmark(state, model,
1518 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1519 xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1520 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1521 xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1522 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1523 4 /* mr */, 4 /* nr */);
1524 }
1525
// Registrations of the scalar wrappers defined above, through the
// BENCHMARK_QU8_END2END macro (defined outside this part of the file).
// These sit outside any architecture #if, unlike the groups before them.

// fmagic variants.
BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_fmagic)
BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_fmagic)

// imagic variants.
BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_imagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_imagic)
BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_imagic)
BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_imagic)
BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_imagic)
BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_imagic)

// lrintf variants.
BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_lrintf)
BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_lrintf)
BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_lrintf)
BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_lrintf)
BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_lrintf)
BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_lrintf)
1546
1547
// Expand BENCHMARK_MAIN() here unless XNNPACK_BENCHMARK_NO_MAIN is defined,
// so a build that supplies its own entry point can suppress this one.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1551