1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <algorithm>
10 #include <cfloat>
11 #include <chrono>
12 #include <cmath>
13 #include <functional>
14 #include <limits>
15 #include <mutex>
16 #include <random>
17 #include <vector>
18
19 #include <cpuinfo.h>
20
21 #include <benchmark/benchmark.h>
22 #ifdef BENCHMARK_GEMMLOWP
23 #include "gemmlowp/public/gemmlowp.h"
24 #endif // BENCHMARK_GEMMLOWP
25 #ifdef BENCHMARK_RUY
26 #include "ruy/ruy.h"
27 #endif // BENCHMARK_RUY
28 #include "bench/gemm.h"
29 #include "bench/utils.h"
30
31 #include <xnnpack.h>
32 #include <xnnpack/aligned-allocator.h>
33 #include <xnnpack/common.h>
34 #include <xnnpack/gemm.h>
35 #include <xnnpack/math.h>
36 #include <xnnpack/microfnptr.h>
37 #include <xnnpack/microparams-init.h>
38 #include <xnnpack/pack.h>
39
40
GEMMBenchmark(benchmark::State & state,xnn_qu8_gemm_minmax_ukernel_function gemm,xnn_init_qu8_conv_minmax_params_fn init_params,size_t mr,size_t nr,size_t kr,size_t sr,benchmark::utils::IsaCheckFunction isa_check=nullptr)41 static void GEMMBenchmark(benchmark::State& state,
42 xnn_qu8_gemm_minmax_ukernel_function gemm,
43 xnn_init_qu8_conv_minmax_params_fn init_params,
44 size_t mr, size_t nr, size_t kr, size_t sr,
45 benchmark::utils::IsaCheckFunction isa_check = nullptr)
46 {
47 if (!cpuinfo_initialize()) {
48 state.SkipWithError("cpuinfo initialization failed");
49 return;
50 }
51 if (isa_check && !isa_check(state)) {
52 return;
53 }
54
55 const size_t mc = state.range(0);
56 const size_t nc = state.range(1);
57 const size_t kc = state.range(2);
58
59 const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
60 const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
61
62 std::random_device random_device;
63 auto rng = std::mt19937(random_device());
64 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
65 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
66
67 std::vector<uint8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint8_t));
68 std::generate(a.begin(), a.end(), std::ref(u8rng));
69 std::vector<uint8_t> k(nc * kc);
70 std::generate(k.begin(), k.end(), std::ref(u8rng));
71 std::vector<int32_t> b(nc);
72 std::generate(b.begin(), b.end(), std::ref(i32rng));
73
74 const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
75 const size_t c_elements = mc * nc;
76 const size_t num_buffers = 1 +
77 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
78 sizeof(uint8_t) * (w_elements + c_elements));
79
80 std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> w(w_elements * num_buffers);
81 std::fill(w.begin(), w.end(), 0);
82 const xnn_qu8_packing_params packing_params = { 127, 127 };
83 xnn_pack_qu8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
84 std::vector<uint8_t> c(c_elements * num_buffers);
85 std::fill(c.begin(), c.end(), 0xA5);
86
87 union xnn_qu8_conv_minmax_params quantization_params;
88 init_params(&quantization_params, 127, 0.75f, 127, 1, 254);
89
90 size_t buffer_index = 0;
91 for (auto _ : state) {
92 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
93 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
94 // - W is not in cache (for any cache level)
95 // - C is not in cache (for any cache level)
96 state.PauseTiming();
97 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
98 buffer_index = (buffer_index + 1) % num_buffers;
99 state.ResumeTiming();
100
101 for (uint32_t m = 0; m < mc; m += mr) {
102 const uint32_t mb = min(mc - m, mr);
103 for (uint32_t n = 0; n < nc; n += nr) {
104 const uint32_t nb = min(nc - n, nr);
105 gemm(
106 mb, nb, kc * sizeof(uint8_t),
107 a.data() + m * kc, kc * sizeof(uint8_t),
108 w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
109 c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
110 &quantization_params);
111 }
112 }
113 }
114
115 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
116 if (cpu_frequency != 0) {
117 state.counters["cpufreq"] = cpu_frequency;
118 }
119
120 state.counters["OPS"] = benchmark::Counter(
121 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
122 }
123
124 #ifdef BENCHMARK_GEMMLOWP
125 struct GemmlowpOutputPipeline {
126 typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
127 typedef std::tuple<
128 gemmlowp::OutputStageBiasAddition<ColVectorMap>,
129 gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
130 gemmlowp::OutputStageClamp,
131 gemmlowp::OutputStageSaturatingCastToUint8>
132 Pipeline;
133
MakeGemmlowpOutputPipeline134 static Pipeline Make(
135 const int32_t* bias_data,
136 int output_rows,
137 int32_t output_offset,
138 int32_t output_multiplier,
139 int output_shift,
140 int32_t output_activation_min,
141 int32_t output_activation_max)
142 {
143 ColVectorMap bias_vector(bias_data, output_rows);
144 gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
145 bias_addition_stage.bias_vector = bias_vector;
146 gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
147 quantize_down_stage.result_offset_after_shift = output_offset;
148 quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
149 quantize_down_stage.result_shift = output_shift;
150 gemmlowp::OutputStageClamp clamp_stage;
151 clamp_stage.min = output_activation_min;
152 clamp_stage.max = output_activation_max;
153 gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
154 return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
155 }
156 };
157
GemmlowpBenchmark(benchmark::State & state,uint32_t threads)158 static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
159 {
160 const size_t mc = state.range(0);
161 const size_t nc = state.range(1);
162 const size_t kc = state.range(2);
163
164 std::random_device random_device;
165 auto rng = std::mt19937(random_device());
166 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
167 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
168
169 std::vector<uint8_t> a(mc * kc);
170 std::generate(a.begin(), a.end(), std::ref(u8rng));
171
172 const size_t kElements = nc * kc;
173 const size_t bElements = nc;
174 const size_t c_elements = mc * nc;
175 const size_t num_buffers = 1 +
176 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
177 kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
178
179 std::vector<uint8_t> k(kElements * num_buffers);
180 std::generate(k.begin(), k.end(), std::ref(u8rng));
181 std::vector<int32_t> b(bElements * num_buffers);
182 std::generate(b.begin(), b.end(), std::ref(i32rng));
183 std::vector<uint8_t> c(c_elements * num_buffers);
184 std::fill(c.begin(), c.end(), 0xA5);
185
186 gemmlowp::MultiThreadGemmContext threadingContext;
187 threadingContext.set_max_num_threads(threads);
188
189 size_t buffer_index = 0;
190 for (auto _ : state) {
191 state.PauseTiming();
192 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
193 buffer_index = (buffer_index + 1) % num_buffers;
194 state.ResumeTiming();
195
196 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
197 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
198 gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
199 const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
200 gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
201 &threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
202 }
203
204 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
205 if (cpu_frequency != 0) {
206 state.counters["cpufreq"] = cpu_frequency;
207 }
208
209 state.counters["OPS"] = benchmark::Counter(
210 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
211 }
212
gemmlowp_st(benchmark::State & state,const char * net)213 static void gemmlowp_st(benchmark::State& state, const char* net)
214 {
215 GemmlowpBenchmark(state, 1);
216 }
217 #endif // BENCHMARK_GEMMLOWP
218
219
220 #ifdef BENCHMARK_RUY
RuyBenchmark(benchmark::State & state,size_t threads)221 static void RuyBenchmark(benchmark::State& state, size_t threads)
222 {
223 const size_t mc = state.range(0);
224 const size_t nc = state.range(1);
225 const size_t kc = state.range(2);
226
227 std::random_device random_device;
228 auto rng = std::mt19937(random_device());
229 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
230 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
231
232 const size_t num_buffers = 1 +
233 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
234 nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
235
236 std::vector<uint8_t> a(mc * kc);
237 std::generate(a.begin(), a.end(), std::ref(u8rng));
238 std::vector<uint8_t> k(num_buffers * nc * kc);
239 std::generate(k.begin(), k.end(), std::ref(u8rng));
240 std::vector<int32_t> b(num_buffers * nc);
241 std::generate(b.begin(), b.end(), std::ref(i32rng));
242 std::vector<uint8_t> c(num_buffers * nc * mc);
243 std::fill(c.begin(), c.end(), std::nanf(""));
244
245 // Note: context must be static to avoid the cost of re-creating it for each benchmark.
246 static ruy::Context context;
247 context.set_max_num_threads(threads);
248
249 ruy::Matrix<uint8_t> ruy_a;
250 ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
251 ruy_a.set_zero_point(127);
252 ruy::Matrix<uint8_t> ruy_b;
253 ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
254 ruy_b.set_data(a.data());
255 ruy_b.set_zero_point(127);
256 ruy::Matrix<uint8_t> ruy_c;
257 ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
258 ruy_c.set_zero_point(127);
259
260 ruy::MulParams<int32_t, uint8_t> mul_params;
261 mul_params.set_multiplier_fixedpoint(0x40000000);
262
263 // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
264 // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
265 // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
266 // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
267 static std::once_flag warmup;
268 std::call_once(warmup, [&](){
269 auto start = std::chrono::steady_clock::now();
270 do {
271 ruy_a.set_data(k.data());
272 ruy_c.set_data(c.data());
273 mul_params.set_bias(b.data());
274
275 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
276 } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
277 });
278
279 size_t buffer_index = 0;
280 for (auto _ : state) {
281 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
282 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
283 // - K is not in cache (for any cache level)
284 // - B is not in cache (for any cache level)
285 // - C is not in cache (for any cache level)
286 state.PauseTiming();
287 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
288 buffer_index = (buffer_index + 1) % num_buffers;
289 state.ResumeTiming();
290
291 ruy_a.set_data(k.data() + buffer_index * nc * kc);
292 ruy_c.set_data(c.data() + buffer_index * mc * nc);
293 mul_params.set_bias(b.data() + buffer_index * nc);
294
295 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
296 }
297
298 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
299 if (cpu_frequency != 0) {
300 state.counters["cpufreq"] = cpu_frequency;
301 }
302
303 state.counters["OPS"] = benchmark::Counter(
304 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
305 }
306
ruy_st(benchmark::State & state,const char * net)307 static void ruy_st(benchmark::State& state, const char* net)
308 {
309 RuyBenchmark(state, 1);
310 }
311 #endif // BENCHMARK_RUY
312
313
314 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)315 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
316 GEMMBenchmark(state,
317 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
318 xnn_init_qu8_conv_minmax_rndnu_neon_params,
319 4, 8, 1, 1, benchmark::utils::CheckNEON);
320 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)321 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
322 GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
323 xnn_init_qu8_conv_minmax_rndnu_neon_params,
324 4, 8, 1, 1, benchmark::utils::CheckNEON);
325 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)326 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
327 GEMMBenchmark(state,
328 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
329 xnn_init_qu8_conv_minmax_rndnu_neon_params,
330 4, 8, 1, 1, benchmark::utils::CheckNEON);
331 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)332 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
333 GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
334 xnn_init_qu8_conv_minmax_rndnu_neon_params,
335 4, 8, 1, 1, benchmark::utils::CheckNEON);
336 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)337 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
338 GEMMBenchmark(state,
339 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
340 xnn_init_qu8_conv_minmax_rndnu_neon_params,
341 4, 8, 1, 1, benchmark::utils::CheckNEON);
342 }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)343 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
344 GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
345 xnn_init_qu8_conv_minmax_rndnu_neon_params,
346 4, 8, 1, 1, benchmark::utils::CheckNEON);
347 }
qu8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)348 static void qu8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
349 GEMMBenchmark(state,
350 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
351 xnn_init_qu8_conv_minmax_rndnu_neon_params,
352 1, 8, 1, 1, benchmark::utils::CheckNEON);
353 }
qu8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)354 static void qu8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
355 GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
356 xnn_init_qu8_conv_minmax_rndnu_neon_params,
357 1, 8, 1, 1, benchmark::utils::CheckNEON);
358 }
359
360 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)361 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
362 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
363 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
364 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
365 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
366 BENCHMARK_GEMM(qu8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7)
367 BENCHMARK_GEMM(qu8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
368 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
369
370 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
371 static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
372 GEMMBenchmark(state,
373 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
374 xnn_init_qu8_conv_minmax_rndnu_neon_params,
375 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
376 }
qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,const char * net)377 static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
378 GEMMBenchmark(state,
379 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
380 xnn_init_qu8_conv_minmax_rndnu_neon_params,
381 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
382 }
qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State & state,const char * net)383 static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
384 GEMMBenchmark(state,
385 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
386 xnn_init_qu8_conv_minmax_rndnu_neon_params,
387 4, 8, 4, 1,
388 benchmark::utils::CheckNEONDOT);
389 }
qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State & state,const char * net)390 static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
391 GEMMBenchmark(state,
392 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
393 xnn_init_qu8_conv_minmax_rndnu_neon_params,
394 4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
395 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)396 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
397 GEMMBenchmark(state,
398 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
399 xnn_init_qu8_conv_minmax_rndnu_neon_params,
400 4, 16, 1, 1,
401 benchmark::utils::CheckNEON);
402 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)403 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
404 GEMMBenchmark(state,
405 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
406 xnn_init_qu8_conv_minmax_rndnu_neon_params,
407 4, 16, 1, 1,
408 benchmark::utils::CheckNEON);
409 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)410 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
411 GEMMBenchmark(state,
412 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
413 xnn_init_qu8_conv_minmax_rndnu_neon_params,
414 4, 16, 1, 1,
415 benchmark::utils::CheckNEON);
416 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)417 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
418 GEMMBenchmark(state,
419 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
420 xnn_init_qu8_conv_minmax_rndnu_neon_params,
421 4, 16, 1, 1,
422 benchmark::utils::CheckNEON);
423 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State & state,const char * net)424 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, const char* net) {
425 GEMMBenchmark(state,
426 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
427 xnn_init_qu8_conv_minmax_rndnu_neon_params,
428 4, 16, 1, 1,
429 benchmark::utils::CheckNEON);
430 }
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State & state,const char * net)431 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, const char* net) {
432 GEMMBenchmark(state,
433 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
434 xnn_init_qu8_conv_minmax_rndnu_neon_params,
435 4, 16, 1, 1,
436 benchmark::utils::CheckNEON);
437 }
438 BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55)
BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55)439 BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55)
440 BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_ld128)
441 BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_ld128)
442 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
443 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
444 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
445 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
446 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75)
447 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75)
448 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
449
450
451 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
452 static void qu8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
453 GEMMBenchmark(state,
454 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
455 xnn_init_qu8_conv_minmax_rndnu_neon_params,
456 1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
457 }
qu8_gemm_2x8c4__neondot(benchmark::State & state,const char * net)458 static void qu8_gemm_2x8c4__neondot(benchmark::State& state, const char* net) {
459 GEMMBenchmark(state,
460 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
461 xnn_init_qu8_conv_minmax_rndnu_neon_params,
462 2, 8, 4, 1, benchmark::utils::CheckNEONDOT);
463 }
qu8_gemm_3x8c4__neondot(benchmark::State & state,const char * net)464 static void qu8_gemm_3x8c4__neondot(benchmark::State& state, const char* net) {
465 GEMMBenchmark(state,
466 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
467 xnn_init_qu8_conv_minmax_rndnu_neon_params,
468 3, 8, 4, 1, benchmark::utils::CheckNEONDOT);
469 }
qu8_gemm_4x8c4__neondot(benchmark::State & state,const char * net)470 static void qu8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
471 GEMMBenchmark(state,
472 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
473 xnn_init_qu8_conv_minmax_rndnu_neon_params,
474 4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
475 }
qu8_gemm_5x8c4__neondot(benchmark::State & state,const char * net)476 static void qu8_gemm_5x8c4__neondot(benchmark::State& state, const char* net) {
477 GEMMBenchmark(state,
478 xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
479 xnn_init_qu8_conv_minmax_rndnu_neon_params,
480 5, 8, 4, 1, benchmark::utils::CheckNEONDOT);
481 }
qu8_gemm_6x8c4__neondot(benchmark::State & state,const char * net)482 static void qu8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
483 GEMMBenchmark(state,
484 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
485 xnn_init_qu8_conv_minmax_rndnu_neon_params,
486 6, 8, 4, 1, benchmark::utils::CheckNEONDOT);
487 }
qu8_gemm_8x8c4__neondot(benchmark::State & state,const char * net)488 static void qu8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
489 GEMMBenchmark(state,
490 xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
491 xnn_init_qu8_conv_minmax_rndnu_neon_params,
492 8, 8, 4, 1, benchmark::utils::CheckNEONDOT);
493 }
qu8_gemm_1x16c4__neondot(benchmark::State & state,const char * net)494 static void qu8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
495 GEMMBenchmark(state,
496 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
497 xnn_init_qu8_conv_minmax_rndnu_neon_params,
498 1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
499 }
qu8_gemm_2x16c4__neondot(benchmark::State & state,const char * net)500 static void qu8_gemm_2x16c4__neondot(benchmark::State& state, const char* net) {
501 GEMMBenchmark(state,
502 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
503 xnn_init_qu8_conv_minmax_rndnu_neon_params,
504 2, 16, 4, 1, benchmark::utils::CheckNEONDOT);
505 }
qu8_gemm_3x16c4__neondot(benchmark::State & state,const char * net)506 static void qu8_gemm_3x16c4__neondot(benchmark::State& state, const char* net) {
507 GEMMBenchmark(state,
508 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
509 xnn_init_qu8_conv_minmax_rndnu_neon_params,
510 3, 16, 4, 1, benchmark::utils::CheckNEONDOT);
511 }
qu8_gemm_4x16c4__neondot(benchmark::State & state,const char * net)512 static void qu8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
513 GEMMBenchmark(state,
514 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
515 xnn_init_qu8_conv_minmax_rndnu_neon_params,
516 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
517 }
qu8_gemm_5x16c4__neondot(benchmark::State & state,const char * net)518 static void qu8_gemm_5x16c4__neondot(benchmark::State& state, const char* net) {
519 GEMMBenchmark(state,
520 xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
521 xnn_init_qu8_conv_minmax_rndnu_neon_params,
522 5, 16, 4, 1, benchmark::utils::CheckNEONDOT);
523 }
qu8_gemm_6x16c4__neondot(benchmark::State & state,const char * net)524 static void qu8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
525 GEMMBenchmark(state,
526 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
527 xnn_init_qu8_conv_minmax_rndnu_neon_params,
528 6, 16, 4, 1, benchmark::utils::CheckNEONDOT);
529 }
qu8_gemm_8x16c4__neondot(benchmark::State & state,const char * net)530 static void qu8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
531 GEMMBenchmark(state,
532 xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
533 xnn_init_qu8_conv_minmax_rndnu_neon_params,
534 8, 16, 4, 1, benchmark::utils::CheckNEONDOT);
535 }
qu8_gemm_1x32c4__neondot(benchmark::State & state,const char * net)536 static void qu8_gemm_1x32c4__neondot(benchmark::State& state, const char* net) {
537 GEMMBenchmark(state,
538 xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
539 xnn_init_qu8_conv_minmax_rndnu_neon_params,
540 1, 32, 4, 1, benchmark::utils::CheckNEONDOT);
541 }
qu8_gemm_2x32c4__neondot(benchmark::State & state,const char * net)542 static void qu8_gemm_2x32c4__neondot(benchmark::State& state, const char* net) {
543 GEMMBenchmark(state,
544 xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
545 xnn_init_qu8_conv_minmax_rndnu_neon_params,
546 2, 32, 4, 1, benchmark::utils::CheckNEONDOT);
547 }
qu8_gemm_3x32c4__neondot(benchmark::State & state,const char * net)548 static void qu8_gemm_3x32c4__neondot(benchmark::State& state, const char* net) {
549 GEMMBenchmark(state,
550 xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
551 xnn_init_qu8_conv_minmax_rndnu_neon_params,
552 3, 32, 4, 1, benchmark::utils::CheckNEONDOT);
553 }
554
555 BENCHMARK_GEMM(qu8_gemm_1x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_2x8c4__neondot)556 BENCHMARK_GEMM(qu8_gemm_2x8c4__neondot)
557 BENCHMARK_GEMM(qu8_gemm_3x8c4__neondot)
558 BENCHMARK_GEMM(qu8_gemm_4x8c4__neondot)
559 BENCHMARK_GEMM(qu8_gemm_5x8c4__neondot)
560 BENCHMARK_GEMM(qu8_gemm_6x8c4__neondot)
561 BENCHMARK_GEMM(qu8_gemm_8x8c4__neondot)
562 BENCHMARK_GEMM(qu8_gemm_1x16c4__neondot)
563 BENCHMARK_GEMM(qu8_gemm_2x16c4__neondot)
564 BENCHMARK_GEMM(qu8_gemm_3x16c4__neondot)
565 BENCHMARK_GEMM(qu8_gemm_4x16c4__neondot)
566 BENCHMARK_GEMM(qu8_gemm_5x16c4__neondot)
567 BENCHMARK_GEMM(qu8_gemm_6x16c4__neondot)
568 BENCHMARK_GEMM(qu8_gemm_8x16c4__neondot)
569 BENCHMARK_GEMM(qu8_gemm_1x32c4__neondot)
570 BENCHMARK_GEMM(qu8_gemm_2x32c4__neondot)
571 BENCHMARK_GEMM(qu8_gemm_3x32c4__neondot)
572 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
573
574
575 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
576 static void qu8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
577 GEMMBenchmark(state,
578 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
579 xnn_init_qu8_conv_minmax_rndnu_neon_params,
580 1, 8, 1, 1, benchmark::utils::CheckNEON);
581 }
qu8_gemm_2x8__neon_mlal_lane(benchmark::State & state,const char * net)582 static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
583 GEMMBenchmark(state,
584 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
585 xnn_init_qu8_conv_minmax_rndnu_neon_params,
586 2, 8, 1, 1, benchmark::utils::CheckNEON);
587 }
qu8_gemm_3x8__neon_mlal_lane(benchmark::State & state,const char * net)588 static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
589 GEMMBenchmark(state,
590 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
591 xnn_init_qu8_conv_minmax_rndnu_neon_params,
592 3, 8, 1, 1, benchmark::utils::CheckNEON);
593 }
qu8_gemm_4x8__neon_mlal_lane(benchmark::State & state,const char * net)594 static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
595 GEMMBenchmark(state,
596 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
597 xnn_init_qu8_conv_minmax_rndnu_neon_params,
598 4, 8, 1, 1, benchmark::utils::CheckNEON);
599 }
qu8_gemm_6x8__neon_mlal_lane(benchmark::State & state,const char * net)600 static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
601 GEMMBenchmark(state,
602 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
603 xnn_init_qu8_conv_minmax_rndnu_neon_params,
604 6, 8, 1, 1, benchmark::utils::CheckNEON);
605 }
qu8_gemm_1x16__neon_mlal_lane(benchmark::State & state,const char * net)606 static void qu8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
607 GEMMBenchmark(state,
608 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
609 xnn_init_qu8_conv_minmax_rndnu_neon_params,
610 1, 16, 1, 1, benchmark::utils::CheckNEON);
611 }
qu8_gemm_2x16__neon_mlal_lane(benchmark::State & state,const char * net)612 static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
613 GEMMBenchmark(state,
614 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
615 xnn_init_qu8_conv_minmax_rndnu_neon_params,
616 2, 16, 1, 1, benchmark::utils::CheckNEON);
617 }
qu8_gemm_3x16__neon_mlal_lane(benchmark::State & state,const char * net)618 static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
619 GEMMBenchmark(state,
620 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
621 xnn_init_qu8_conv_minmax_rndnu_neon_params,
622 3, 16, 1, 1, benchmark::utils::CheckNEON);
623 }
qu8_gemm_4x16__neon_mlal_lane(benchmark::State & state,const char * net)624 static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
625 GEMMBenchmark(state,
626 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
627 xnn_init_qu8_conv_minmax_rndnu_neon_params,
628 4, 16, 1, 1, benchmark::utils::CheckNEON);
629 }
qu8_gemm_6x16__neon_mlal_lane(benchmark::State & state,const char * net)630 static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
631 GEMMBenchmark(state,
632 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
633 xnn_init_qu8_conv_minmax_rndnu_neon_params,
634 6, 16, 1, 1, benchmark::utils::CheckNEON);
635 }
636
// Register the NEON MLAL-lane QU8 GEMM wrappers above as benchmarks.
// BENCHMARK_GEMM comes from bench/gemm.h; presumably it instantiates each
// wrapper over a standard set of GEMM problem shapes — confirm in bench/gemm.h.
BENCHMARK_GEMM(qu8_gemm_1x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_2x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_3x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_4x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_6x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_1x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_2x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_3x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_4x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_6x16__neon_mlal_lane)
647 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
648
649
650 #if XNN_ARCH_ARM
651 static void qu8_gemm_1x1c4__armsimd32(benchmark::State& state, const char* net) {
652 GEMMBenchmark(state,
653 xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
654 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
655 1, 1, 4, 1, benchmark::utils::CheckARMV6);
656 }
qu8_gemm_2x1c4__armsimd32(benchmark::State & state,const char * net)657 static void qu8_gemm_2x1c4__armsimd32(benchmark::State& state, const char* net) {
658 GEMMBenchmark(state,
659 xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
660 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
661 2, 1, 4, 1, benchmark::utils::CheckARMV6);
662 }
qu8_gemm_1x2c4__armsimd32(benchmark::State & state,const char * net)663 static void qu8_gemm_1x2c4__armsimd32(benchmark::State& state, const char* net) {
664 GEMMBenchmark(state,
665 xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
666 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
667 1, 2, 4, 1, benchmark::utils::CheckARMV6);
668 }
qu8_gemm_2x2c4__armsimd32(benchmark::State & state,const char * net)669 static void qu8_gemm_2x2c4__armsimd32(benchmark::State& state, const char* net) {
670 GEMMBenchmark(state,
671 xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
672 xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
673 2, 2, 4, 1, benchmark::utils::CheckARMV6);
674 }
675
// Register the ARM-SIMD32 QU8 GEMM wrappers above as benchmarks (see bench/gemm.h).
BENCHMARK_GEMM(qu8_gemm_1x1c4__armsimd32)
BENCHMARK_GEMM(qu8_gemm_2x1c4__armsimd32)
BENCHMARK_GEMM(qu8_gemm_1x2c4__armsimd32)
BENCHMARK_GEMM(qu8_gemm_2x2c4__armsimd32)
680 #endif // XNN_ARCH_ARM
681
682
683 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
684 static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) {
685 GEMMBenchmark(state,
686 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
687 xnn_init_qu8_conv_minmax_fp32_avx512_params,
688 1, 16, 8, 1,
689 benchmark::utils::CheckAVX512SKX);
690 }
qu8_gemm_2x16c8__avx512skx(benchmark::State & state,const char * net)691 static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
692 GEMMBenchmark(state,
693 xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
694 xnn_init_qu8_conv_minmax_fp32_avx512_params,
695 2, 16, 8, 1,
696 benchmark::utils::CheckAVX512SKX);
697 }
qu8_gemm_3x16c8__avx512skx(benchmark::State & state,const char * net)698 static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
699 GEMMBenchmark(state,
700 xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
701 xnn_init_qu8_conv_minmax_fp32_avx512_params,
702 3, 16, 8, 1,
703 benchmark::utils::CheckAVX512SKX);
704 }
qu8_gemm_4x16c8__avx512skx(benchmark::State & state,const char * net)705 static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
706 GEMMBenchmark(state,
707 xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
708 xnn_init_qu8_conv_minmax_fp32_avx512_params,
709 4, 16, 8, 1,
710 benchmark::utils::CheckAVX512SKX);
711 }
qu8_gemm_1x8c8__avx2(benchmark::State & state,const char * net)712 static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) {
713 GEMMBenchmark(state,
714 xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
715 xnn_init_qu8_conv_minmax_fp32_avx2_params,
716 1, 8, 8, 1,
717 benchmark::utils::CheckAVX2);
718 }
qu8_gemm_2x8c8__avx2(benchmark::State & state,const char * net)719 static void qu8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
720 GEMMBenchmark(state,
721 xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
722 xnn_init_qu8_conv_minmax_fp32_avx2_params,
723 2, 8, 8, 1,
724 benchmark::utils::CheckAVX2);
725 }
qu8_gemm_3x8c8__avx2(benchmark::State & state,const char * net)726 static void qu8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
727 GEMMBenchmark(state,
728 xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
729 xnn_init_qu8_conv_minmax_fp32_avx2_params,
730 3, 8, 8, 1,
731 benchmark::utils::CheckAVX2);
732 }
qu8_gemm_1x4c2__xop_ld64(benchmark::State & state,const char * net)733 static void qu8_gemm_1x4c2__xop_ld64(benchmark::State& state, const char* net) {
734 GEMMBenchmark(state,
735 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
736 xnn_init_qu8_conv_minmax_fp32_sse2_params,
737 1, 4, 2, 1,
738 benchmark::utils::CheckXOP);
739 }
qu8_gemm_2x4c2__xop_ld64(benchmark::State & state,const char * net)740 static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
741 GEMMBenchmark(state,
742 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
743 xnn_init_qu8_conv_minmax_fp32_sse2_params,
744 2, 4, 2, 1,
745 benchmark::utils::CheckXOP);
746 }
qu8_gemm_3x4c2__xop_ld64(benchmark::State & state,const char * net)747 static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
748 GEMMBenchmark(state,
749 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
750 xnn_init_qu8_conv_minmax_fp32_sse2_params,
751 3, 4, 2, 1,
752 benchmark::utils::CheckXOP);
753 }
qu8_gemm_4x4c2__xop_ld64(benchmark::State & state,const char * net)754 static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
755 GEMMBenchmark(state,
756 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
757 xnn_init_qu8_conv_minmax_fp32_sse2_params,
758 4, 4, 2, 1,
759 benchmark::utils::CheckXOP);
760 }
qu8_gemm_1x4c2__xop_ld128(benchmark::State & state,const char * net)761 static void qu8_gemm_1x4c2__xop_ld128(benchmark::State& state, const char* net) {
762 GEMMBenchmark(state,
763 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
764 xnn_init_qu8_conv_minmax_fp32_sse2_params,
765 1, 4, 2, 1,
766 benchmark::utils::CheckXOP);
767 }
qu8_gemm_2x4c2__xop_ld128(benchmark::State & state,const char * net)768 static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
769 GEMMBenchmark(state,
770 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
771 xnn_init_qu8_conv_minmax_fp32_sse2_params,
772 2, 4, 2, 1,
773 benchmark::utils::CheckXOP);
774 }
qu8_gemm_3x4c2__xop_ld128(benchmark::State & state,const char * net)775 static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
776 GEMMBenchmark(state,
777 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
778 xnn_init_qu8_conv_minmax_fp32_sse2_params,
779 3, 4, 2, 1,
780 benchmark::utils::CheckXOP);
781 }
qu8_gemm_4x4c2__xop_ld128(benchmark::State & state,const char * net)782 static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
783 GEMMBenchmark(state,
784 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
785 xnn_init_qu8_conv_minmax_fp32_sse2_params,
786 4, 4, 2, 1,
787 benchmark::utils::CheckXOP);
788 }
qu8_gemm_1x4c8__xop_ld64(benchmark::State & state,const char * net)789 static void qu8_gemm_1x4c8__xop_ld64(benchmark::State& state, const char* net) {
790 GEMMBenchmark(state,
791 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
792 xnn_init_qu8_conv_minmax_fp32_sse2_params,
793 1, 4, 8, 1,
794 benchmark::utils::CheckXOP);
795 }
qu8_gemm_2x4c8__xop_ld64(benchmark::State & state,const char * net)796 static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
797 GEMMBenchmark(state,
798 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
799 xnn_init_qu8_conv_minmax_fp32_sse2_params,
800 2, 4, 8, 1,
801 benchmark::utils::CheckXOP);
802 }
qu8_gemm_3x4c8__xop_ld64(benchmark::State & state,const char * net)803 static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
804 GEMMBenchmark(state,
805 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
806 xnn_init_qu8_conv_minmax_fp32_sse2_params,
807 3, 4, 8, 1,
808 benchmark::utils::CheckXOP);
809 }
qu8_gemm_1x4c8__xop_ld128(benchmark::State & state,const char * net)810 static void qu8_gemm_1x4c8__xop_ld128(benchmark::State& state, const char* net) {
811 GEMMBenchmark(state,
812 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
813 xnn_init_qu8_conv_minmax_fp32_sse2_params,
814 1, 4, 8, 1,
815 benchmark::utils::CheckXOP);
816 }
qu8_gemm_2x4c8__xop_ld128(benchmark::State & state,const char * net)817 static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
818 GEMMBenchmark(state,
819 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
820 xnn_init_qu8_conv_minmax_fp32_sse2_params,
821 2, 4, 8, 1,
822 benchmark::utils::CheckXOP);
823 }
qu8_gemm_3x4c8__xop_ld128(benchmark::State & state,const char * net)824 static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
825 GEMMBenchmark(state,
826 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
827 xnn_init_qu8_conv_minmax_fp32_sse2_params,
828 3, 4, 8, 1,
829 benchmark::utils::CheckXOP);
830 }
qu8_gemm_1x4c2__avx_ld64(benchmark::State & state,const char * net)831 static void qu8_gemm_1x4c2__avx_ld64(benchmark::State& state, const char* net) {
832 GEMMBenchmark(state,
833 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
834 xnn_init_qu8_conv_minmax_fp32_sse2_params,
835 1, 4, 2, 1,
836 benchmark::utils::CheckAVX);
837 }
qu8_gemm_2x4c2__avx_ld64(benchmark::State & state,const char * net)838 static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
839 GEMMBenchmark(state,
840 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
841 xnn_init_qu8_conv_minmax_fp32_sse2_params,
842 2, 4, 2, 1,
843 benchmark::utils::CheckAVX);
844 }
qu8_gemm_3x4c2__avx_ld64(benchmark::State & state,const char * net)845 static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
846 GEMMBenchmark(state,
847 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
848 xnn_init_qu8_conv_minmax_fp32_sse2_params,
849 3, 4, 2, 1,
850 benchmark::utils::CheckAVX);
851 }
qu8_gemm_4x4c2__avx_ld64(benchmark::State & state,const char * net)852 static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
853 GEMMBenchmark(state,
854 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
855 xnn_init_qu8_conv_minmax_fp32_sse2_params,
856 4, 4, 2, 1,
857 benchmark::utils::CheckAVX);
858 }
qu8_gemm_1x4c2__avx_ld128(benchmark::State & state,const char * net)859 static void qu8_gemm_1x4c2__avx_ld128(benchmark::State& state, const char* net) {
860 GEMMBenchmark(state,
861 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
862 xnn_init_qu8_conv_minmax_fp32_sse2_params,
863 1, 4, 2, 1,
864 benchmark::utils::CheckAVX);
865 }
qu8_gemm_2x4c2__avx_ld128(benchmark::State & state,const char * net)866 static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
867 GEMMBenchmark(state,
868 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
869 xnn_init_qu8_conv_minmax_fp32_sse2_params,
870 2, 4, 2, 1,
871 benchmark::utils::CheckAVX);
872 }
qu8_gemm_3x4c2__avx_ld128(benchmark::State & state,const char * net)873 static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
874 GEMMBenchmark(state,
875 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
876 xnn_init_qu8_conv_minmax_fp32_sse2_params,
877 3, 4, 2, 1,
878 benchmark::utils::CheckAVX);
879 }
qu8_gemm_4x4c2__avx_ld128(benchmark::State & state,const char * net)880 static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
881 GEMMBenchmark(state,
882 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
883 xnn_init_qu8_conv_minmax_fp32_sse2_params,
884 4, 4, 2, 1,
885 benchmark::utils::CheckAVX);
886 }
qu8_gemm_1x4c8__avx_ld64(benchmark::State & state,const char * net)887 static void qu8_gemm_1x4c8__avx_ld64(benchmark::State& state, const char* net) {
888 GEMMBenchmark(state,
889 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
890 xnn_init_qu8_conv_minmax_fp32_sse2_params,
891 1, 4, 8, 1,
892 benchmark::utils::CheckAVX);
893 }
qu8_gemm_2x4c8__avx_ld64(benchmark::State & state,const char * net)894 static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
895 GEMMBenchmark(state,
896 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
897 xnn_init_qu8_conv_minmax_fp32_sse2_params,
898 2, 4, 8, 1,
899 benchmark::utils::CheckAVX);
900 }
qu8_gemm_3x4c8__avx_ld64(benchmark::State & state,const char * net)901 static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
902 GEMMBenchmark(state,
903 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
904 xnn_init_qu8_conv_minmax_fp32_sse2_params,
905 3, 4, 8, 1,
906 benchmark::utils::CheckAVX);
907 }
qu8_gemm_1x4c8__avx_ld128(benchmark::State & state,const char * net)908 static void qu8_gemm_1x4c8__avx_ld128(benchmark::State& state, const char* net) {
909 GEMMBenchmark(state,
910 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
911 xnn_init_qu8_conv_minmax_fp32_sse2_params,
912 1, 4, 8, 1,
913 benchmark::utils::CheckAVX);
914 }
qu8_gemm_2x4c8__avx_ld128(benchmark::State & state,const char * net)915 static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
916 GEMMBenchmark(state,
917 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
918 xnn_init_qu8_conv_minmax_fp32_sse2_params,
919 2, 4, 8, 1,
920 benchmark::utils::CheckAVX);
921 }
qu8_gemm_3x4c8__avx_ld128(benchmark::State & state,const char * net)922 static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
923 GEMMBenchmark(state,
924 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
925 xnn_init_qu8_conv_minmax_fp32_sse2_params,
926 3, 4, 8, 1,
927 benchmark::utils::CheckAVX);
928 }
qu8_gemm_1x4c2__sse41_ld64(benchmark::State & state,const char * net)929 static void qu8_gemm_1x4c2__sse41_ld64(benchmark::State& state, const char* net) {
930 GEMMBenchmark(state,
931 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
932 xnn_init_qu8_conv_minmax_fp32_sse2_params,
933 1, 4, 2, 1,
934 benchmark::utils::CheckSSE41);
935 }
qu8_gemm_2x4c2__sse41_ld64(benchmark::State & state,const char * net)936 static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
937 GEMMBenchmark(state,
938 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
939 xnn_init_qu8_conv_minmax_fp32_sse2_params,
940 2, 4, 2, 1,
941 benchmark::utils::CheckSSE41);
942 }
qu8_gemm_3x4c2__sse41_ld64(benchmark::State & state,const char * net)943 static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
944 GEMMBenchmark(state,
945 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
946 xnn_init_qu8_conv_minmax_fp32_sse2_params,
947 3, 4, 2, 1,
948 benchmark::utils::CheckSSE41);
949 }
qu8_gemm_4x4c2__sse41_ld64(benchmark::State & state,const char * net)950 static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
951 GEMMBenchmark(state,
952 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
953 xnn_init_qu8_conv_minmax_fp32_sse2_params,
954 4, 4, 2, 1,
955 benchmark::utils::CheckSSE41);
956 }
qu8_gemm_1x4c2__sse41_ld128(benchmark::State & state,const char * net)957 static void qu8_gemm_1x4c2__sse41_ld128(benchmark::State& state, const char* net) {
958 GEMMBenchmark(state,
959 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
960 xnn_init_qu8_conv_minmax_fp32_sse2_params,
961 1, 4, 2, 1,
962 benchmark::utils::CheckSSE41);
963 }
qu8_gemm_2x4c2__sse41_ld128(benchmark::State & state,const char * net)964 static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
965 GEMMBenchmark(state,
966 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
967 xnn_init_qu8_conv_minmax_fp32_sse2_params,
968 2, 4, 2, 1,
969 benchmark::utils::CheckSSE41);
970 }
qu8_gemm_3x4c2__sse41_ld128(benchmark::State & state,const char * net)971 static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
972 GEMMBenchmark(state,
973 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
974 xnn_init_qu8_conv_minmax_fp32_sse2_params,
975 3, 4, 2, 1,
976 benchmark::utils::CheckSSE41);
977 }
qu8_gemm_4x4c2__sse41_ld128(benchmark::State & state,const char * net)978 static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
979 GEMMBenchmark(state,
980 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
981 xnn_init_qu8_conv_minmax_fp32_sse2_params,
982 4, 4, 2, 1,
983 benchmark::utils::CheckSSE41);
984 }
qu8_gemm_1x4c8__sse41_ld64(benchmark::State & state,const char * net)985 static void qu8_gemm_1x4c8__sse41_ld64(benchmark::State& state, const char* net) {
986 GEMMBenchmark(state,
987 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
988 xnn_init_qu8_conv_minmax_fp32_sse2_params,
989 1, 4, 8, 1,
990 benchmark::utils::CheckSSE41);
991 }
qu8_gemm_2x4c8__sse41_ld64(benchmark::State & state,const char * net)992 static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
993 GEMMBenchmark(state,
994 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
995 xnn_init_qu8_conv_minmax_fp32_sse2_params,
996 2, 4, 8, 1,
997 benchmark::utils::CheckSSE41);
998 }
qu8_gemm_3x4c8__sse41_ld64(benchmark::State & state,const char * net)999 static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
1000 GEMMBenchmark(state,
1001 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
1002 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1003 3, 4, 8, 1,
1004 benchmark::utils::CheckSSE41);
1005 }
qu8_gemm_1x4c8__sse41_ld128(benchmark::State & state,const char * net)1006 static void qu8_gemm_1x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1007 GEMMBenchmark(state,
1008 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
1009 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1010 1, 4, 8, 1,
1011 benchmark::utils::CheckSSE41);
1012 }
qu8_gemm_2x4c8__sse41_ld128(benchmark::State & state,const char * net)1013 static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1014 GEMMBenchmark(state,
1015 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
1016 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1017 2, 4, 8, 1,
1018 benchmark::utils::CheckSSE41);
1019 }
qu8_gemm_3x4c8__sse41_ld128(benchmark::State & state,const char * net)1020 static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1021 GEMMBenchmark(state,
1022 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
1023 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1024 3, 4, 8, 1,
1025 benchmark::utils::CheckSSE41);
1026 }
qu8_gemm_1x4c2__sse2_ld64(benchmark::State & state,const char * net)1027 static void qu8_gemm_1x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1028 GEMMBenchmark(state,
1029 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1030 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1031 1, 4, 2, 1);
1032 }
qu8_gemm_2x4c2__sse2_ld64(benchmark::State & state,const char * net)1033 static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1034 GEMMBenchmark(state,
1035 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1036 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1037 2, 4, 2, 1);
1038 }
qu8_gemm_3x4c2__sse2_ld64(benchmark::State & state,const char * net)1039 static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1040 GEMMBenchmark(state,
1041 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1042 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1043 3, 4, 2, 1);
1044 }
qu8_gemm_4x4c2__sse2_ld64(benchmark::State & state,const char * net)1045 static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1046 GEMMBenchmark(state,
1047 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1048 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1049 4, 4, 2, 1);
1050 }
qu8_gemm_1x4c2__sse2_ld128(benchmark::State & state,const char * net)1051 static void qu8_gemm_1x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1052 GEMMBenchmark(state,
1053 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1054 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1055 1, 4, 2, 1);
1056 }
qu8_gemm_2x4c2__sse2_ld128(benchmark::State & state,const char * net)1057 static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1058 GEMMBenchmark(state,
1059 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1060 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1061 2, 4, 2, 1);
1062 }
qu8_gemm_3x4c2__sse2_ld128(benchmark::State & state,const char * net)1063 static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1064 GEMMBenchmark(state,
1065 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1066 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1067 3, 4, 2, 1);
1068 }
qu8_gemm_4x4c2__sse2_ld128(benchmark::State & state,const char * net)1069 static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1070 GEMMBenchmark(state,
1071 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1072 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1073 4, 4, 2, 1);
1074 }
qu8_gemm_1x4c8__sse2_ld64(benchmark::State & state,const char * net)1075 static void qu8_gemm_1x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1076 GEMMBenchmark(state,
1077 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1078 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1079 1, 4, 8, 1);
1080 }
qu8_gemm_2x4c8__sse2_ld64(benchmark::State & state,const char * net)1081 static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1082 GEMMBenchmark(state,
1083 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1084 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1085 2, 4, 8, 1);
1086 }
qu8_gemm_3x4c8__sse2_ld64(benchmark::State & state,const char * net)1087 static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1088 GEMMBenchmark(state,
1089 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1090 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1091 3, 4, 8, 1);
1092 }
qu8_gemm_1x4c8__sse2_ld128(benchmark::State & state,const char * net)1093 static void qu8_gemm_1x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1094 GEMMBenchmark(state,
1095 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1096 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1097 1, 4, 8, 1);
1098 }
qu8_gemm_2x4c8__sse2_ld128(benchmark::State & state,const char * net)1099 static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1100 GEMMBenchmark(state,
1101 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1102 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1103 2, 4, 8, 1);
1104 }
qu8_gemm_3x4c8__sse2_ld128(benchmark::State & state,const char * net)1105 static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1106 GEMMBenchmark(state,
1107 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1108 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1109 3, 4, 8, 1);
1110 }
1111
// Register the x86 QU8 GEMM wrappers above as benchmarks, grouped by ISA
// (AVX512-SKX, AVX2, XOP, AVX, SSE4.1, SSE2). See bench/gemm.h for the
// BENCHMARK_GEMM expansion.
BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx)
BENCHMARK_GEMM(qu8_gemm_2x16c8__avx512skx)
BENCHMARK_GEMM(qu8_gemm_3x16c8__avx512skx)
BENCHMARK_GEMM(qu8_gemm_4x16c8__avx512skx)

BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2)
BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2)
BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2)

BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld128)
1180 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1181
1182
1183 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1184 static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1185 GEMMBenchmark(state,
1186 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1187 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1188 1, 4, 2, 1);
1189 }
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1190 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1191 GEMMBenchmark(state,
1192 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1193 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1194 2, 4, 2, 1);
1195 }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1196 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1197 GEMMBenchmark(state,
1198 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1199 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1200 3, 4, 2, 1);
1201 }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1202 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1203 GEMMBenchmark(state,
1204 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1205 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1206 4, 4, 2, 1);
1207 }
1208
qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1209 static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1210 GEMMBenchmark(state,
1211 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1212 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1213 1, 4, 2, 1);
1214 }
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1215 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1216 GEMMBenchmark(state,
1217 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1218 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1219 2, 4, 2, 1);
1220 }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1221 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1222 GEMMBenchmark(state,
1223 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1224 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1225 3, 4, 2, 1);
1226 }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1227 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1228 GEMMBenchmark(state,
1229 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1230 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1231 4, 4, 2, 1);
1232 }
1233
qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1234 static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1235 GEMMBenchmark(state,
1236 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1237 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1238 1, 4, 2, 4);
1239 }
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1240 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1241 GEMMBenchmark(state,
1242 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1243 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1244 2, 4, 2, 4);
1245 }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1246 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1247 GEMMBenchmark(state,
1248 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1249 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1250 3, 4, 2, 4);
1251 }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1252 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1253 GEMMBenchmark(state,
1254 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1255 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1256 4, 4, 2, 4);
1257 }
1258
qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1259 static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1260 GEMMBenchmark(state,
1261 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1262 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1263 1, 4, 2, 4);
1264 }
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1265 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1266 GEMMBenchmark(state,
1267 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1268 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1269 2, 4, 2, 4);
1270 }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1271 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1272 GEMMBenchmark(state,
1273 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1274 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1275 3, 4, 2, 4);
1276 }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1277 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1278 GEMMBenchmark(state,
1279 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1280 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1281 4, 4, 2, 4);
1282 }
1283
qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1284 static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1285 GEMMBenchmark(state,
1286 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1287 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1288 1, 4, 8, 1);
1289 }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1290 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1291 GEMMBenchmark(state,
1292 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1293 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1294 2, 4, 8, 1);
1295 }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1296 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1297 GEMMBenchmark(state,
1298 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1299 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1300 3, 4, 8, 1);
1301 }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1302 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1303 GEMMBenchmark(state,
1304 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1305 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1306 4, 4, 8, 1);
1307 }
1308
qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1309 static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1310 GEMMBenchmark(state,
1311 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1312 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1313 1, 4, 8, 1);
1314 }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1315 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1316 GEMMBenchmark(state,
1317 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1318 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1319 2, 4, 8, 1);
1320 }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1321 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1322 GEMMBenchmark(state,
1323 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1324 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1325 3, 4, 8, 1);
1326 }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1327 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1328 GEMMBenchmark(state,
1329 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1330 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1331 4, 4, 8, 1);
1332 }
1333
// Register the WAsm SIMD dot16x2 QU8 GEMM benchmarks with google-benchmark.
// Each BENCHMARK_GEMM entry expands into benchmark registrations for the
// wrapper function of the same name defined above (the c2 variants are
// defined earlier in this file).
BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1361
1362
1363 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1364 static void qu8_gemm_1x2__wasm_fmagic(benchmark::State& state, const char* net) {
1365 GEMMBenchmark(state,
1366 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1367 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1368 1, 2, 1, 1);
1369 }
qu8_gemm_2x2__wasm_fmagic(benchmark::State & state,const char * net)1370 static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, const char* net) {
1371 GEMMBenchmark(state,
1372 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1373 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1374 2, 2, 1, 1);
1375 }
qu8_gemm_3x2__wasm_fmagic(benchmark::State & state,const char * net)1376 static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, const char* net) {
1377 GEMMBenchmark(state,
1378 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1379 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1380 3, 2, 1, 1);
1381 }
qu8_gemm_4x2__wasm_fmagic(benchmark::State & state,const char * net)1382 static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, const char* net) {
1383 GEMMBenchmark(state,
1384 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1385 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1386 4, 2, 1, 1);
1387 }
qu8_gemm_1x4__wasm_fmagic(benchmark::State & state,const char * net)1388 static void qu8_gemm_1x4__wasm_fmagic(benchmark::State& state, const char* net) {
1389 GEMMBenchmark(state,
1390 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1391 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1392 1, 4, 1, 1);
1393 }
qu8_gemm_2x4__wasm_fmagic(benchmark::State & state,const char * net)1394 static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, const char* net) {
1395 GEMMBenchmark(state,
1396 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1397 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1398 2, 4, 1, 1);
1399 }
qu8_gemm_3x4__wasm_fmagic(benchmark::State & state,const char * net)1400 static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, const char* net) {
1401 GEMMBenchmark(state,
1402 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1403 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1404 3, 4, 1, 1);
1405 }
qu8_gemm_4x4__wasm_fmagic(benchmark::State & state,const char * net)1406 static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, const char* net) {
1407 GEMMBenchmark(state,
1408 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1409 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1410 4, 4, 1, 1);
1411 }
1412
// Register the scalar WAsm "fmagic" QU8 GEMM benchmarks with google-benchmark.
BENCHMARK_GEMM(qu8_gemm_1x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__wasm_fmagic)
#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1422
1423
1424 static void qu8_gemm_1x2__scalar_fmagic(benchmark::State& state, const char* net) {
1425 GEMMBenchmark(state,
1426 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1427 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1428 1, 2, 1, 1);
1429 }
qu8_gemm_2x2__scalar_fmagic(benchmark::State & state,const char * net)1430 static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, const char* net) {
1431 GEMMBenchmark(state,
1432 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1433 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1434 2, 2, 1, 1);
1435 }
qu8_gemm_3x2__scalar_fmagic(benchmark::State & state,const char * net)1436 static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, const char* net) {
1437 GEMMBenchmark(state,
1438 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1439 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1440 3, 2, 1, 1);
1441 }
qu8_gemm_4x2__scalar_fmagic(benchmark::State & state,const char * net)1442 static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, const char* net) {
1443 GEMMBenchmark(state,
1444 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1445 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1446 4, 2, 1, 1);
1447 }
qu8_gemm_1x4__scalar_fmagic(benchmark::State & state,const char * net)1448 static void qu8_gemm_1x4__scalar_fmagic(benchmark::State& state, const char* net) {
1449 GEMMBenchmark(state,
1450 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1451 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1452 1, 4, 1, 1);
1453 }
qu8_gemm_2x4__scalar_fmagic(benchmark::State & state,const char * net)1454 static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, const char* net) {
1455 GEMMBenchmark(state,
1456 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1457 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1458 2, 4, 1, 1);
1459 }
qu8_gemm_3x4__scalar_fmagic(benchmark::State & state,const char * net)1460 static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, const char* net) {
1461 GEMMBenchmark(state,
1462 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1463 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1464 3, 4, 1, 1);
1465 }
qu8_gemm_4x4__scalar_fmagic(benchmark::State & state,const char * net)1466 static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, const char* net) {
1467 GEMMBenchmark(state,
1468 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1469 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1470 4, 4, 1, 1);
1471 }
1472
qu8_gemm_1x2__scalar_imagic(benchmark::State & state,const char * net)1473 static void qu8_gemm_1x2__scalar_imagic(benchmark::State& state, const char* net) {
1474 GEMMBenchmark(state,
1475 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1476 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1477 1, 2, 1, 1);
1478 }
qu8_gemm_2x2__scalar_imagic(benchmark::State & state,const char * net)1479 static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, const char* net) {
1480 GEMMBenchmark(state,
1481 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1482 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1483 2, 2, 1, 1);
1484 }
qu8_gemm_3x2__scalar_imagic(benchmark::State & state,const char * net)1485 static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, const char* net) {
1486 GEMMBenchmark(state,
1487 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1488 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1489 3, 2, 1, 1);
1490 }
qu8_gemm_4x2__scalar_imagic(benchmark::State & state,const char * net)1491 static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, const char* net) {
1492 GEMMBenchmark(state,
1493 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1494 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1495 4, 2, 1, 1);
1496 }
qu8_gemm_1x4__scalar_imagic(benchmark::State & state,const char * net)1497 static void qu8_gemm_1x4__scalar_imagic(benchmark::State& state, const char* net) {
1498 GEMMBenchmark(state,
1499 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1500 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1501 1, 4, 1, 1);
1502 }
qu8_gemm_2x4__scalar_imagic(benchmark::State & state,const char * net)1503 static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, const char* net) {
1504 GEMMBenchmark(state,
1505 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1506 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1507 2, 4, 1, 1);
1508 }
qu8_gemm_3x4__scalar_imagic(benchmark::State & state,const char * net)1509 static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, const char* net) {
1510 GEMMBenchmark(state,
1511 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1512 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1513 3, 4, 1, 1);
1514 }
qu8_gemm_4x4__scalar_imagic(benchmark::State & state,const char * net)1515 static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, const char* net) {
1516 GEMMBenchmark(state,
1517 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1518 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1519 4, 4, 1, 1);
1520 }
1521
qu8_gemm_1x2__scalar_lrintf(benchmark::State & state,const char * net)1522 static void qu8_gemm_1x2__scalar_lrintf(benchmark::State& state, const char* net) {
1523 GEMMBenchmark(state,
1524 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1525 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1526 1, 2, 1, 1);
1527 }
qu8_gemm_2x2__scalar_lrintf(benchmark::State & state,const char * net)1528 static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, const char* net) {
1529 GEMMBenchmark(state,
1530 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1531 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1532 2, 2, 1, 1);
1533 }
qu8_gemm_3x2__scalar_lrintf(benchmark::State & state,const char * net)1534 static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, const char* net) {
1535 GEMMBenchmark(state,
1536 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1537 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1538 3, 2, 1, 1);
1539 }
qu8_gemm_4x2__scalar_lrintf(benchmark::State & state,const char * net)1540 static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, const char* net) {
1541 GEMMBenchmark(state,
1542 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1543 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1544 4, 2, 1, 1);
1545 }
qu8_gemm_1x4__scalar_lrintf(benchmark::State & state,const char * net)1546 static void qu8_gemm_1x4__scalar_lrintf(benchmark::State& state, const char* net) {
1547 GEMMBenchmark(state,
1548 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1549 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1550 1, 4, 1, 1);
1551 }
qu8_gemm_2x4__scalar_lrintf(benchmark::State & state,const char * net)1552 static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, const char* net) {
1553 GEMMBenchmark(state,
1554 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1555 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1556 2, 4, 1, 1);
1557 }
qu8_gemm_3x4__scalar_lrintf(benchmark::State & state,const char * net)1558 static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, const char* net) {
1559 GEMMBenchmark(state,
1560 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1561 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1562 3, 4, 1, 1);
1563 }
qu8_gemm_4x4__scalar_lrintf(benchmark::State & state,const char * net)1564 static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, const char* net) {
1565 GEMMBenchmark(state,
1566 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1567 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1568 4, 4, 1, 1);
1569 }
1570
// Register the portable scalar QU8 GEMM benchmarks (fmagic / imagic / lrintf
// rounding variants) with google-benchmark.
BENCHMARK_GEMM(qu8_gemm_1x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_fmagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_imagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_lrintf)


// Optional reference GEMM implementations (ruy, gemmlowp) for comparison,
// registered only when enabled at build time.
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY
#ifdef BENCHMARK_GEMMLOWP
BENCHMARK_GEMM(gemmlowp_st)
#endif  // BENCHMARK_GEMMLOWP

// XNNPACK_BENCHMARK_NO_MAIN lets a larger benchmark binary supply its own
// main() instead of the one generated by BENCHMARK_MAIN().
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1609