xref: /aosp_15_r20/external/XNNPACK/bench/qu8-gemm.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <chrono>
12 #include <cmath>
13 #include <functional>
14 #include <limits>
15 #include <mutex>
16 #include <random>
17 #include <vector>
18 
19 #include <cpuinfo.h>
20 
21 #include <benchmark/benchmark.h>
22 #ifdef BENCHMARK_GEMMLOWP
23 #include "gemmlowp/public/gemmlowp.h"
24 #endif  // BENCHMARK_GEMMLOWP
25 #ifdef BENCHMARK_RUY
26 #include "ruy/ruy.h"
27 #endif  // BENCHMARK_RUY
28 #include "bench/gemm.h"
29 #include "bench/utils.h"
30 
31 #include <xnnpack.h>
32 #include <xnnpack/aligned-allocator.h>
33 #include <xnnpack/common.h>
34 #include <xnnpack/gemm.h>
35 #include <xnnpack/math.h>
36 #include <xnnpack/microfnptr.h>
37 #include <xnnpack/microparams-init.h>
38 #include <xnnpack/pack.h>
39 
40 
GEMMBenchmark(benchmark::State & state,xnn_qu8_gemm_minmax_ukernel_function gemm,xnn_init_qu8_conv_minmax_params_fn init_params,size_t mr,size_t nr,size_t kr,size_t sr,benchmark::utils::IsaCheckFunction isa_check=nullptr)41 static void GEMMBenchmark(benchmark::State& state,
42   xnn_qu8_gemm_minmax_ukernel_function gemm,
43   xnn_init_qu8_conv_minmax_params_fn init_params,
44   size_t mr, size_t nr, size_t kr, size_t sr,
45   benchmark::utils::IsaCheckFunction isa_check = nullptr)
46 {
47   if (!cpuinfo_initialize()) {
48     state.SkipWithError("cpuinfo initialization failed");
49     return;
50   }
51   if (isa_check && !isa_check(state)) {
52     return;
53   }
54 
55   const size_t mc = state.range(0);
56   const size_t nc = state.range(1);
57   const size_t kc = state.range(2);
58 
59   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
60   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
61 
62   std::random_device random_device;
63   auto rng = std::mt19937(random_device());
64   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
65   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
66 
67   std::vector<uint8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint8_t));
68   std::generate(a.begin(), a.end(), std::ref(u8rng));
69   std::vector<uint8_t> k(nc * kc);
70   std::generate(k.begin(), k.end(), std::ref(u8rng));
71   std::vector<int32_t> b(nc);
72   std::generate(b.begin(), b.end(), std::ref(i32rng));
73 
74   const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
75   const size_t c_elements = mc * nc;
76   const size_t num_buffers = 1 +
77     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
78       sizeof(uint8_t) * (w_elements + c_elements));
79 
80   std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> w(w_elements * num_buffers);
81   std::fill(w.begin(), w.end(), 0);
82   const xnn_qu8_packing_params packing_params = { 127, 127 };
83   xnn_pack_qu8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
84   std::vector<uint8_t> c(c_elements * num_buffers);
85   std::fill(c.begin(), c.end(), 0xA5);
86 
87   union xnn_qu8_conv_minmax_params quantization_params;
88   init_params(&quantization_params, 127, 0.75f, 127, 1, 254);
89 
90   size_t buffer_index = 0;
91   for (auto _ : state) {
92     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
93     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
94     // - W is not in cache (for any cache level)
95     // - C is not in cache (for any cache level)
96     state.PauseTiming();
97     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
98     buffer_index = (buffer_index + 1) % num_buffers;
99     state.ResumeTiming();
100 
101     for (uint32_t m = 0; m < mc; m += mr) {
102       const uint32_t mb = min(mc - m, mr);
103       for (uint32_t n = 0; n < nc; n += nr) {
104         const uint32_t nb = min(nc - n, nr);
105         gemm(
106           mb, nb, kc * sizeof(uint8_t),
107           a.data() + m * kc, kc * sizeof(uint8_t),
108           w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
109           c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
110           &quantization_params);
111       }
112     }
113   }
114 
115   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
116   if (cpu_frequency != 0) {
117     state.counters["cpufreq"] = cpu_frequency;
118   }
119 
120   state.counters["OPS"] = benchmark::Counter(
121     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
122 }
123 
124 #ifdef BENCHMARK_GEMMLOWP
125 struct GemmlowpOutputPipeline {
126   typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
127   typedef std::tuple<
128       gemmlowp::OutputStageBiasAddition<ColVectorMap>,
129       gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
130       gemmlowp::OutputStageClamp,
131       gemmlowp::OutputStageSaturatingCastToUint8>
132       Pipeline;
133 
MakeGemmlowpOutputPipeline134   static Pipeline Make(
135       const int32_t* bias_data,
136       int output_rows,
137       int32_t output_offset,
138       int32_t output_multiplier,
139       int output_shift,
140       int32_t output_activation_min,
141       int32_t output_activation_max)
142   {
143     ColVectorMap bias_vector(bias_data, output_rows);
144     gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
145     bias_addition_stage.bias_vector = bias_vector;
146     gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
147     quantize_down_stage.result_offset_after_shift = output_offset;
148     quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
149     quantize_down_stage.result_shift = output_shift;
150     gemmlowp::OutputStageClamp clamp_stage;
151     clamp_stage.min = output_activation_min;
152     clamp_stage.max = output_activation_max;
153     gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
154     return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
155   }
156 };
157 
GemmlowpBenchmark(benchmark::State & state,uint32_t threads)158 static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
159 {
160   const size_t mc = state.range(0);
161   const size_t nc = state.range(1);
162   const size_t kc = state.range(2);
163 
164   std::random_device random_device;
165   auto rng = std::mt19937(random_device());
166   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
167   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
168 
169   std::vector<uint8_t> a(mc * kc);
170   std::generate(a.begin(), a.end(), std::ref(u8rng));
171 
172   const size_t kElements = nc * kc;
173   const size_t bElements = nc;
174   const size_t c_elements = mc * nc;
175   const size_t num_buffers = 1 +
176     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
177       kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
178 
179   std::vector<uint8_t> k(kElements * num_buffers);
180   std::generate(k.begin(), k.end(), std::ref(u8rng));
181   std::vector<int32_t> b(bElements * num_buffers);
182   std::generate(b.begin(), b.end(), std::ref(i32rng));
183   std::vector<uint8_t> c(c_elements * num_buffers);
184   std::fill(c.begin(), c.end(), 0xA5);
185 
186   gemmlowp::MultiThreadGemmContext threadingContext;
187   threadingContext.set_max_num_threads(threads);
188 
189   size_t buffer_index = 0;
190   for (auto _ : state) {
191     state.PauseTiming();
192     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
193     buffer_index = (buffer_index + 1) % num_buffers;
194     state.ResumeTiming();
195 
196     gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
197     gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
198     gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
199     const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
200     gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
201         &threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
202   }
203 
204   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
205   if (cpu_frequency != 0) {
206     state.counters["cpufreq"] = cpu_frequency;
207   }
208 
209   state.counters["OPS"] = benchmark::Counter(
210     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
211 }
212 
gemmlowp_st(benchmark::State & state,const char * net)213 static void gemmlowp_st(benchmark::State& state, const char* net)
214 {
215   GemmlowpBenchmark(state, 1);
216 }
217 #endif  // BENCHMARK_GEMMLOWP
218 
219 
220 #ifdef BENCHMARK_RUY
RuyBenchmark(benchmark::State & state,size_t threads)221 static void RuyBenchmark(benchmark::State& state, size_t threads)
222 {
223   const size_t mc = state.range(0);
224   const size_t nc = state.range(1);
225   const size_t kc = state.range(2);
226 
227   std::random_device random_device;
228   auto rng = std::mt19937(random_device());
229   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
230   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
231 
232   const size_t num_buffers = 1 +
233     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
234       nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
235 
236   std::vector<uint8_t> a(mc * kc);
237   std::generate(a.begin(), a.end(), std::ref(u8rng));
238   std::vector<uint8_t> k(num_buffers * nc * kc);
239   std::generate(k.begin(), k.end(), std::ref(u8rng));
240   std::vector<int32_t> b(num_buffers * nc);
241   std::generate(b.begin(), b.end(), std::ref(i32rng));
242   std::vector<uint8_t> c(num_buffers * nc * mc);
243   std::fill(c.begin(), c.end(), std::nanf(""));
244 
245   // Note: context must be static to avoid the cost of re-creating it for each benchmark.
246   static ruy::Context context;
247   context.set_max_num_threads(threads);
248 
249   ruy::Matrix<uint8_t> ruy_a;
250   ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
251   ruy_a.set_zero_point(127);
252   ruy::Matrix<uint8_t> ruy_b;
253   ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
254   ruy_b.set_data(a.data());
255   ruy_b.set_zero_point(127);
256   ruy::Matrix<uint8_t> ruy_c;
257   ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
258   ruy_c.set_zero_point(127);
259 
260   ruy::MulParams<int32_t, uint8_t> mul_params;
261   mul_params.set_multiplier_fixedpoint(0x40000000);
262 
263   // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
264   // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
265   // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
266   // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
267   static std::once_flag warmup;
268   std::call_once(warmup, [&](){
269     auto start = std::chrono::steady_clock::now();
270     do {
271       ruy_a.set_data(k.data());
272       ruy_c.set_data(c.data());
273       mul_params.set_bias(b.data());
274 
275       ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
276     } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
277   });
278 
279   size_t buffer_index = 0;
280   for (auto _ : state) {
281     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
282     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
283     // - K is not in cache (for any cache level)
284     // - B is not in cache (for any cache level)
285     // - C is not in cache (for any cache level)
286     state.PauseTiming();
287     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
288     buffer_index = (buffer_index + 1) % num_buffers;
289     state.ResumeTiming();
290 
291     ruy_a.set_data(k.data() + buffer_index * nc * kc);
292     ruy_c.set_data(c.data() + buffer_index * mc * nc);
293     mul_params.set_bias(b.data() + buffer_index * nc);
294 
295     ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
296   }
297 
298   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
299   if (cpu_frequency != 0) {
300     state.counters["cpufreq"] = cpu_frequency;
301   }
302 
303   state.counters["OPS"] = benchmark::Counter(
304     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
305 }
306 
ruy_st(benchmark::State & state,const char * net)307 static void ruy_st(benchmark::State& state, const char* net)
308 {
309   RuyBenchmark(state, 1);
310 }
311 #endif  // BENCHMARK_RUY
312 
313 
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  // AArch32 assembly microkernel wrappers; mr/nr/kr/sr below match each kernel's tile.
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 1, 1, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x8__aarch32_neon_mlal_lane_cortex_a7)
  BENCHMARK_GEMM(qu8_gemm_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
369 
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  // AArch64 assembly microkernel wrappers; mr/nr/kr/sr below match each kernel's tile.
  static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1,
      benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55)
  BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
449 
450 
#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
  // NEON dot-product intrinsics microkernel wrappers; mr/nr/kr/sr match each tile.
  static void qu8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }

  BENCHMARK_GEMM(qu8_gemm_1x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x32c4__neondot)
#endif  // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
573 
574 
575 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Benchmarks for the qu8 NEON MLAL-lane micro-kernels (rndnu requantization).
  // Each wrapper qu8_gemm_<mr>x<nr>__... binds one micro-kernel plus its tile
  // parameters (mr, nr, kr, sr) to GEMMBenchmark; the numeric arguments must
  // match the tile encoded in the kernel's name.
  static void qu8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 16, 1, 1, benchmark::utils::CheckNEON);
  }

  // Register the wrappers above as benchmarks (BENCHMARK_GEMM comes from
  // bench/gemm.h, included at the top of this file).
  BENCHMARK_GEMM(qu8_gemm_1x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_2x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_3x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_6x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_1x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_2x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_3x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_6x16__neon_mlal_lane)
647 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
648 
649 
650 #if XNN_ARCH_ARM
651   static void qu8_gemm_1x1c4__armsimd32(benchmark::State& state, const char* net) {
652     GEMMBenchmark(state,
653       xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
654       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
655       1, 1, 4, 1, benchmark::utils::CheckARMV6);
656   }
qu8_gemm_2x1c4__armsimd32(benchmark::State & state,const char * net)657   static void qu8_gemm_2x1c4__armsimd32(benchmark::State& state, const char* net) {
658     GEMMBenchmark(state,
659       xnn_qu8_gemm_minmax_fp32_ukernel_1x1c4__armsimd32,
660       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
661       2, 1, 4, 1, benchmark::utils::CheckARMV6);
662   }
qu8_gemm_1x2c4__armsimd32(benchmark::State & state,const char * net)663   static void qu8_gemm_1x2c4__armsimd32(benchmark::State& state, const char* net) {
664     GEMMBenchmark(state,
665       xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
666       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
667       1, 2, 4, 1, benchmark::utils::CheckARMV6);
668   }
qu8_gemm_2x2c4__armsimd32(benchmark::State & state,const char * net)669   static void qu8_gemm_2x2c4__armsimd32(benchmark::State& state, const char* net) {
670     GEMMBenchmark(state,
671       xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32,
672       xnn_init_qu8_conv_minmax_fp32_armsimd32_params,
673       2, 2, 4, 1, benchmark::utils::CheckARMV6);
674   }
675 
676   BENCHMARK_GEMM(qu8_gemm_1x1c4__armsimd32)
BENCHMARK_GEMM(qu8_gemm_2x1c4__armsimd32)677   BENCHMARK_GEMM(qu8_gemm_2x1c4__armsimd32)
678   BENCHMARK_GEMM(qu8_gemm_1x2c4__armsimd32)
679   BENCHMARK_GEMM(qu8_gemm_2x2c4__armsimd32)
680 #endif  // XNN_ARCH_ARM
681 
682 
683 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Benchmarks for the qu8 AVX512-SKX (16x16c8) and AVX2 (Mx8c8) micro-kernels
  // with fp32 requantization.  mr/nr/kr/sr must match the kernel name's tile.
  static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      1, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      2, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      3, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      4, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      1, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      2, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      3, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
qu8_gemm_1x4c2__xop_ld64(benchmark::State & state,const char * net)733   static void qu8_gemm_1x4c2__xop_ld64(benchmark::State& state, const char* net) {
734     GEMMBenchmark(state,
735       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
736       xnn_init_qu8_conv_minmax_fp32_sse2_params,
737       1, 4, 2, 1,
738       benchmark::utils::CheckXOP);
739   }
qu8_gemm_2x4c2__xop_ld64(benchmark::State & state,const char * net)740   static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
741     GEMMBenchmark(state,
742       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
743       xnn_init_qu8_conv_minmax_fp32_sse2_params,
744       2, 4, 2, 1,
745       benchmark::utils::CheckXOP);
746   }
qu8_gemm_3x4c2__xop_ld64(benchmark::State & state,const char * net)747   static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
748     GEMMBenchmark(state,
749       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
750       xnn_init_qu8_conv_minmax_fp32_sse2_params,
751       3, 4, 2, 1,
752       benchmark::utils::CheckXOP);
753   }
qu8_gemm_4x4c2__xop_ld64(benchmark::State & state,const char * net)754   static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
755     GEMMBenchmark(state,
756       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
757       xnn_init_qu8_conv_minmax_fp32_sse2_params,
758       4, 4, 2, 1,
759       benchmark::utils::CheckXOP);
760   }
qu8_gemm_1x4c2__xop_ld128(benchmark::State & state,const char * net)761   static void qu8_gemm_1x4c2__xop_ld128(benchmark::State& state, const char* net) {
762     GEMMBenchmark(state,
763       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
764       xnn_init_qu8_conv_minmax_fp32_sse2_params,
765       1, 4, 2, 1,
766       benchmark::utils::CheckXOP);
767   }
qu8_gemm_2x4c2__xop_ld128(benchmark::State & state,const char * net)768   static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
769     GEMMBenchmark(state,
770       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
771       xnn_init_qu8_conv_minmax_fp32_sse2_params,
772       2, 4, 2, 1,
773       benchmark::utils::CheckXOP);
774   }
qu8_gemm_3x4c2__xop_ld128(benchmark::State & state,const char * net)775   static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
776     GEMMBenchmark(state,
777       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
778       xnn_init_qu8_conv_minmax_fp32_sse2_params,
779       3, 4, 2, 1,
780       benchmark::utils::CheckXOP);
781   }
qu8_gemm_4x4c2__xop_ld128(benchmark::State & state,const char * net)782   static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
783     GEMMBenchmark(state,
784       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
785       xnn_init_qu8_conv_minmax_fp32_sse2_params,
786       4, 4, 2, 1,
787       benchmark::utils::CheckXOP);
788   }
  // Benchmarks for the qu8 XOP micro-kernels with 8-element channel blocking
  // (c8); kr=8 matches the kernel name.
  static void qu8_gemm_1x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
qu8_gemm_1x4c2__avx_ld64(benchmark::State & state,const char * net)831   static void qu8_gemm_1x4c2__avx_ld64(benchmark::State& state, const char* net) {
832     GEMMBenchmark(state,
833       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
834       xnn_init_qu8_conv_minmax_fp32_sse2_params,
835       1, 4, 2, 1,
836       benchmark::utils::CheckAVX);
837   }
qu8_gemm_2x4c2__avx_ld64(benchmark::State & state,const char * net)838   static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
839     GEMMBenchmark(state,
840       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
841       xnn_init_qu8_conv_minmax_fp32_sse2_params,
842       2, 4, 2, 1,
843       benchmark::utils::CheckAVX);
844   }
qu8_gemm_3x4c2__avx_ld64(benchmark::State & state,const char * net)845   static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
846     GEMMBenchmark(state,
847       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
848       xnn_init_qu8_conv_minmax_fp32_sse2_params,
849       3, 4, 2, 1,
850       benchmark::utils::CheckAVX);
851   }
qu8_gemm_4x4c2__avx_ld64(benchmark::State & state,const char * net)852   static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
853     GEMMBenchmark(state,
854       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
855       xnn_init_qu8_conv_minmax_fp32_sse2_params,
856       4, 4, 2, 1,
857       benchmark::utils::CheckAVX);
858   }
qu8_gemm_1x4c2__avx_ld128(benchmark::State & state,const char * net)859   static void qu8_gemm_1x4c2__avx_ld128(benchmark::State& state, const char* net) {
860     GEMMBenchmark(state,
861       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
862       xnn_init_qu8_conv_minmax_fp32_sse2_params,
863       1, 4, 2, 1,
864       benchmark::utils::CheckAVX);
865   }
qu8_gemm_2x4c2__avx_ld128(benchmark::State & state,const char * net)866   static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
867     GEMMBenchmark(state,
868       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
869       xnn_init_qu8_conv_minmax_fp32_sse2_params,
870       2, 4, 2, 1,
871       benchmark::utils::CheckAVX);
872   }
qu8_gemm_3x4c2__avx_ld128(benchmark::State & state,const char * net)873   static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
874     GEMMBenchmark(state,
875       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
876       xnn_init_qu8_conv_minmax_fp32_sse2_params,
877       3, 4, 2, 1,
878       benchmark::utils::CheckAVX);
879   }
qu8_gemm_4x4c2__avx_ld128(benchmark::State & state,const char * net)880   static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
881     GEMMBenchmark(state,
882       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
883       xnn_init_qu8_conv_minmax_fp32_sse2_params,
884       4, 4, 2, 1,
885       benchmark::utils::CheckAVX);
886   }
  // Benchmarks for the qu8 AVX micro-kernels with 8-element channel blocking
  // (c8); kr=8 matches the kernel name.
  static void qu8_gemm_1x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
qu8_gemm_1x4c2__sse41_ld64(benchmark::State & state,const char * net)929   static void qu8_gemm_1x4c2__sse41_ld64(benchmark::State& state, const char* net) {
930     GEMMBenchmark(state,
931       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
932       xnn_init_qu8_conv_minmax_fp32_sse2_params,
933       1, 4, 2, 1,
934       benchmark::utils::CheckSSE41);
935   }
qu8_gemm_2x4c2__sse41_ld64(benchmark::State & state,const char * net)936   static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
937     GEMMBenchmark(state,
938       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
939       xnn_init_qu8_conv_minmax_fp32_sse2_params,
940       2, 4, 2, 1,
941       benchmark::utils::CheckSSE41);
942   }
qu8_gemm_3x4c2__sse41_ld64(benchmark::State & state,const char * net)943   static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
944     GEMMBenchmark(state,
945       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
946       xnn_init_qu8_conv_minmax_fp32_sse2_params,
947       3, 4, 2, 1,
948       benchmark::utils::CheckSSE41);
949   }
qu8_gemm_4x4c2__sse41_ld64(benchmark::State & state,const char * net)950   static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
951     GEMMBenchmark(state,
952       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
953       xnn_init_qu8_conv_minmax_fp32_sse2_params,
954       4, 4, 2, 1,
955       benchmark::utils::CheckSSE41);
956   }
qu8_gemm_1x4c2__sse41_ld128(benchmark::State & state,const char * net)957   static void qu8_gemm_1x4c2__sse41_ld128(benchmark::State& state, const char* net) {
958     GEMMBenchmark(state,
959       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
960       xnn_init_qu8_conv_minmax_fp32_sse2_params,
961       1, 4, 2, 1,
962       benchmark::utils::CheckSSE41);
963   }
qu8_gemm_2x4c2__sse41_ld128(benchmark::State & state,const char * net)964   static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
965     GEMMBenchmark(state,
966       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
967       xnn_init_qu8_conv_minmax_fp32_sse2_params,
968       2, 4, 2, 1,
969       benchmark::utils::CheckSSE41);
970   }
qu8_gemm_3x4c2__sse41_ld128(benchmark::State & state,const char * net)971   static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
972     GEMMBenchmark(state,
973       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
974       xnn_init_qu8_conv_minmax_fp32_sse2_params,
975       3, 4, 2, 1,
976       benchmark::utils::CheckSSE41);
977   }
qu8_gemm_4x4c2__sse41_ld128(benchmark::State & state,const char * net)978   static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
979     GEMMBenchmark(state,
980       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
981       xnn_init_qu8_conv_minmax_fp32_sse2_params,
982       4, 4, 2, 1,
983       benchmark::utils::CheckSSE41);
984   }
  // Benchmarks for the qu8 SSE4.1 micro-kernels with 8-element channel
  // blocking (c8); kr=8 matches the kernel name.
  static void qu8_gemm_1x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
qu8_gemm_1x4c2__sse2_ld64(benchmark::State & state,const char * net)1027   static void qu8_gemm_1x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1028     GEMMBenchmark(state,
1029       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1030       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1031       1, 4, 2, 1);
1032   }
qu8_gemm_2x4c2__sse2_ld64(benchmark::State & state,const char * net)1033   static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1034     GEMMBenchmark(state,
1035       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1036       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1037       2, 4, 2, 1);
1038   }
qu8_gemm_3x4c2__sse2_ld64(benchmark::State & state,const char * net)1039   static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1040     GEMMBenchmark(state,
1041       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1042       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1043       3, 4, 2, 1);
1044   }
qu8_gemm_4x4c2__sse2_ld64(benchmark::State & state,const char * net)1045   static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1046     GEMMBenchmark(state,
1047       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1048       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1049       4, 4, 2, 1);
1050   }
qu8_gemm_1x4c2__sse2_ld128(benchmark::State & state,const char * net)1051   static void qu8_gemm_1x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1052     GEMMBenchmark(state,
1053       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1054       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1055       1, 4, 2, 1);
1056   }
qu8_gemm_2x4c2__sse2_ld128(benchmark::State & state,const char * net)1057   static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1058     GEMMBenchmark(state,
1059       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1060       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1061       2, 4, 2, 1);
1062   }
qu8_gemm_3x4c2__sse2_ld128(benchmark::State & state,const char * net)1063   static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1064     GEMMBenchmark(state,
1065       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1066       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1067       3, 4, 2, 1);
1068   }
qu8_gemm_4x4c2__sse2_ld128(benchmark::State & state,const char * net)1069   static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1070     GEMMBenchmark(state,
1071       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1072       xnn_init_qu8_conv_minmax_fp32_sse2_params,
1073       4, 4, 2, 1);
1074   }
  // Benchmarks for the qu8 SSE2 micro-kernels with 8-element channel blocking
  // (c8); kr=8 matches the kernel name, and no ISA check is required.
  static void qu8_gemm_1x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_1x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }
1111 
  // Register all x86/x86-64 benchmarks defined above, grouped by ISA
  // (BENCHMARK_GEMM comes from bench/gemm.h, included at the top of this file).
  BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_2x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_3x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_4x16c8__avx512skx)

  BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld128)
1180 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1181 
1182 
1183 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // Benchmarks for the qu8 WAsm SIMD dot16x2 micro-kernels: c2 variants use
  // 2-element channel blocking (kr=2, sr=1); the c2s4 variants below add a
  // 4-element channel shuffle (sr=4).  mr/nr/kr/sr match each kernel name.
  static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 2, 1);
  }

  static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 2, 1);
  }

  static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 2, 4);
  }
  static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 2, 4);
  }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1246   static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1247     GEMMBenchmark(state,
1248       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1249       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1250       3, 4, 2, 4);
1251   }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1252   static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1253     GEMMBenchmark(state,
1254       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1255       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1256       4, 4, 2, 4);
1257   }
1258 
qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1259   static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1260     GEMMBenchmark(state,
1261       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1262       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1263       1, 4, 2, 4);
1264   }
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1265   static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1266     GEMMBenchmark(state,
1267       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1268       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1269       2, 4, 2, 4);
1270   }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1271   static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1272     GEMMBenchmark(state,
1273       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1274       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1275       3, 4, 2, 4);
1276   }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1277   static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1278     GEMMBenchmark(state,
1279       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1280       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1281       4, 4, 2, 4);
1282   }
1283 
qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1284   static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1285     GEMMBenchmark(state,
1286       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1287       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1288       1, 4, 8, 1);
1289   }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1290   static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1291     GEMMBenchmark(state,
1292       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1293       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1294       2, 4, 8, 1);
1295   }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1296   static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1297     GEMMBenchmark(state,
1298       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1299       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1300       3, 4, 8, 1);
1301   }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1302   static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1303     GEMMBenchmark(state,
1304       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1305       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1306       4, 4, 8, 1);
1307   }
1308 
qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1309   static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1310     GEMMBenchmark(state,
1311       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1312       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1313       1, 4, 8, 1);
1314   }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1315   static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1316     GEMMBenchmark(state,
1317       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1318       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1319       2, 4, 8, 1);
1320   }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1321   static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1322     GEMMBenchmark(state,
1323       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1324       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1325       3, 4, 8, 1);
1326   }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1327   static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1328     GEMMBenchmark(state,
1329       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1330       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1331       4, 4, 8, 1);
1332   }
1333 
  // Register each WAsm SIMD wrapper above with the benchmark harness.
  // BENCHMARK_GEMM is presumably defined in bench/gemm.h (included at the top
  // of this file) and expands into Google Benchmark registrations -- TODO
  // confirm the exact shape set it sweeps.
  BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
1360 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1361 
1362 
1363 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1364   static void qu8_gemm_1x2__wasm_fmagic(benchmark::State& state, const char* net) {
1365     GEMMBenchmark(state,
1366       xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1367       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1368       1, 2, 1, 1);
1369   }
qu8_gemm_2x2__wasm_fmagic(benchmark::State & state,const char * net)1370   static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, const char* net) {
1371     GEMMBenchmark(state,
1372       xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1373       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1374       2, 2, 1, 1);
1375   }
qu8_gemm_3x2__wasm_fmagic(benchmark::State & state,const char * net)1376   static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, const char* net) {
1377     GEMMBenchmark(state,
1378       xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1379       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1380       3, 2, 1, 1);
1381   }
qu8_gemm_4x2__wasm_fmagic(benchmark::State & state,const char * net)1382   static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, const char* net) {
1383     GEMMBenchmark(state,
1384       xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1385       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1386       4, 2, 1, 1);
1387   }
qu8_gemm_1x4__wasm_fmagic(benchmark::State & state,const char * net)1388   static void qu8_gemm_1x4__wasm_fmagic(benchmark::State& state, const char* net) {
1389     GEMMBenchmark(state,
1390       xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1391       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1392       1, 4, 1, 1);
1393   }
qu8_gemm_2x4__wasm_fmagic(benchmark::State & state,const char * net)1394   static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, const char* net) {
1395     GEMMBenchmark(state,
1396       xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1397       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1398       2, 4, 1, 1);
1399   }
qu8_gemm_3x4__wasm_fmagic(benchmark::State & state,const char * net)1400   static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, const char* net) {
1401     GEMMBenchmark(state,
1402       xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1403       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1404       3, 4, 1, 1);
1405   }
qu8_gemm_4x4__wasm_fmagic(benchmark::State & state,const char * net)1406   static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, const char* net) {
1407     GEMMBenchmark(state,
1408       xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1409       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1410       4, 4, 1, 1);
1411   }
1412 
  // Register the plain-WAsm fmagic wrappers with the benchmark harness.
  BENCHMARK_GEMM(qu8_gemm_1x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_2x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_3x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_4x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_1x4__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_2x4__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_3x4__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_4x4__wasm_fmagic)
1421 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1422 
1423 
1424 static void qu8_gemm_1x2__scalar_fmagic(benchmark::State& state, const char* net) {
1425   GEMMBenchmark(state,
1426     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1427     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1428     1, 2, 1, 1);
1429 }
qu8_gemm_2x2__scalar_fmagic(benchmark::State & state,const char * net)1430 static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, const char* net) {
1431   GEMMBenchmark(state,
1432     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1433     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1434     2, 2, 1, 1);
1435 }
qu8_gemm_3x2__scalar_fmagic(benchmark::State & state,const char * net)1436 static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, const char* net) {
1437   GEMMBenchmark(state,
1438     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1439     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1440     3, 2, 1, 1);
1441 }
qu8_gemm_4x2__scalar_fmagic(benchmark::State & state,const char * net)1442 static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, const char* net) {
1443   GEMMBenchmark(state,
1444     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1445     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1446     4, 2, 1, 1);
1447 }
qu8_gemm_1x4__scalar_fmagic(benchmark::State & state,const char * net)1448 static void qu8_gemm_1x4__scalar_fmagic(benchmark::State& state, const char* net) {
1449   GEMMBenchmark(state,
1450     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1451     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1452     1, 4, 1, 1);
1453 }
qu8_gemm_2x4__scalar_fmagic(benchmark::State & state,const char * net)1454 static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, const char* net) {
1455   GEMMBenchmark(state,
1456     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1457     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1458     2, 4, 1, 1);
1459 }
qu8_gemm_3x4__scalar_fmagic(benchmark::State & state,const char * net)1460 static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, const char* net) {
1461   GEMMBenchmark(state,
1462     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1463     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1464     3, 4, 1, 1);
1465 }
qu8_gemm_4x4__scalar_fmagic(benchmark::State & state,const char * net)1466 static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, const char* net) {
1467   GEMMBenchmark(state,
1468     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1469     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1470     4, 4, 1, 1);
1471 }
1472 
qu8_gemm_1x2__scalar_imagic(benchmark::State & state,const char * net)1473 static void qu8_gemm_1x2__scalar_imagic(benchmark::State& state, const char* net) {
1474   GEMMBenchmark(state,
1475     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1476     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1477     1, 2, 1, 1);
1478 }
qu8_gemm_2x2__scalar_imagic(benchmark::State & state,const char * net)1479 static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, const char* net) {
1480   GEMMBenchmark(state,
1481     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1482     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1483     2, 2, 1, 1);
1484 }
qu8_gemm_3x2__scalar_imagic(benchmark::State & state,const char * net)1485 static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, const char* net) {
1486   GEMMBenchmark(state,
1487     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1488     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1489     3, 2, 1, 1);
1490 }
qu8_gemm_4x2__scalar_imagic(benchmark::State & state,const char * net)1491 static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, const char* net) {
1492   GEMMBenchmark(state,
1493     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1494     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1495     4, 2, 1, 1);
1496 }
qu8_gemm_1x4__scalar_imagic(benchmark::State & state,const char * net)1497 static void qu8_gemm_1x4__scalar_imagic(benchmark::State& state, const char* net) {
1498   GEMMBenchmark(state,
1499     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1500     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1501     1, 4, 1, 1);
1502 }
qu8_gemm_2x4__scalar_imagic(benchmark::State & state,const char * net)1503 static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, const char* net) {
1504   GEMMBenchmark(state,
1505     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1506     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1507     2, 4, 1, 1);
1508 }
qu8_gemm_3x4__scalar_imagic(benchmark::State & state,const char * net)1509 static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, const char* net) {
1510   GEMMBenchmark(state,
1511     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1512     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1513     3, 4, 1, 1);
1514 }
qu8_gemm_4x4__scalar_imagic(benchmark::State & state,const char * net)1515 static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, const char* net) {
1516   GEMMBenchmark(state,
1517     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1518     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1519     4, 4, 1, 1);
1520 }
1521 
qu8_gemm_1x2__scalar_lrintf(benchmark::State & state,const char * net)1522 static void qu8_gemm_1x2__scalar_lrintf(benchmark::State& state, const char* net) {
1523   GEMMBenchmark(state,
1524     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1525     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1526     1, 2, 1, 1);
1527 }
qu8_gemm_2x2__scalar_lrintf(benchmark::State & state,const char * net)1528 static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, const char* net) {
1529   GEMMBenchmark(state,
1530     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1531     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1532     2, 2, 1, 1);
1533 }
qu8_gemm_3x2__scalar_lrintf(benchmark::State & state,const char * net)1534 static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, const char* net) {
1535   GEMMBenchmark(state,
1536     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1537     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1538     3, 2, 1, 1);
1539 }
qu8_gemm_4x2__scalar_lrintf(benchmark::State & state,const char * net)1540 static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, const char* net) {
1541   GEMMBenchmark(state,
1542     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1543     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1544     4, 2, 1, 1);
1545 }
qu8_gemm_1x4__scalar_lrintf(benchmark::State & state,const char * net)1546 static void qu8_gemm_1x4__scalar_lrintf(benchmark::State& state, const char* net) {
1547   GEMMBenchmark(state,
1548     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1549     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1550     1, 4, 1, 1);
1551 }
qu8_gemm_2x4__scalar_lrintf(benchmark::State & state,const char * net)1552 static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, const char* net) {
1553   GEMMBenchmark(state,
1554     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1555     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1556     2, 4, 1, 1);
1557 }
qu8_gemm_3x4__scalar_lrintf(benchmark::State & state,const char * net)1558 static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, const char* net) {
1559   GEMMBenchmark(state,
1560     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1561     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1562     3, 4, 1, 1);
1563 }
qu8_gemm_4x4__scalar_lrintf(benchmark::State & state,const char * net)1564 static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, const char* net) {
1565   GEMMBenchmark(state,
1566     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1567     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1568     4, 4, 1, 1);
1569 }
1570 
// Register the scalar wrappers (fmagic, imagic, and lrintf requantization
// flavors) with the benchmark harness.
BENCHMARK_GEMM(qu8_gemm_1x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_fmagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_imagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_lrintf)
1597 
1598 
// Comparison baselines: ruy_st / gemmlowp_st (presumably defined earlier in
// this file) are registered only when the corresponding third-party library
// is enabled at build time via BENCHMARK_RUY / BENCHMARK_GEMMLOWP.
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY
#ifdef BENCHMARK_GEMMLOWP
BENCHMARK_GEMM(gemmlowp_st)
#endif  // BENCHMARK_GEMMLOWP

// Emit Google Benchmark's main() unless the build embeds these benchmarks
// into a larger binary that supplies its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1609