xref: /aosp_15_r20/external/XNNPACK/bench/qs8-dwconv.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/dwconv.h"
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/indirection.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
26 
27 
DWConvBenchmark(benchmark::State & state,xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,xnn_init_qs8_conv_minmax_params_fn init_params,uint32_t channel_tile,uint32_t primary_tile,benchmark::utils::IsaCheckFunction isa_check=nullptr)28 static void DWConvBenchmark(benchmark::State& state,
29   xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,
30   xnn_init_qs8_conv_minmax_params_fn init_params,
31   uint32_t channel_tile, uint32_t primary_tile,
32   benchmark::utils::IsaCheckFunction isa_check = nullptr)
33 {
34   if (isa_check && !isa_check(state)) {
35     return;
36   }
37 
38   const size_t input_height = state.range(0);
39   const size_t input_width = state.range(1);
40   const size_t kernel_height = state.range(2);
41   const size_t kernel_width = state.range(3);
42   const size_t padding_height = state.range(4);
43   const size_t padding_width = state.range(5);
44   const size_t subsampling = state.range(6);
45   const size_t dilation = state.range(7);
46   const size_t channels = state.range(8);
47 
48   const size_t kernel_size = kernel_height * kernel_width;
49   if (kernel_size != primary_tile) {
50     state.SkipWithError("kernel size mismatch");
51     return;
52   }
53 
54   std::random_device random_device;
55   auto rng = std::mt19937(random_device());
56   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
57   auto i8rng = std::bind(
58     std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
59 
60   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
61   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
62   const size_t padding_left = padding_width / 2;
63   const size_t padding_top = padding_height / 2;
64   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
65   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
66   const size_t output_size = output_height * output_width;
67   const size_t step_width = dilation == 1 ? subsampling : kernel_width;
68   const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
69 
70   const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
71 
72   std::vector<int8_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(int8_t));
73   std::generate(a.begin(), a.end(), std::ref(i8rng));
74   std::vector<int8_t> k(channels * kernel_height * kernel_width);
75   std::generate(k.begin(), k.end(), std::ref(i8rng));
76   std::vector<int32_t> b(channels);
77   std::generate(b.begin(), b.end(), std::ref(i32rng));
78 
79   std::vector<int8_t> z(channels + XNN_EXTRA_BYTES / sizeof(int8_t));
80 
81   const size_t k_elements = kernel_size * c_stride;
82   const size_t b_elements = c_stride;
83   const size_t w_size = k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t);
84   const size_t i_elements = output_height * step_height;
85   const size_t c_elements = output_size * channels;
86   const size_t num_buffers = 1 +
87     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
88       (c_elements * sizeof(int8_t) + w_size) + sizeof(void*) * i_elements);
89 
90   std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
91   std::fill(w.begin(), w.end(), 0.0f);
92   struct xnn_qs8_packing_params packing_params;
93   packing_params.input_zero_point = 0;
94   xnn_pack_qs8_dwconv_ghw_w(primary_tile, kernel_height, kernel_width, channels, channel_tile,
95       k.data(), b.data(), w.data(), 0 /* extra bytes */, &packing_params);
96   for (size_t n = 1; n < num_buffers; n++) {
97     std::copy(w.cbegin(), w.cbegin() + w_size, w.begin() + n * w_size);
98   }
99 
100   std::vector<const int8_t*> i(i_elements * num_buffers);
101   xnn_operator convolution_op = { };
102   convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
103   convolution_op.input              = a.data();
104   convolution_op.input_pixel_stride = channels;
105   convolution_op.zero_buffer        = z.data();
106   convolution_op.input_height       = input_height;
107   convolution_op.input_width        = input_width;
108   convolution_op.output_height      = output_height;
109   convolution_op.output_width       = output_width;
110   convolution_op.kernel_height      = kernel_height;
111   convolution_op.kernel_width       = kernel_width;
112   convolution_op.stride_height      = subsampling;
113   convolution_op.stride_width       = subsampling;
114   convolution_op.dilation_height    = dilation;
115   convolution_op.dilation_width     = dilation;
116   convolution_op.padding_top        = padding_top;
117   convolution_op.padding_left       = padding_left;
118 
119   xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, primary_tile, 0 /* log2(sizeof(int8_t)) */);
120   for (size_t n = 1; n < num_buffers; n++) {
121     std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
122   }
123 
124   std::vector<int8_t> c(c_elements * num_buffers);
125   std::fill(c.begin(), c.end(), std::nanf(""));
126 
127   xnn_qs8_conv_minmax_params params;
128   init_params(&params,
129     0.5f /* scale */, 0 /* output zero point */, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
130 
131   size_t buffer_index = 0;
132   for (auto _ : state) {
133     state.PauseTiming();
134     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
135     buffer_index = (buffer_index + 1) % num_buffers;
136     state.ResumeTiming();
137 
138     for (size_t y = 0; y < output_height; y++) {
139       dwconv(channels, output_width,
140         i.data() + buffer_index * i_elements + step_height * y,
141         w.data() + buffer_index * w_size,
142         c.data() + buffer_index * c_elements + y * output_width * channels,
143         kernel_height * step_width * sizeof(void*), 0,
144         0, z.data(), &params);
145     }
146   }
147 
148   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
149   if (cpu_frequency != 0) {
150     state.counters["cpufreq"] = cpu_frequency;
151   }
152 
153   state.counters["FLOPS"] = benchmark::Counter(
154     uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
155     benchmark::Counter::kIsRate);
156 
157   state.counters["bytes"] = benchmark::Counter(
158     uint64_t(state.iterations()) * channels * ((output_size + input_height * input_width + kernel_size) * sizeof(int8_t) + sizeof(int32_t)),
159     benchmark::Counter::kIsRate);
160 }
161 
162 
163 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State & state,const char * net)164   static void qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
165     DWConvBenchmark(state,
166       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul8_ld64,
167       xnn_init_qs8_conv_minmax_rndnu_neon_params,
168       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
169   }
qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State & state,const char * net)170   static void qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
171     DWConvBenchmark(state,
172       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld64,
173       xnn_init_qs8_conv_minmax_rndnu_neon_params,
174       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
175   }
qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State & state,const char * net)176   static void qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State& state, const char* net) {
177     DWConvBenchmark(state,
178       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld128,
179       xnn_init_qs8_conv_minmax_rndnu_neon_params,
180       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
181   }
qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State & state,const char * net)182   static void qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
183     DWConvBenchmark(state,
184       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mla8_ld64,
185       xnn_init_qs8_conv_minmax_rndnu_neon_params,
186       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
187   }
qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State & state,const char * net)188   static void qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
189     DWConvBenchmark(state,
190       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64,
191       xnn_init_qs8_conv_minmax_rndnu_neon_params,
192       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
193   }
qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State & state,const char * net)194   static void qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State& state, const char* net) {
195     DWConvBenchmark(state,
196       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld128,
197       xnn_init_qs8_conv_minmax_rndnu_neon_params,
198       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
199   }
qs8_dwconv_up8x9__neon_mul16(benchmark::State & state,const char * net)200   static void qs8_dwconv_up8x9__neon_mul16(benchmark::State& state, const char* net) {
201     DWConvBenchmark(state,
202       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16,
203       xnn_init_qs8_conv_minmax_rndnu_neon_params,
204       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
205   }
qs8_dwconv_up16x9__neon_mul16(benchmark::State & state,const char * net)206   static void qs8_dwconv_up16x9__neon_mul16(benchmark::State& state, const char* net) {
207     DWConvBenchmark(state,
208       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul16,
209       xnn_init_qs8_conv_minmax_rndnu_neon_params,
210       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
211   }
qs8_dwconv_up24x9__neon_mul16(benchmark::State & state,const char * net)212   static void qs8_dwconv_up24x9__neon_mul16(benchmark::State& state, const char* net) {
213     DWConvBenchmark(state,
214       xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x9__neon_mul16,
215       xnn_init_qs8_conv_minmax_rndnu_neon_params,
216       24 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
217   }
qs8_dwconv_up32x9__neon_mul16(benchmark::State & state,const char * net)218   static void qs8_dwconv_up32x9__neon_mul16(benchmark::State& state, const char* net) {
219     DWConvBenchmark(state,
220       xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x9__neon_mul16,
221       xnn_init_qs8_conv_minmax_rndnu_neon_params,
222       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
223   }
qs8_dwconv_up8x25__neon_mul8_ld64(benchmark::State & state,const char * net)224   static void qs8_dwconv_up8x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
225     DWConvBenchmark(state,
226       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8_ld64,
227       xnn_init_qs8_conv_minmax_rndnu_neon_params,
228       8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
229   }
qs8_dwconv_up16x25__neon_mul8_ld64(benchmark::State & state,const char * net)230   static void qs8_dwconv_up16x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
231     DWConvBenchmark(state,
232       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld64,
233       xnn_init_qs8_conv_minmax_rndnu_neon_params,
234       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
235   }
qs8_dwconv_up16x25__neon_mul8_ld128(benchmark::State & state,const char * net)236   static void qs8_dwconv_up16x25__neon_mul8_ld128(benchmark::State& state, const char* net) {
237     DWConvBenchmark(state,
238       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld128,
239       xnn_init_qs8_conv_minmax_rndnu_neon_params,
240       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
241   }
qs8_dwconv_up8x25__neon_mla8_ld64(benchmark::State & state,const char * net)242   static void qs8_dwconv_up8x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
243     DWConvBenchmark(state,
244       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64,
245       xnn_init_qs8_conv_minmax_rndnu_neon_params,
246       8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
247   }
qs8_dwconv_up16x25__neon_mla8_ld64(benchmark::State & state,const char * net)248   static void qs8_dwconv_up16x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
249     DWConvBenchmark(state,
250       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64,
251       xnn_init_qs8_conv_minmax_rndnu_neon_params,
252       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
253   }
qs8_dwconv_up16x25__neon_mla8_ld128(benchmark::State & state,const char * net)254   static void qs8_dwconv_up16x25__neon_mla8_ld128(benchmark::State& state, const char* net) {
255     DWConvBenchmark(state,
256       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld128,
257       xnn_init_qs8_conv_minmax_rndnu_neon_params,
258       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
259   }
qs8_dwconv_up8x25__neon_mul16(benchmark::State & state,const char * net)260   static void qs8_dwconv_up8x25__neon_mul16(benchmark::State& state, const char* net) {
261     DWConvBenchmark(state,
262       xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16,
263       xnn_init_qs8_conv_minmax_rndnu_neon_params,
264       8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
265   }
qs8_dwconv_up16x25__neon_mul16(benchmark::State & state,const char * net)266   static void qs8_dwconv_up16x25__neon_mul16(benchmark::State& state, const char* net) {
267     DWConvBenchmark(state,
268       xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul16,
269       xnn_init_qs8_conv_minmax_rndnu_neon_params,
270       16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
271   }
qs8_dwconv_up24x25__neon_mul16(benchmark::State & state,const char * net)272   static void qs8_dwconv_up24x25__neon_mul16(benchmark::State& state, const char* net) {
273     DWConvBenchmark(state,
274       xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x25__neon_mul16,
275       xnn_init_qs8_conv_minmax_rndnu_neon_params,
276       24 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
277   }
qs8_dwconv_up32x25__neon_mul16(benchmark::State & state,const char * net)278   static void qs8_dwconv_up32x25__neon_mul16(benchmark::State& state, const char* net) {
279     DWConvBenchmark(state,
280       xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x25__neon_mul16,
281       xnn_init_qs8_conv_minmax_rndnu_neon_params,
282       32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
283   }
284 
285   BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul8_ld64);
286   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld64);
287   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld128);
288   BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mla8_ld64);
289   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld64);
290   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld128);
291   BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul16);
292   BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul16);
293   BENCHMARK_DWCONV(qs8_dwconv_up24x9__neon_mul16);
294   BENCHMARK_DWCONV(qs8_dwconv_up32x9__neon_mul16);
295   BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul8_ld64);
296   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld64);
297   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld128);
298   BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mla8_ld64);
299   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld64);
300   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld128);
301   BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul16);
302   BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul16);
303   BENCHMARK_DWCONV(qs8_dwconv_up24x25__neon_mul16);
304   BENCHMARK_DWCONV(qs8_dwconv_up32x25__neon_mul16);
305 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
306 
307 
308 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State & state,const char * net)309   static void qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State& state, const char* net) {
310     DWConvBenchmark(state,
311       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx512skx_mul32,
312       xnn_init_qs8_conv_minmax_fp32_avx512_params,
313       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
314   }
qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State & state,const char * net)315   static void qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State& state, const char* net) {
316     DWConvBenchmark(state,
317       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32,
318       xnn_init_qs8_conv_minmax_fp32_avx512_params,
319       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
320   }
qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State & state,const char * net)321   static void qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
322     DWConvBenchmark(state,
323       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpmovsx,
324       xnn_init_qs8_conv_minmax_fp32_avx2_params,
325       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
326   }
qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State & state,const char * net)327   static void qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
328     DWConvBenchmark(state,
329       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpmovsx,
330       xnn_init_qs8_conv_minmax_fp32_avx2_params,
331       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
332   }
qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State & state,const char * net)333   static void qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
334     DWConvBenchmark(state,
335       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpunpck,
336       xnn_init_qs8_conv_minmax_fp32_avx2_params,
337       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
338   }
qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State & state,const char * net)339   static void qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
340     DWConvBenchmark(state,
341       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpunpck,
342       xnn_init_qs8_conv_minmax_fp32_avx2_params,
343       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
344   }
qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State & state,const char * net)345   static void qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
346     DWConvBenchmark(state,
347       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_add16_vpunpck,
348       xnn_init_qs8_conv_minmax_fp32_avx2_params,
349       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
350   }
qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State & state,const char * net)351   static void qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
352     DWConvBenchmark(state,
353       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck,
354       xnn_init_qs8_conv_minmax_fp32_avx2_params,
355       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
356   }
qs8_dwconv_up8x9__avx2_mul32(benchmark::State & state,const char * net)357   static void qs8_dwconv_up8x9__avx2_mul32(benchmark::State& state, const char* net) {
358     DWConvBenchmark(state,
359       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx2_mul32,
360       xnn_init_qs8_conv_minmax_fp32_avx2_params,
361       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
362   }
qs8_dwconv_up16x9__avx2_mul32(benchmark::State & state,const char * net)363   static void qs8_dwconv_up16x9__avx2_mul32(benchmark::State& state, const char* net) {
364     DWConvBenchmark(state,
365       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32,
366       xnn_init_qs8_conv_minmax_fp32_avx2_params,
367       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
368   }
qs8_dwconv_up32x9__avx2_mul32(benchmark::State & state,const char * net)369   static void qs8_dwconv_up32x9__avx2_mul32(benchmark::State& state, const char* net) {
370     DWConvBenchmark(state,
371       xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul32,
372       xnn_init_qs8_conv_minmax_fp32_avx2_params,
373       32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
374   }
qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State & state,const char * net)375   static void qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State& state, const char* net) {
376     DWConvBenchmark(state,
377       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16,
378       xnn_init_qs8_conv_minmax_fp32_sse4_params,
379       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
380   }
qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State & state,const char * net)381   static void qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State& state, const char* net) {
382     DWConvBenchmark(state,
383       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16,
384       xnn_init_qs8_conv_minmax_fp32_sse4_params,
385       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
386   }
qs8_dwconv_up8x9__avx_mul16(benchmark::State & state,const char * net)387   static void qs8_dwconv_up8x9__avx_mul16(benchmark::State& state, const char* net) {
388     DWConvBenchmark(state,
389       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16,
390       xnn_init_qs8_conv_minmax_fp32_sse4_params,
391       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
392   }
qs8_dwconv_up16x9__avx_mul16(benchmark::State & state,const char * net)393   static void qs8_dwconv_up16x9__avx_mul16(benchmark::State& state, const char* net) {
394     DWConvBenchmark(state,
395       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16,
396       xnn_init_qs8_conv_minmax_fp32_sse4_params,
397       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
398   }
qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State & state,const char * net)399   static void qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State& state, const char* net) {
400     DWConvBenchmark(state,
401       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16,
402       xnn_init_qs8_conv_minmax_fp32_sse4_params,
403       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
404   }
qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State & state,const char * net)405   static void qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State& state, const char* net) {
406     DWConvBenchmark(state,
407       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16,
408       xnn_init_qs8_conv_minmax_fp32_sse4_params,
409       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
410   }
qs8_dwconv_up8x9__avx_mul32(benchmark::State & state,const char * net)411   static void qs8_dwconv_up8x9__avx_mul32(benchmark::State& state, const char* net) {
412     DWConvBenchmark(state,
413       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
414       xnn_init_qs8_conv_minmax_fp32_sse4_params,
415       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
416   }
qs8_dwconv_up16x9__avx_mul32(benchmark::State & state,const char * net)417   static void qs8_dwconv_up16x9__avx_mul32(benchmark::State& state, const char* net) {
418     DWConvBenchmark(state,
419       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
420       xnn_init_qs8_conv_minmax_fp32_sse4_params,
421       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
422   }
qs8_dwconv_up8x9__sse41_mul16(benchmark::State & state,const char * net)423   static void qs8_dwconv_up8x9__sse41_mul16(benchmark::State& state, const char* net) {
424     DWConvBenchmark(state,
425       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16,
426       xnn_init_qs8_conv_minmax_fp32_sse4_params,
427       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
428   }
qs8_dwconv_up16x9__sse41_mul16(benchmark::State & state,const char * net)429   static void qs8_dwconv_up16x9__sse41_mul16(benchmark::State& state, const char* net) {
430     DWConvBenchmark(state,
431       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16,
432       xnn_init_qs8_conv_minmax_fp32_sse4_params,
433       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
434   }
qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State & state,const char * net)435   static void qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
436     DWConvBenchmark(state,
437       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16,
438       xnn_init_qs8_conv_minmax_fp32_sse4_params,
439       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
440   }
qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State & state,const char * net)441   static void qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
442     DWConvBenchmark(state,
443       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16,
444       xnn_init_qs8_conv_minmax_fp32_sse4_params,
445       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
446   }
qs8_dwconv_up8x9__sse41_mul32(benchmark::State & state,const char * net)447   static void qs8_dwconv_up8x9__sse41_mul32(benchmark::State& state, const char* net) {
448     DWConvBenchmark(state,
449       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32,
450       xnn_init_qs8_conv_minmax_fp32_sse4_params,
451       8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
452   }
qs8_dwconv_up16x9__sse41_mul32(benchmark::State & state,const char * net)453   static void qs8_dwconv_up16x9__sse41_mul32(benchmark::State& state, const char* net) {
454     DWConvBenchmark(state,
455       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
456       xnn_init_qs8_conv_minmax_fp32_sse4_params,
457       16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
458   }
qs8_dwconv_up8x9__sse2_mul16(benchmark::State & state,const char * net)459   static void qs8_dwconv_up8x9__sse2_mul16(benchmark::State& state, const char* net) {
460     DWConvBenchmark(state,
461       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16,
462       xnn_init_qs8_conv_minmax_fp32_sse2_params,
463       8 /* channel tile */, 9 /* primary tile */);
464   }
qs8_dwconv_up16x9__sse2_mul16(benchmark::State & state,const char * net)465   static void qs8_dwconv_up16x9__sse2_mul16(benchmark::State& state, const char* net) {
466     DWConvBenchmark(state,
467       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16,
468       xnn_init_qs8_conv_minmax_fp32_sse2_params,
469       16 /* channel tile */, 9 /* primary tile */);
470   }
qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State & state,const char * net)471   static void qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
472     DWConvBenchmark(state,
473       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16,
474       xnn_init_qs8_conv_minmax_fp32_sse2_params,
475       8 /* channel tile */, 9 /* primary tile */);
476   }
qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State & state,const char * net)477   static void qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
478     DWConvBenchmark(state,
479       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16,
480       xnn_init_qs8_conv_minmax_fp32_sse2_params,
481       16 /* channel tile */, 9 /* primary tile */);
482   }
483 
484   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx512skx_mul32);
485   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx512skx_mul32);
486 
487   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpmovsx);
488   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpmovsx);
489   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpunpck);
490   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpunpck);
491   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck);
492   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck);
493   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx2_mul32);
494   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul32);
495   BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul32);
496 
497   BENCHMARK_DWCONV(qs8_dwconv_up8x9__xop_mul16_add16);
498   BENCHMARK_DWCONV(qs8_dwconv_up16x9__xop_mul16_add16);
499 
500   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16);
501   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16);
502   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16_add16);
503   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16_add16);
504   BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul32);
505   BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul32);
506 
507   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16);
508   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16);
509   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16_add16);
510   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16_add16);
511   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul32);
512   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul32);
513 
514   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16);
515   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16);
516   BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16_add16);
517   BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16_add16);
518 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
519 
520 
521 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State & state,const char * net)522   static void qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
523     DWConvBenchmark(state,
524       xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16,
525       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
526       8 /* channel tile */, 9 /* primary tile */);
527   }
qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State & state,const char * net)528   static void qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
529     DWConvBenchmark(state,
530       xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16,
531       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
532       16 /* channel tile */, 9 /* primary tile */);
533   }
534 
535   BENCHMARK_DWCONV(qs8_dwconv_up8x9__wasmsimd_mul16);
536   BENCHMARK_DWCONV(qs8_dwconv_up16x9__wasmsimd_mul16);
537 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
538 
539 
540 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up1x9__wasm_fmagic(benchmark::State & state,const char * net)541   static void qs8_dwconv_up1x9__wasm_fmagic(benchmark::State& state, const char* net) {
542     DWConvBenchmark(state,
543       xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__wasm_fmagic,
544       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
545       1 /* channel tile */, 9 /* primary tile */);
546   }
qs8_dwconv_up2x9__wasm_fmagic(benchmark::State & state,const char * net)547   static void qs8_dwconv_up2x9__wasm_fmagic(benchmark::State& state, const char* net) {
548     DWConvBenchmark(state,
549       xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic,
550       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
551       2 /* channel tile */, 9 /* primary tile */);
552   }
qs8_dwconv_up4x9__wasm_fmagic(benchmark::State & state,const char * net)553   static void qs8_dwconv_up4x9__wasm_fmagic(benchmark::State& state, const char* net) {
554     DWConvBenchmark(state,
555       xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__wasm_fmagic,
556       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
557       4 /* channel tile */, 9 /* primary tile */);
558   }
559 
// Hand the three wasm_fmagic wrappers defined in this #if region to
// BENCHMARK_DWCONV (macro defined outside this file; presumably
// bench/dwconv.h -- confirm there).
560   BENCHMARK_DWCONV(qs8_dwconv_up1x9__wasm_fmagic);
561   BENCHMARK_DWCONV(qs8_dwconv_up2x9__wasm_fmagic);
562   BENCHMARK_DWCONV(qs8_dwconv_up4x9__wasm_fmagic);
563 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
564 
565 
qs8_dwconv_up1x9__scalar_fmagic(benchmark::State & state,const char * net)566 static void qs8_dwconv_up1x9__scalar_fmagic(benchmark::State& state, const char* net) {
567   DWConvBenchmark(state,
568     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic,
569     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
570     1 /* channel tile */, 9 /* primary tile */);
571 }
qs8_dwconv_up2x9__scalar_fmagic(benchmark::State & state,const char * net)572 static void qs8_dwconv_up2x9__scalar_fmagic(benchmark::State& state, const char* net) {
573   DWConvBenchmark(state,
574     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_fmagic,
575     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
576     2 /* channel tile */, 9 /* primary tile */);
577 }
qs8_dwconv_up4x9__scalar_fmagic(benchmark::State & state,const char * net)578 static void qs8_dwconv_up4x9__scalar_fmagic(benchmark::State& state, const char* net) {
579   DWConvBenchmark(state,
580     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_fmagic,
581     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
582     4 /* channel tile */, 9 /* primary tile */);
583 }
584 
qs8_dwconv_up1x9__scalar_imagic(benchmark::State & state,const char * net)585 static void qs8_dwconv_up1x9__scalar_imagic(benchmark::State& state, const char* net) {
586   DWConvBenchmark(state,
587     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_imagic,
588     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
589     1 /* channel tile */, 9 /* primary tile */);
590 }
qs8_dwconv_up2x9__scalar_imagic(benchmark::State & state,const char * net)591 static void qs8_dwconv_up2x9__scalar_imagic(benchmark::State& state, const char* net) {
592   DWConvBenchmark(state,
593     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic,
594     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
595     2 /* channel tile */, 9 /* primary tile */);
596 }
qs8_dwconv_up4x9__scalar_imagic(benchmark::State & state,const char * net)597 static void qs8_dwconv_up4x9__scalar_imagic(benchmark::State& state, const char* net) {
598   DWConvBenchmark(state,
599     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_imagic,
600     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
601     4 /* channel tile */, 9 /* primary tile */);
602 }
603 
qs8_dwconv_up1x9__scalar_lrintf(benchmark::State & state,const char * net)604 static void qs8_dwconv_up1x9__scalar_lrintf(benchmark::State& state, const char* net) {
605   DWConvBenchmark(state,
606     xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrintf,
607     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
608     1 /* channel tile */, 9 /* primary tile */);
609 }
qs8_dwconv_up2x9__scalar_lrintf(benchmark::State & state,const char * net)610 static void qs8_dwconv_up2x9__scalar_lrintf(benchmark::State& state, const char* net) {
611   DWConvBenchmark(state,
612     xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf,
613     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
614     2 /* channel tile */, 9 /* primary tile */);
615 }
qs8_dwconv_up4x9__scalar_lrintf(benchmark::State & state,const char * net)616 static void qs8_dwconv_up4x9__scalar_lrintf(benchmark::State& state, const char* net) {
617   DWConvBenchmark(state,
618     xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrintf,
619     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
620     4 /* channel tile */, 9 /* primary tile */);
621 }
622 
// Scalar fmagic wrappers (channel tiles 1, 2 and 4). These sit outside any
// architecture #if. BENCHMARK_DWCONV is defined outside this file
// (presumably bench/dwconv.h -- confirm there).
623 BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_fmagic);
624 BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_fmagic);
625 BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_fmagic);
626 
// Scalar imagic wrappers (channel tiles 1, 2 and 4).
627 BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_imagic);
628 BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_imagic);
629 BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_imagic);
630 
// Scalar lrintf wrappers (channel tiles 1, 2 and 4).
631 BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_lrintf);
632 BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_lrintf);
633 BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_lrintf);
634 
635 
// Emit Google Benchmark's main() for this translation unit unless the build
// defines XNNPACK_BENCHMARK_NO_MAIN (presumably set when main() is supplied
// elsewhere -- confirm against the build files).
636 #ifndef XNNPACK_BENCHMARK_NO_MAIN
637 BENCHMARK_MAIN();
638 #endif
639