1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <algorithm>
7 #include <cfloat>
8 #include <cmath>
9 #include <functional>
10 #include <random>
11 #include <vector>
12
13 #include <benchmark/benchmark.h>
14 #include "bench/dwconv.h"
15 #include "bench/utils.h"
16
17 #include <xnnpack.h>
18 #include <xnnpack/aligned-allocator.h>
19 #include <xnnpack/common.h>
20 #include <xnnpack/dwconv.h>
21 #include <xnnpack/indirection.h>
22 #include <xnnpack/microfnptr.h>
23 #include <xnnpack/microparams-init.h>
24 #include <xnnpack/operator.h>
25 #include <xnnpack/pack.h>
26
27
DWConvBenchmark(benchmark::State & state,xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,xnn_init_qs8_conv_minmax_params_fn init_params,uint32_t channel_tile,uint32_t primary_tile,benchmark::utils::IsaCheckFunction isa_check=nullptr)28 static void DWConvBenchmark(benchmark::State& state,
29 xnn_qs8_dwconv_minmax_unipass_ukernel_function dwconv,
30 xnn_init_qs8_conv_minmax_params_fn init_params,
31 uint32_t channel_tile, uint32_t primary_tile,
32 benchmark::utils::IsaCheckFunction isa_check = nullptr)
33 {
34 if (isa_check && !isa_check(state)) {
35 return;
36 }
37
38 const size_t input_height = state.range(0);
39 const size_t input_width = state.range(1);
40 const size_t kernel_height = state.range(2);
41 const size_t kernel_width = state.range(3);
42 const size_t padding_height = state.range(4);
43 const size_t padding_width = state.range(5);
44 const size_t subsampling = state.range(6);
45 const size_t dilation = state.range(7);
46 const size_t channels = state.range(8);
47
48 const size_t kernel_size = kernel_height * kernel_width;
49 if (kernel_size != primary_tile) {
50 state.SkipWithError("kernel size mismatch");
51 return;
52 }
53
54 std::random_device random_device;
55 auto rng = std::mt19937(random_device());
56 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
57 auto i8rng = std::bind(
58 std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
59
60 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
61 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
62 const size_t padding_left = padding_width / 2;
63 const size_t padding_top = padding_height / 2;
64 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
65 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
66 const size_t output_size = output_height * output_width;
67 const size_t step_width = dilation == 1 ? subsampling : kernel_width;
68 const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
69
70 const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, channel_tile);
71
72 std::vector<int8_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(int8_t));
73 std::generate(a.begin(), a.end(), std::ref(i8rng));
74 std::vector<int8_t> k(channels * kernel_height * kernel_width);
75 std::generate(k.begin(), k.end(), std::ref(i8rng));
76 std::vector<int32_t> b(channels);
77 std::generate(b.begin(), b.end(), std::ref(i32rng));
78
79 std::vector<int8_t> z(channels + XNN_EXTRA_BYTES / sizeof(int8_t));
80
81 const size_t k_elements = kernel_size * c_stride;
82 const size_t b_elements = c_stride;
83 const size_t w_size = k_elements * sizeof(int8_t) + b_elements * sizeof(int32_t);
84 const size_t i_elements = output_height * step_height;
85 const size_t c_elements = output_size * channels;
86 const size_t num_buffers = 1 +
87 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
88 (c_elements * sizeof(int8_t) + w_size) + sizeof(void*) * i_elements);
89
90 std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
91 std::fill(w.begin(), w.end(), 0.0f);
92 struct xnn_qs8_packing_params packing_params;
93 packing_params.input_zero_point = 0;
94 xnn_pack_qs8_dwconv_ghw_w(primary_tile, kernel_height, kernel_width, channels, channel_tile,
95 k.data(), b.data(), w.data(), 0 /* extra bytes */, &packing_params);
96 for (size_t n = 1; n < num_buffers; n++) {
97 std::copy(w.cbegin(), w.cbegin() + w_size, w.begin() + n * w_size);
98 }
99
100 std::vector<const int8_t*> i(i_elements * num_buffers);
101 xnn_operator convolution_op = { };
102 convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
103 convolution_op.input = a.data();
104 convolution_op.input_pixel_stride = channels;
105 convolution_op.zero_buffer = z.data();
106 convolution_op.input_height = input_height;
107 convolution_op.input_width = input_width;
108 convolution_op.output_height = output_height;
109 convolution_op.output_width = output_width;
110 convolution_op.kernel_height = kernel_height;
111 convolution_op.kernel_width = kernel_width;
112 convolution_op.stride_height = subsampling;
113 convolution_op.stride_width = subsampling;
114 convolution_op.dilation_height = dilation;
115 convolution_op.dilation_width = dilation;
116 convolution_op.padding_top = padding_top;
117 convolution_op.padding_left = padding_left;
118
119 xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, primary_tile, 0 /* log2(sizeof(int8_t)) */);
120 for (size_t n = 1; n < num_buffers; n++) {
121 std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
122 }
123
124 std::vector<int8_t> c(c_elements * num_buffers);
125 std::fill(c.begin(), c.end(), std::nanf(""));
126
127 xnn_qs8_conv_minmax_params params;
128 init_params(¶ms,
129 0.5f /* scale */, 0 /* output zero point */, std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
130
131 size_t buffer_index = 0;
132 for (auto _ : state) {
133 state.PauseTiming();
134 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
135 buffer_index = (buffer_index + 1) % num_buffers;
136 state.ResumeTiming();
137
138 for (size_t y = 0; y < output_height; y++) {
139 dwconv(channels, output_width,
140 i.data() + buffer_index * i_elements + step_height * y,
141 w.data() + buffer_index * w_size,
142 c.data() + buffer_index * c_elements + y * output_width * channels,
143 kernel_height * step_width * sizeof(void*), 0,
144 0, z.data(), ¶ms);
145 }
146 }
147
148 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
149 if (cpu_frequency != 0) {
150 state.counters["cpufreq"] = cpu_frequency;
151 }
152
153 state.counters["FLOPS"] = benchmark::Counter(
154 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
155 benchmark::Counter::kIsRate);
156
157 state.counters["bytes"] = benchmark::Counter(
158 uint64_t(state.iterations()) * channels * ((output_size + input_height * input_width + kernel_size) * sizeof(int8_t) + sizeof(int32_t)),
159 benchmark::Counter::kIsRate);
160 }
161
162
163 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State & state,const char * net)164 static void qs8_dwconv_up8x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
165 DWConvBenchmark(state,
166 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul8_ld64,
167 xnn_init_qs8_conv_minmax_rndnu_neon_params,
168 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
169 }
qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State & state,const char * net)170 static void qs8_dwconv_up16x9__neon_mul8_ld64(benchmark::State& state, const char* net) {
171 DWConvBenchmark(state,
172 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld64,
173 xnn_init_qs8_conv_minmax_rndnu_neon_params,
174 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
175 }
qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State & state,const char * net)176 static void qs8_dwconv_up16x9__neon_mul8_ld128(benchmark::State& state, const char* net) {
177 DWConvBenchmark(state,
178 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8_ld128,
179 xnn_init_qs8_conv_minmax_rndnu_neon_params,
180 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
181 }
qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State & state,const char * net)182 static void qs8_dwconv_up8x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
183 DWConvBenchmark(state,
184 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mla8_ld64,
185 xnn_init_qs8_conv_minmax_rndnu_neon_params,
186 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
187 }
qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State & state,const char * net)188 static void qs8_dwconv_up16x9__neon_mla8_ld64(benchmark::State& state, const char* net) {
189 DWConvBenchmark(state,
190 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64,
191 xnn_init_qs8_conv_minmax_rndnu_neon_params,
192 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
193 }
qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State & state,const char * net)194 static void qs8_dwconv_up16x9__neon_mla8_ld128(benchmark::State& state, const char* net) {
195 DWConvBenchmark(state,
196 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld128,
197 xnn_init_qs8_conv_minmax_rndnu_neon_params,
198 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
199 }
qs8_dwconv_up8x9__neon_mul16(benchmark::State & state,const char * net)200 static void qs8_dwconv_up8x9__neon_mul16(benchmark::State& state, const char* net) {
201 DWConvBenchmark(state,
202 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x9__neon_mul16,
203 xnn_init_qs8_conv_minmax_rndnu_neon_params,
204 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
205 }
qs8_dwconv_up16x9__neon_mul16(benchmark::State & state,const char * net)206 static void qs8_dwconv_up16x9__neon_mul16(benchmark::State& state, const char* net) {
207 DWConvBenchmark(state,
208 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul16,
209 xnn_init_qs8_conv_minmax_rndnu_neon_params,
210 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
211 }
qs8_dwconv_up24x9__neon_mul16(benchmark::State & state,const char * net)212 static void qs8_dwconv_up24x9__neon_mul16(benchmark::State& state, const char* net) {
213 DWConvBenchmark(state,
214 xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x9__neon_mul16,
215 xnn_init_qs8_conv_minmax_rndnu_neon_params,
216 24 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
217 }
qs8_dwconv_up32x9__neon_mul16(benchmark::State & state,const char * net)218 static void qs8_dwconv_up32x9__neon_mul16(benchmark::State& state, const char* net) {
219 DWConvBenchmark(state,
220 xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x9__neon_mul16,
221 xnn_init_qs8_conv_minmax_rndnu_neon_params,
222 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckNEON);
223 }
qs8_dwconv_up8x25__neon_mul8_ld64(benchmark::State & state,const char * net)224 static void qs8_dwconv_up8x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
225 DWConvBenchmark(state,
226 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8_ld64,
227 xnn_init_qs8_conv_minmax_rndnu_neon_params,
228 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
229 }
qs8_dwconv_up16x25__neon_mul8_ld64(benchmark::State & state,const char * net)230 static void qs8_dwconv_up16x25__neon_mul8_ld64(benchmark::State& state, const char* net) {
231 DWConvBenchmark(state,
232 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld64,
233 xnn_init_qs8_conv_minmax_rndnu_neon_params,
234 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
235 }
qs8_dwconv_up16x25__neon_mul8_ld128(benchmark::State & state,const char * net)236 static void qs8_dwconv_up16x25__neon_mul8_ld128(benchmark::State& state, const char* net) {
237 DWConvBenchmark(state,
238 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul8_ld128,
239 xnn_init_qs8_conv_minmax_rndnu_neon_params,
240 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
241 }
qs8_dwconv_up8x25__neon_mla8_ld64(benchmark::State & state,const char * net)242 static void qs8_dwconv_up8x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
243 DWConvBenchmark(state,
244 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64,
245 xnn_init_qs8_conv_minmax_rndnu_neon_params,
246 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
247 }
qs8_dwconv_up16x25__neon_mla8_ld64(benchmark::State & state,const char * net)248 static void qs8_dwconv_up16x25__neon_mla8_ld64(benchmark::State& state, const char* net) {
249 DWConvBenchmark(state,
250 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64,
251 xnn_init_qs8_conv_minmax_rndnu_neon_params,
252 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
253 }
qs8_dwconv_up16x25__neon_mla8_ld128(benchmark::State & state,const char * net)254 static void qs8_dwconv_up16x25__neon_mla8_ld128(benchmark::State& state, const char* net) {
255 DWConvBenchmark(state,
256 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld128,
257 xnn_init_qs8_conv_minmax_rndnu_neon_params,
258 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
259 }
qs8_dwconv_up8x25__neon_mul16(benchmark::State & state,const char * net)260 static void qs8_dwconv_up8x25__neon_mul16(benchmark::State& state, const char* net) {
261 DWConvBenchmark(state,
262 xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul16,
263 xnn_init_qs8_conv_minmax_rndnu_neon_params,
264 8 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
265 }
qs8_dwconv_up16x25__neon_mul16(benchmark::State & state,const char * net)266 static void qs8_dwconv_up16x25__neon_mul16(benchmark::State& state, const char* net) {
267 DWConvBenchmark(state,
268 xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mul16,
269 xnn_init_qs8_conv_minmax_rndnu_neon_params,
270 16 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
271 }
qs8_dwconv_up24x25__neon_mul16(benchmark::State & state,const char * net)272 static void qs8_dwconv_up24x25__neon_mul16(benchmark::State& state, const char* net) {
273 DWConvBenchmark(state,
274 xnn_qs8_dwconv_minmax_rndnu_ukernel_up24x25__neon_mul16,
275 xnn_init_qs8_conv_minmax_rndnu_neon_params,
276 24 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
277 }
qs8_dwconv_up32x25__neon_mul16(benchmark::State & state,const char * net)278 static void qs8_dwconv_up32x25__neon_mul16(benchmark::State& state, const char* net) {
279 DWConvBenchmark(state,
280 xnn_qs8_dwconv_minmax_rndnu_ukernel_up32x25__neon_mul16,
281 xnn_init_qs8_conv_minmax_rndnu_neon_params,
282 32 /* channel tile */, 25 /* primary tile */, benchmark::utils::CheckNEON);
283 }
284
// Hand every NEON wrapper defined in this ARM/ARM64 section to BENCHMARK_DWCONV.
// NOTE(review): the macro is not defined in this file; presumably it comes from
// bench/dwconv.h and supplies the benchmark arguments and the `net` string.
// 3x3 (primary tile 9) kernels:
BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul8_ld128);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mla8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mla8_ld128);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__neon_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__neon_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up24x9__neon_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up32x9__neon_mul16);
// Primary tile 25 kernels:
BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul8_ld128);
BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mla8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld64);
BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mla8_ld128);
BENCHMARK_DWCONV(qs8_dwconv_up8x25__neon_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up16x25__neon_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up24x25__neon_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up32x25__neon_mul16);
305 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
306
307
308 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State & state,const char * net)309 static void qs8_dwconv_up16x9__avx512skx_mul32(benchmark::State& state, const char* net) {
310 DWConvBenchmark(state,
311 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx512skx_mul32,
312 xnn_init_qs8_conv_minmax_fp32_avx512_params,
313 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
314 }
qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State & state,const char * net)315 static void qs8_dwconv_up32x9__avx512skx_mul32(benchmark::State& state, const char* net) {
316 DWConvBenchmark(state,
317 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32,
318 xnn_init_qs8_conv_minmax_fp32_avx512_params,
319 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX512SKX);
320 }
qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State & state,const char * net)321 static void qs8_dwconv_up16x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
322 DWConvBenchmark(state,
323 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpmovsx,
324 xnn_init_qs8_conv_minmax_fp32_avx2_params,
325 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
326 }
qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State & state,const char * net)327 static void qs8_dwconv_up32x9__avx2_mul16_vpmovsx(benchmark::State& state, const char* net) {
328 DWConvBenchmark(state,
329 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpmovsx,
330 xnn_init_qs8_conv_minmax_fp32_avx2_params,
331 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
332 }
qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State & state,const char * net)333 static void qs8_dwconv_up16x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
334 DWConvBenchmark(state,
335 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_vpunpck,
336 xnn_init_qs8_conv_minmax_fp32_avx2_params,
337 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
338 }
qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State & state,const char * net)339 static void qs8_dwconv_up32x9__avx2_mul16_vpunpck(benchmark::State& state, const char* net) {
340 DWConvBenchmark(state,
341 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_vpunpck,
342 xnn_init_qs8_conv_minmax_fp32_avx2_params,
343 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
344 }
qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State & state,const char * net)345 static void qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
346 DWConvBenchmark(state,
347 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul16_add16_vpunpck,
348 xnn_init_qs8_conv_minmax_fp32_avx2_params,
349 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
350 }
qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State & state,const char * net)351 static void qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck(benchmark::State& state, const char* net) {
352 DWConvBenchmark(state,
353 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul16_add16_vpunpck,
354 xnn_init_qs8_conv_minmax_fp32_avx2_params,
355 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
356 }
qs8_dwconv_up8x9__avx2_mul32(benchmark::State & state,const char * net)357 static void qs8_dwconv_up8x9__avx2_mul32(benchmark::State& state, const char* net) {
358 DWConvBenchmark(state,
359 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx2_mul32,
360 xnn_init_qs8_conv_minmax_fp32_avx2_params,
361 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
362 }
qs8_dwconv_up16x9__avx2_mul32(benchmark::State & state,const char * net)363 static void qs8_dwconv_up16x9__avx2_mul32(benchmark::State& state, const char* net) {
364 DWConvBenchmark(state,
365 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32,
366 xnn_init_qs8_conv_minmax_fp32_avx2_params,
367 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
368 }
qs8_dwconv_up32x9__avx2_mul32(benchmark::State & state,const char * net)369 static void qs8_dwconv_up32x9__avx2_mul32(benchmark::State& state, const char* net) {
370 DWConvBenchmark(state,
371 xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx2_mul32,
372 xnn_init_qs8_conv_minmax_fp32_avx2_params,
373 32 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX2);
374 }
qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State & state,const char * net)375 static void qs8_dwconv_up8x9__xop_mul16_add16(benchmark::State& state, const char* net) {
376 DWConvBenchmark(state,
377 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__xop_mul16_add16,
378 xnn_init_qs8_conv_minmax_fp32_sse4_params,
379 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
380 }
qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State & state,const char * net)381 static void qs8_dwconv_up16x9__xop_mul16_add16(benchmark::State& state, const char* net) {
382 DWConvBenchmark(state,
383 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16,
384 xnn_init_qs8_conv_minmax_fp32_sse4_params,
385 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckXOP);
386 }
qs8_dwconv_up8x9__avx_mul16(benchmark::State & state,const char * net)387 static void qs8_dwconv_up8x9__avx_mul16(benchmark::State& state, const char* net) {
388 DWConvBenchmark(state,
389 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16,
390 xnn_init_qs8_conv_minmax_fp32_sse4_params,
391 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
392 }
qs8_dwconv_up16x9__avx_mul16(benchmark::State & state,const char * net)393 static void qs8_dwconv_up16x9__avx_mul16(benchmark::State& state, const char* net) {
394 DWConvBenchmark(state,
395 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16,
396 xnn_init_qs8_conv_minmax_fp32_sse4_params,
397 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
398 }
qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State & state,const char * net)399 static void qs8_dwconv_up8x9__avx_mul16_add16(benchmark::State& state, const char* net) {
400 DWConvBenchmark(state,
401 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul16_add16,
402 xnn_init_qs8_conv_minmax_fp32_sse4_params,
403 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
404 }
qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State & state,const char * net)405 static void qs8_dwconv_up16x9__avx_mul16_add16(benchmark::State& state, const char* net) {
406 DWConvBenchmark(state,
407 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16,
408 xnn_init_qs8_conv_minmax_fp32_sse4_params,
409 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
410 }
qs8_dwconv_up8x9__avx_mul32(benchmark::State & state,const char * net)411 static void qs8_dwconv_up8x9__avx_mul32(benchmark::State& state, const char* net) {
412 DWConvBenchmark(state,
413 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__avx_mul32,
414 xnn_init_qs8_conv_minmax_fp32_sse4_params,
415 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
416 }
qs8_dwconv_up16x9__avx_mul32(benchmark::State & state,const char * net)417 static void qs8_dwconv_up16x9__avx_mul32(benchmark::State& state, const char* net) {
418 DWConvBenchmark(state,
419 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul32,
420 xnn_init_qs8_conv_minmax_fp32_sse4_params,
421 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckAVX);
422 }
qs8_dwconv_up8x9__sse41_mul16(benchmark::State & state,const char * net)423 static void qs8_dwconv_up8x9__sse41_mul16(benchmark::State& state, const char* net) {
424 DWConvBenchmark(state,
425 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16,
426 xnn_init_qs8_conv_minmax_fp32_sse4_params,
427 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
428 }
qs8_dwconv_up16x9__sse41_mul16(benchmark::State & state,const char * net)429 static void qs8_dwconv_up16x9__sse41_mul16(benchmark::State& state, const char* net) {
430 DWConvBenchmark(state,
431 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16,
432 xnn_init_qs8_conv_minmax_fp32_sse4_params,
433 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
434 }
qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State & state,const char * net)435 static void qs8_dwconv_up8x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
436 DWConvBenchmark(state,
437 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16,
438 xnn_init_qs8_conv_minmax_fp32_sse4_params,
439 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
440 }
qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State & state,const char * net)441 static void qs8_dwconv_up16x9__sse41_mul16_add16(benchmark::State& state, const char* net) {
442 DWConvBenchmark(state,
443 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul16_add16,
444 xnn_init_qs8_conv_minmax_fp32_sse4_params,
445 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
446 }
qs8_dwconv_up8x9__sse41_mul32(benchmark::State & state,const char * net)447 static void qs8_dwconv_up8x9__sse41_mul32(benchmark::State& state, const char* net) {
448 DWConvBenchmark(state,
449 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul32,
450 xnn_init_qs8_conv_minmax_fp32_sse4_params,
451 8 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
452 }
qs8_dwconv_up16x9__sse41_mul32(benchmark::State & state,const char * net)453 static void qs8_dwconv_up16x9__sse41_mul32(benchmark::State& state, const char* net) {
454 DWConvBenchmark(state,
455 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse41_mul32,
456 xnn_init_qs8_conv_minmax_fp32_sse4_params,
457 16 /* channel tile */, 9 /* primary tile */, benchmark::utils::CheckSSE41);
458 }
qs8_dwconv_up8x9__sse2_mul16(benchmark::State & state,const char * net)459 static void qs8_dwconv_up8x9__sse2_mul16(benchmark::State& state, const char* net) {
460 DWConvBenchmark(state,
461 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16,
462 xnn_init_qs8_conv_minmax_fp32_sse2_params,
463 8 /* channel tile */, 9 /* primary tile */);
464 }
qs8_dwconv_up16x9__sse2_mul16(benchmark::State & state,const char * net)465 static void qs8_dwconv_up16x9__sse2_mul16(benchmark::State& state, const char* net) {
466 DWConvBenchmark(state,
467 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16,
468 xnn_init_qs8_conv_minmax_fp32_sse2_params,
469 16 /* channel tile */, 9 /* primary tile */);
470 }
qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State & state,const char * net)471 static void qs8_dwconv_up8x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
472 DWConvBenchmark(state,
473 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16,
474 xnn_init_qs8_conv_minmax_fp32_sse2_params,
475 8 /* channel tile */, 9 /* primary tile */);
476 }
qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State & state,const char * net)477 static void qs8_dwconv_up16x9__sse2_mul16_add16(benchmark::State& state, const char* net) {
478 DWConvBenchmark(state,
479 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__sse2_mul16_add16,
480 xnn_init_qs8_conv_minmax_fp32_sse2_params,
481 16 /* channel tile */, 9 /* primary tile */);
482 }
483
// Hand every x86/x86-64 wrapper defined in this section to BENCHMARK_DWCONV,
// grouped by instruction set.
// NOTE(review): the macro is not defined in this file; presumably it comes from
// bench/dwconv.h and supplies the benchmark arguments and the `net` string.

// AVX512SKX
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx512skx_mul32);
BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx512skx_mul32);

// AVX2
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpmovsx);
BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpmovsx);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_vpunpck);
BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_vpunpck);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul16_add16_vpunpck);
BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul16_add16_vpunpck);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx2_mul32);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx2_mul32);
BENCHMARK_DWCONV(qs8_dwconv_up32x9__avx2_mul32);

// XOP
BENCHMARK_DWCONV(qs8_dwconv_up8x9__xop_mul16_add16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__xop_mul16_add16);

// AVX
BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul16_add16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul16_add16);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__avx_mul32);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__avx_mul32);

// SSE4.1
BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul16_add16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul16_add16);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse41_mul32);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse41_mul32);

// SSE2
BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up8x9__sse2_mul16_add16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__sse2_mul16_add16);
518 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
519
520
521 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State & state,const char * net)522 static void qs8_dwconv_up8x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
523 DWConvBenchmark(state,
524 xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16,
525 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
526 8 /* channel tile */, 9 /* primary tile */);
527 }
qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State & state,const char * net)528 static void qs8_dwconv_up16x9__wasmsimd_mul16(benchmark::State& state, const char* net) {
529 DWConvBenchmark(state,
530 xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16,
531 xnn_init_qs8_conv_minmax_fp32_wasmsimd_params,
532 16 /* channel tile */, 9 /* primary tile */);
533 }
534
// Hand the two WAsm SIMD wrappers defined in this section to BENCHMARK_DWCONV.
// NOTE(review): the macro is not defined in this file; presumably from bench/dwconv.h.
BENCHMARK_DWCONV(qs8_dwconv_up8x9__wasmsimd_mul16);
BENCHMARK_DWCONV(qs8_dwconv_up16x9__wasmsimd_mul16);
537 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
538
539
540 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qs8_dwconv_up1x9__wasm_fmagic(benchmark::State & state,const char * net)541 static void qs8_dwconv_up1x9__wasm_fmagic(benchmark::State& state, const char* net) {
542 DWConvBenchmark(state,
543 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__wasm_fmagic,
544 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
545 1 /* channel tile */, 9 /* primary tile */);
546 }
qs8_dwconv_up2x9__wasm_fmagic(benchmark::State & state,const char * net)547 static void qs8_dwconv_up2x9__wasm_fmagic(benchmark::State& state, const char* net) {
548 DWConvBenchmark(state,
549 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic,
550 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
551 2 /* channel tile */, 9 /* primary tile */);
552 }
qs8_dwconv_up4x9__wasm_fmagic(benchmark::State & state,const char * net)553 static void qs8_dwconv_up4x9__wasm_fmagic(benchmark::State& state, const char* net) {
554 DWConvBenchmark(state,
555 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__wasm_fmagic,
556 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
557 4 /* channel tile */, 9 /* primary tile */);
558 }
559
// Hand the three WAsm fmagic wrappers defined in this section to BENCHMARK_DWCONV.
// NOTE(review): the macro is not defined in this file; presumably from bench/dwconv.h.
BENCHMARK_DWCONV(qs8_dwconv_up1x9__wasm_fmagic);
BENCHMARK_DWCONV(qs8_dwconv_up2x9__wasm_fmagic);
BENCHMARK_DWCONV(qs8_dwconv_up4x9__wasm_fmagic);
563 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
564
565
qs8_dwconv_up1x9__scalar_fmagic(benchmark::State & state,const char * net)566 static void qs8_dwconv_up1x9__scalar_fmagic(benchmark::State& state, const char* net) {
567 DWConvBenchmark(state,
568 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic,
569 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
570 1 /* channel tile */, 9 /* primary tile */);
571 }
qs8_dwconv_up2x9__scalar_fmagic(benchmark::State & state,const char * net)572 static void qs8_dwconv_up2x9__scalar_fmagic(benchmark::State& state, const char* net) {
573 DWConvBenchmark(state,
574 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_fmagic,
575 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
576 2 /* channel tile */, 9 /* primary tile */);
577 }
qs8_dwconv_up4x9__scalar_fmagic(benchmark::State & state,const char * net)578 static void qs8_dwconv_up4x9__scalar_fmagic(benchmark::State& state, const char* net) {
579 DWConvBenchmark(state,
580 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_fmagic,
581 xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params,
582 4 /* channel tile */, 9 /* primary tile */);
583 }
584
qs8_dwconv_up1x9__scalar_imagic(benchmark::State & state,const char * net)585 static void qs8_dwconv_up1x9__scalar_imagic(benchmark::State& state, const char* net) {
586 DWConvBenchmark(state,
587 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_imagic,
588 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
589 1 /* channel tile */, 9 /* primary tile */);
590 }
qs8_dwconv_up2x9__scalar_imagic(benchmark::State & state,const char * net)591 static void qs8_dwconv_up2x9__scalar_imagic(benchmark::State& state, const char* net) {
592 DWConvBenchmark(state,
593 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic,
594 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
595 2 /* channel tile */, 9 /* primary tile */);
596 }
qs8_dwconv_up4x9__scalar_imagic(benchmark::State & state,const char * net)597 static void qs8_dwconv_up4x9__scalar_imagic(benchmark::State& state, const char* net) {
598 DWConvBenchmark(state,
599 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_imagic,
600 xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params,
601 4 /* channel tile */, 9 /* primary tile */);
602 }
603
qs8_dwconv_up1x9__scalar_lrintf(benchmark::State & state,const char * net)604 static void qs8_dwconv_up1x9__scalar_lrintf(benchmark::State& state, const char* net) {
605 DWConvBenchmark(state,
606 xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_lrintf,
607 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
608 1 /* channel tile */, 9 /* primary tile */);
609 }
qs8_dwconv_up2x9__scalar_lrintf(benchmark::State & state,const char * net)610 static void qs8_dwconv_up2x9__scalar_lrintf(benchmark::State& state, const char* net) {
611 DWConvBenchmark(state,
612 xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf,
613 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
614 2 /* channel tile */, 9 /* primary tile */);
615 }
qs8_dwconv_up4x9__scalar_lrintf(benchmark::State & state,const char * net)616 static void qs8_dwconv_up4x9__scalar_lrintf(benchmark::State& state, const char* net) {
617 DWConvBenchmark(state,
618 xnn_qs8_dwconv_minmax_fp32_ukernel_up4x9__scalar_lrintf,
619 xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params,
620 4 /* channel tile */, 9 /* primary tile */);
621 }
622
// Hand the scalar wrappers (compiled on every architecture, as they sit outside
// any #if block) to BENCHMARK_DWCONV, grouped by variant.
// NOTE(review): the macro is not defined in this file; presumably from bench/dwconv.h.

// fmagic
BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_fmagic);
BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_fmagic);
BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_fmagic);

// imagic
BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_imagic);
BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_imagic);
BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_imagic);

// lrintf
BENCHMARK_DWCONV(qs8_dwconv_up1x9__scalar_lrintf);
BENCHMARK_DWCONV(qs8_dwconv_up2x9__scalar_lrintf);
BENCHMARK_DWCONV(qs8_dwconv_up4x9__scalar_lrintf);
634
635
// Expand Google Benchmark's BENCHMARK_MAIN() here unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN (presumably set when another translation unit
// provides the program entry point — confirm against the build files).
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
639