xref: /aosp_15_r20/external/XNNPACK/bench/f32-dwconv2d-chw.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>
12 
13 #include <benchmark/benchmark.h>
14 #include "bench/dwconv.h"
15 #include "bench/utils.h"
16 
17 #include <xnnpack.h>
18 #include <xnnpack/aligned-allocator.h>
19 #include <xnnpack/common.h>
20 #include <xnnpack/dwconv.h>
21 #include <xnnpack/indirection.h>
22 #include <xnnpack/microfnptr.h>
23 #include <xnnpack/microparams-init.h>
24 #include <xnnpack/operator.h>
25 #include <xnnpack/pack.h>
26 
27 
f32_dwconv2d_chw(benchmark::State & state,xnn_f32_dwconv2d_chw_ukernel_function dwconv,uint32_t kh,uint32_t kw,uint32_t pw,uint32_t s,benchmark::utils::IsaCheckFunction isa_check=nullptr)28 static void f32_dwconv2d_chw(benchmark::State& state,
29   xnn_f32_dwconv2d_chw_ukernel_function dwconv,
30   uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s,
31   benchmark::utils::IsaCheckFunction isa_check = nullptr)
32 {
33   if (isa_check && !isa_check(state)) {
34     return;
35   }
36 
37   const size_t input_height = state.range(0);
38   const size_t input_width = state.range(1);
39   const size_t kernel_height = state.range(2);
40   const size_t kernel_width = state.range(3);
41   const size_t padding_height = state.range(4);
42   const size_t padding_width = state.range(5);
43   const size_t subsampling = state.range(6);
44   const size_t dilation = state.range(7);
45   const size_t channels = state.range(8);
46 
47   if (kernel_height != kh) {
48     state.SkipWithError("kernel height mismatch");
49     return;
50   }
51 
52   if (kernel_width != kw) {
53     state.SkipWithError("kernel width mismatch");
54     return;
55   }
56 
57   if (subsampling != s) {
58     state.SkipWithError("subsampling mismatch");
59     return;
60   }
61 
62   if (padding_width % 2 != 0 || padding_width / 2 != pw) {
63     state.SkipWithError("padding width mismatch");
64     return;
65   }
66 
67   if (dilation != 1) {
68     state.SkipWithError("unsupported dilation");
69     return;
70   }
71 
72   std::random_device random_device;
73   auto rng = std::mt19937(random_device());
74   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
75 
76   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
77   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
78   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
79   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
80 
81   const size_t inputSize = (input_height + padding_height) * input_width;
82   const size_t kernel_size = kernel_height * kernel_width;
83   const size_t output_size = output_height * output_width;
84 
85   std::vector<float> input(inputSize * channels + 2 * XNN_EXTRA_BYTES);
86   std::generate(input.begin(), input.end(), std::ref(f32rng));
87   std::vector<float> bias(channels);
88   std::generate(bias.begin(), bias.end(), std::ref(f32rng));
89   std::vector<float> kernel(channels * kernel_size);
90   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
91   std::vector<float> zero(input_width + padding_width);
92 
93   const size_t w_elements = (kernel_size + 1) * channels;
94   const size_t o_elements = output_size * channels;
95   const size_t num_buffers = 1 +
96     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
97       sizeof(float) * (w_elements + o_elements));
98 
99   std::vector<float, AlignedAllocator<float, 64>> packed_weights(w_elements * num_buffers);
100   std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
101   for (size_t c = 0; c < channels; c++) {
102     packed_weights[c * kernel_size + c] = bias[c];
103     for (size_t i = 0; i < kernel_size; i++) {
104       packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
105     }
106   }
107   for (size_t n = 1; n < num_buffers; n++) {
108     std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
109   }
110 
111   std::vector<float> output(o_elements * num_buffers);
112   std::fill(output.begin(), output.end(), std::nanf(""));
113 
114   xnn_f32_chw_params chw_params;
115   xnn_init_f32_chw_params(
116     &chw_params, input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
117 
118   size_t buffer_index = 0;
119   for (auto _ : state) {
120     state.PauseTiming();
121     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
122     buffer_index = (buffer_index + 1) % num_buffers;
123     state.ResumeTiming();
124 
125     for (uint32_t channel = 0; channel < channels; channel++) {
126       dwconv(
127         input_height, input_width * sizeof(float),
128         input.data() + channel * inputSize,
129         packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
130         zero.data(),
131         output.data() + channel * output_size + buffer_index * o_elements,
132         padding_height / 2,  // padding_top
133         &chw_params);
134     }
135   }
136 
137   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
138   if (cpu_frequency != 0) {
139     state.counters["cpufreq"] = cpu_frequency;
140   }
141 
142   state.counters["FLOPS"] = benchmark::Counter(
143     uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
144     benchmark::Counter::kIsRate);
145 
146   state.counters["bytes"] = benchmark::Counter(
147     uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(float),
148     benchmark::Counter::kIsRate);
149 }
150 
151 #if XNN_ARCH_ARM
dwconv2d_chw_3x3p1__neon_1x4(benchmark::State & state,const char * net)152   static void dwconv2d_chw_3x3p1__neon_1x4(benchmark::State& state, const char* net) {
153     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
154   }
dwconv2d_chw_3x3p1__neon_2x4(benchmark::State & state,const char * net)155   static void dwconv2d_chw_3x3p1__neon_2x4(benchmark::State& state, const char* net) {
156     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
157   }
dwconv2d_chw_3x3p1__neon_3x4(benchmark::State & state,const char * net)158   static void dwconv2d_chw_3x3p1__neon_3x4(benchmark::State& state, const char* net) {
159     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
160   }
dwconv2d_chw_3x3p1__neon_4x4(benchmark::State & state,const char * net)161   static void dwconv2d_chw_3x3p1__neon_4x4(benchmark::State& state, const char* net) {
162     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
163   }
dwconv2d_chw_3x3p1__neon_5x4(benchmark::State & state,const char * net)164   static void dwconv2d_chw_3x3p1__neon_5x4(benchmark::State& state, const char* net) {
165     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
166   }
dwconv2d_chw_3x3p1__neon_6x4(benchmark::State & state,const char * net)167   static void dwconv2d_chw_3x3p1__neon_6x4(benchmark::State& state, const char* net) {
168     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
169   }
dwconv2d_chw_3x3p1__neon_1x4_acc2(benchmark::State & state,const char * net)170   static void dwconv2d_chw_3x3p1__neon_1x4_acc2(benchmark::State& state, const char* net) {
171     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckNEON);
172   }
dwconv2d_chw_3x3p1__neon_1x4_acc3(benchmark::State & state,const char * net)173   static void dwconv2d_chw_3x3p1__neon_1x4_acc3(benchmark::State& state, const char* net) {
174     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3, 3, 3, 1, 1, benchmark::utils::CheckNEON);
175   }
dwconv2d_chw_3x3p1__neon_1x4_acc4(benchmark::State & state,const char * net)176   static void dwconv2d_chw_3x3p1__neon_1x4_acc4(benchmark::State& state, const char* net) {
177     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
178   }
dwconv2d_chw_3x3p1__neon_2x4_acc2(benchmark::State & state,const char * net)179   static void dwconv2d_chw_3x3p1__neon_2x4_acc2(benchmark::State& state, const char* net) {
180     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckNEON);
181   }
182 
dwconv2d_chw_3x3s2p1__neon_1x4(benchmark::State & state,const char * net)183   static void dwconv2d_chw_3x3s2p1__neon_1x4(benchmark::State& state, const char* net) {
184     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
185   }
dwconv2d_chw_3x3s2p1__neon_2x4(benchmark::State & state,const char * net)186   static void dwconv2d_chw_3x3s2p1__neon_2x4(benchmark::State& state, const char* net) {
187     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_2x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
188   }
dwconv2d_chw_3x3s2p1__neon_3x4(benchmark::State & state,const char * net)189   static void dwconv2d_chw_3x3s2p1__neon_3x4(benchmark::State& state, const char* net) {
190     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_3x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
191   }
dwconv2d_chw_3x3s2p1__neon_4x4(benchmark::State & state,const char * net)192   static void dwconv2d_chw_3x3s2p1__neon_4x4(benchmark::State& state, const char* net) {
193     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
194   }
dwconv2d_chw_3x3s2p1__neon_1x4_acc2(benchmark::State & state,const char * net)195   static void dwconv2d_chw_3x3s2p1__neon_1x4_acc2(benchmark::State& state, const char* net) {
196     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc2, 3, 3, 1, 2, benchmark::utils::CheckNEON);
197   }
dwconv2d_chw_3x3s2p1__neon_1x4_acc3(benchmark::State & state,const char * net)198   static void dwconv2d_chw_3x3s2p1__neon_1x4_acc3(benchmark::State& state, const char* net) {
199     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc3, 3, 3, 1, 2, benchmark::utils::CheckNEON);
200   }
dwconv2d_chw_3x3s2p1__neon_1x4_acc4(benchmark::State & state,const char * net)201   static void dwconv2d_chw_3x3s2p1__neon_1x4_acc4(benchmark::State& state, const char* net) {
202     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
203   }
dwconv2d_chw_3x3s2p1__neon_2x4_acc2(benchmark::State & state,const char * net)204   static void dwconv2d_chw_3x3s2p1__neon_2x4_acc2(benchmark::State& state, const char* net) {
205     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_2x4_acc2, 3, 3, 1, 2, benchmark::utils::CheckNEON);
206   }
207 
dwconv2d_chw_5x5p2__neon_1x4(benchmark::State & state,const char * net)208   static void dwconv2d_chw_5x5p2__neon_1x4(benchmark::State& state, const char* net) {
209     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
210   }
dwconv2d_chw_5x5p2__neon_2x4(benchmark::State & state,const char * net)211   static void dwconv2d_chw_5x5p2__neon_2x4(benchmark::State& state, const char* net) {
212     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
213   }
dwconv2d_chw_5x5p2__neon_3x4(benchmark::State & state,const char * net)214   static void dwconv2d_chw_5x5p2__neon_3x4(benchmark::State& state, const char* net) {
215     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
216   }
dwconv2d_chw_5x5p2__neon_4x4(benchmark::State & state,const char * net)217   static void dwconv2d_chw_5x5p2__neon_4x4(benchmark::State& state, const char* net) {
218     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
219   }
dwconv2d_chw_5x5p2__neon_5x4(benchmark::State & state,const char * net)220   static void dwconv2d_chw_5x5p2__neon_5x4(benchmark::State& state, const char* net) {
221     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
222   }
dwconv2d_chw_5x5p2__neon_1x4_acc2(benchmark::State & state,const char * net)223   static void dwconv2d_chw_5x5p2__neon_1x4_acc2(benchmark::State& state, const char* net) {
224     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
225   }
dwconv2d_chw_5x5p2__neon_1x4_acc3(benchmark::State & state,const char * net)226   static void dwconv2d_chw_5x5p2__neon_1x4_acc3(benchmark::State& state, const char* net) {
227     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3, 5, 5, 2, 1, benchmark::utils::CheckNEON);
228   }
dwconv2d_chw_5x5p2__neon_1x4_acc4(benchmark::State & state,const char * net)229   static void dwconv2d_chw_5x5p2__neon_1x4_acc4(benchmark::State& state, const char* net) {
230     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
231   }
dwconv2d_chw_5x5p2__neon_1x4_acc5(benchmark::State & state,const char * net)232   static void dwconv2d_chw_5x5p2__neon_1x4_acc5(benchmark::State& state, const char* net) {
233     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5, 5, 5, 2, 1, benchmark::utils::CheckNEON);
234   }
dwconv2d_chw_5x5p2__neon_2x4_acc2(benchmark::State & state,const char * net)235   static void dwconv2d_chw_5x5p2__neon_2x4_acc2(benchmark::State& state, const char* net) {
236     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
237   }
dwconv2d_chw_5x5p2__neon_2x4_acc3(benchmark::State & state,const char * net)238   static void dwconv2d_chw_5x5p2__neon_2x4_acc3(benchmark::State& state, const char* net) {
239     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3, 5, 5, 2, 1, benchmark::utils::CheckNEON);
240   }
dwconv2d_chw_5x5p2__neon_3x4_acc2(benchmark::State & state,const char * net)241   static void dwconv2d_chw_5x5p2__neon_3x4_acc2(benchmark::State& state, const char* net) {
242     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
243   }
dwconv2d_chw_5x5p2__neon_4x4_acc2(benchmark::State & state,const char * net)244   static void dwconv2d_chw_5x5p2__neon_4x4_acc2(benchmark::State& state, const char* net) {
245     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
246   }
247 
dwconv2d_chw_5x5s2p2__neon_1x4(benchmark::State & state,const char * net)248   static void dwconv2d_chw_5x5s2p2__neon_1x4(benchmark::State& state, const char* net) {
249     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
250   }
dwconv2d_chw_5x5s2p2__neon_2x4(benchmark::State & state,const char * net)251   static void dwconv2d_chw_5x5s2p2__neon_2x4(benchmark::State& state, const char* net) {
252     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
253   }
dwconv2d_chw_5x5s2p2__neon_3x4(benchmark::State & state,const char * net)254   static void dwconv2d_chw_5x5s2p2__neon_3x4(benchmark::State& state, const char* net) {
255     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
256   }
dwconv2d_chw_5x5s2p2__neon_1x4_acc2(benchmark::State & state,const char * net)257   static void dwconv2d_chw_5x5s2p2__neon_1x4_acc2(benchmark::State& state, const char* net) {
258     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2, 5, 5, 2, 2, benchmark::utils::CheckNEON);
259   }
dwconv2d_chw_5x5s2p2__neon_1x4_acc3(benchmark::State & state,const char * net)260   static void dwconv2d_chw_5x5s2p2__neon_1x4_acc3(benchmark::State& state, const char* net) {
261     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3, 5, 5, 2, 2, benchmark::utils::CheckNEON);
262   }
dwconv2d_chw_5x5s2p2__neon_1x4_acc4(benchmark::State & state,const char * net)263   static void dwconv2d_chw_5x5s2p2__neon_1x4_acc4(benchmark::State& state, const char* net) {
264     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
265   }
dwconv2d_chw_5x5s2p2__neon_1x4_acc5(benchmark::State & state,const char * net)266   static void dwconv2d_chw_5x5s2p2__neon_1x4_acc5(benchmark::State& state, const char* net) {
267     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc5, 5, 5, 2, 2, benchmark::utils::CheckNEON);
268   }
dwconv2d_chw_5x5s2p2__neon_2x4_acc2(benchmark::State & state,const char * net)269   static void dwconv2d_chw_5x5s2p2__neon_2x4_acc2(benchmark::State& state, const char* net) {
270     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc2, 5, 5, 2, 2, benchmark::utils::CheckNEON);
271   }
dwconv2d_chw_5x5s2p2__neon_2x4_acc3(benchmark::State & state,const char * net)272   static void dwconv2d_chw_5x5s2p2__neon_2x4_acc3(benchmark::State& state, const char* net) {
273     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc3, 5, 5, 2, 2, benchmark::utils::CheckNEON);
274   }
dwconv2d_chw_5x5s2p2__neon_3x4_acc2(benchmark::State & state,const char * net)275   static void dwconv2d_chw_5x5s2p2__neon_3x4_acc2(benchmark::State& state, const char* net) {
276     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2, 5, 5, 2, 2, benchmark::utils::CheckNEON);
277   }
278 
279   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_2x4)280   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_2x4)
281   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_3x4)
282   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_4x4)
283   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_5x4)
284   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_6x4)
285   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4_acc2)
286   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4_acc3)
287   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4_acc4)
288   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_2x4_acc2)
289 
290   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4)
291   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_2x4)
292   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_3x4)
293   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_4x4)
294   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4_acc2)
295   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4_acc3)
296   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4_acc4)
297   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_2x4_acc2)
298 
299   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4)
300   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_2x4)
301   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_3x4)
302   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_4x4)
303   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_5x4)
304   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc2)
305   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc3)
306   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc4)
307   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc5)
308   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_2x4_acc2)
309   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_2x4_acc3)
310   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_3x4_acc2)
311   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_4x4_acc2)
312 
313   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4)
314   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_2x4)
315   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_3x4)
316   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc2)
317   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc3)
318   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc4)
319   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc5)
320   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_2x4_acc2)
321   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_2x4_acc3)
322   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_3x4_acc2)
323 #endif  // XNN_ARCH_ARM
324 
325 #if XNN_ARCH_ARM64
326   static void dwconv2d_chw_3x3p1__neonfma_1x4(benchmark::State& state, const char* net) {
327     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4, 3, 3, 1, 1);
328   }
dwconv2d_chw_3x3p1__neonfma_2x4(benchmark::State & state,const char * net)329   static void dwconv2d_chw_3x3p1__neonfma_2x4(benchmark::State& state, const char* net) {
330     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_2x4, 3, 3, 1, 1);
331   }
dwconv2d_chw_3x3p1__neonfma_3x4(benchmark::State & state,const char * net)332   static void dwconv2d_chw_3x3p1__neonfma_3x4(benchmark::State& state, const char* net) {
333     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4, 3, 3, 1, 1);
334   }
dwconv2d_chw_3x3p1__neonfma_4x4(benchmark::State & state,const char * net)335   static void dwconv2d_chw_3x3p1__neonfma_4x4(benchmark::State& state, const char* net) {
336     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4, 3, 3, 1, 1);
337   }
dwconv2d_chw_3x3p1__neonfma_5x4(benchmark::State & state,const char * net)338   static void dwconv2d_chw_3x3p1__neonfma_5x4(benchmark::State& state, const char* net) {
339     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4, 3, 3, 1, 1);
340   }
dwconv2d_chw_3x3p1__neonfma_6x4(benchmark::State & state,const char * net)341   static void dwconv2d_chw_3x3p1__neonfma_6x4(benchmark::State& state, const char* net) {
342     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4, 3, 3, 1, 1);
343   }
dwconv2d_chw_3x3p1__neonfma_1x4_acc2(benchmark::State & state,const char * net)344   static void dwconv2d_chw_3x3p1__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
345     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2, 3, 3, 1, 1);
346   }
dwconv2d_chw_3x3p1__neonfma_1x4_acc3(benchmark::State & state,const char * net)347   static void dwconv2d_chw_3x3p1__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
348     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3, 3, 3, 1, 1);
349   }
dwconv2d_chw_3x3p1__neonfma_1x4_acc4(benchmark::State & state,const char * net)350   static void dwconv2d_chw_3x3p1__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
351     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4, 3, 3, 1, 1);
352   }
dwconv2d_chw_3x3p1__neonfma_2x4_acc2(benchmark::State & state,const char * net)353   static void dwconv2d_chw_3x3p1__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
354     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_2x4_acc2, 3, 3, 1, 1);
355   }
356 
dwconv2d_chw_3x3s2p1__neonfma_1x4(benchmark::State & state,const char * net)357   static void dwconv2d_chw_3x3s2p1__neonfma_1x4(benchmark::State& state, const char* net) {
358     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4, 3, 3, 1, 2);
359   }
dwconv2d_chw_3x3s2p1__neonfma_2x4(benchmark::State & state,const char * net)360   static void dwconv2d_chw_3x3s2p1__neonfma_2x4(benchmark::State& state, const char* net) {
361     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4, 3, 3, 1, 2);
362   }
dwconv2d_chw_3x3s2p1__neonfma_3x4(benchmark::State & state,const char * net)363   static void dwconv2d_chw_3x3s2p1__neonfma_3x4(benchmark::State& state, const char* net) {
364     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_3x4, 3, 3, 1, 2);
365   }
dwconv2d_chw_3x3s2p1__neonfma_4x4(benchmark::State & state,const char * net)366   static void dwconv2d_chw_3x3s2p1__neonfma_4x4(benchmark::State& state, const char* net) {
367     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4, 3, 3, 1, 2);
368   }
dwconv2d_chw_3x3s2p1__neonfma_1x4_acc2(benchmark::State & state,const char * net)369   static void dwconv2d_chw_3x3s2p1__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
370     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc2, 3, 3, 1, 2);
371   }
dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3(benchmark::State & state,const char * net)372   static void dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
373     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc3, 3, 3, 1, 2);
374   }
dwconv2d_chw_3x3s2p1__neonfma_1x4_acc4(benchmark::State & state,const char * net)375   static void dwconv2d_chw_3x3s2p1__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
376     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc4, 3, 3, 1, 2);
377   }
dwconv2d_chw_3x3s2p1__neonfma_2x4_acc2(benchmark::State & state,const char * net)378   static void dwconv2d_chw_3x3s2p1__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
379     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2, 3, 3, 1, 2);
380   }
381 
dwconv2d_chw_5x5p2__neonfma_1x4(benchmark::State & state,const char * net)382   static void dwconv2d_chw_5x5p2__neonfma_1x4(benchmark::State& state, const char* net) {
383     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4, 5, 5, 2, 1);
384   }
dwconv2d_chw_5x5p2__neonfma_2x4(benchmark::State & state,const char * net)385   static void dwconv2d_chw_5x5p2__neonfma_2x4(benchmark::State& state, const char* net) {
386     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4, 5, 5, 2, 1);
387   }
dwconv2d_chw_5x5p2__neonfma_3x4(benchmark::State & state,const char * net)388   static void dwconv2d_chw_5x5p2__neonfma_3x4(benchmark::State& state, const char* net) {
389     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4, 5, 5, 2, 1);
390   }
dwconv2d_chw_5x5p2__neonfma_4x4(benchmark::State & state,const char * net)391   static void dwconv2d_chw_5x5p2__neonfma_4x4(benchmark::State& state, const char* net) {
392     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4, 5, 5, 2, 1);
393   }
dwconv2d_chw_5x5p2__neonfma_5x4(benchmark::State & state,const char * net)394   static void dwconv2d_chw_5x5p2__neonfma_5x4(benchmark::State& state, const char* net) {
395     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4, 5, 5, 2, 1);
396   }
dwconv2d_chw_5x5p2__neonfma_1x4_acc2(benchmark::State & state,const char * net)397   static void dwconv2d_chw_5x5p2__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
398     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2, 5, 5, 2, 1);
399   }
dwconv2d_chw_5x5p2__neonfma_1x4_acc3(benchmark::State & state,const char * net)400   static void dwconv2d_chw_5x5p2__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
401     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3, 5, 5, 2, 1);
402   }
dwconv2d_chw_5x5p2__neonfma_1x4_acc4(benchmark::State & state,const char * net)403   static void dwconv2d_chw_5x5p2__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
404     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4, 5, 5, 2, 1);
405   }
dwconv2d_chw_5x5p2__neonfma_1x4_acc5(benchmark::State & state,const char * net)406   static void dwconv2d_chw_5x5p2__neonfma_1x4_acc5(benchmark::State& state, const char* net) {
407     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5, 5, 5, 2, 1);
408   }
dwconv2d_chw_5x5p2__neonfma_2x4_acc2(benchmark::State & state,const char * net)409   static void dwconv2d_chw_5x5p2__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
410     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2, 5, 5, 2, 1);
411   }
dwconv2d_chw_5x5p2__neonfma_2x4_acc3(benchmark::State & state,const char * net)412   static void dwconv2d_chw_5x5p2__neonfma_2x4_acc3(benchmark::State& state, const char* net) {
413     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3, 5, 5, 2, 1);
414   }
dwconv2d_chw_5x5p2__neonfma_3x4_acc2(benchmark::State & state,const char * net)415   static void dwconv2d_chw_5x5p2__neonfma_3x4_acc2(benchmark::State& state, const char* net) {
416     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2, 5, 5, 2, 1);
417   }
dwconv2d_chw_5x5p2__neonfma_4x4_acc2(benchmark::State & state,const char * net)418   static void dwconv2d_chw_5x5p2__neonfma_4x4_acc2(benchmark::State& state, const char* net) {
419     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2, 5, 5, 2, 1);
420   }
421 
dwconv2d_chw_5x5s2p2__neonfma_1x4(benchmark::State & state,const char * net)422   static void dwconv2d_chw_5x5s2p2__neonfma_1x4(benchmark::State& state, const char* net) {
423     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4, 5, 5, 2, 2);
424   }
dwconv2d_chw_5x5s2p2__neonfma_2x4(benchmark::State & state,const char * net)425   static void dwconv2d_chw_5x5s2p2__neonfma_2x4(benchmark::State& state, const char* net) {
426     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4, 5, 5, 2, 2);
427   }
dwconv2d_chw_5x5s2p2__neonfma_3x4(benchmark::State & state,const char * net)428   static void dwconv2d_chw_5x5s2p2__neonfma_3x4(benchmark::State& state, const char* net) {
429     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4, 5, 5, 2, 2);
430   }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2(benchmark::State & state,const char * net)431   static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
432     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2, 5, 5, 2, 2);
433   }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc3(benchmark::State & state,const char * net)434   static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
435     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3, 5, 5, 2, 2);
436   }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc4(benchmark::State & state,const char * net)437   static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
438     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4, 5, 5, 2, 2);
439   }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc5(benchmark::State & state,const char * net)440   static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc5(benchmark::State& state, const char* net) {
441     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc5, 5, 5, 2, 2);
442   }
dwconv2d_chw_5x5s2p2__neonfma_2x4_acc2(benchmark::State & state,const char * net)443   static void dwconv2d_chw_5x5s2p2__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
444     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc2, 5, 5, 2, 2);
445   }
dwconv2d_chw_5x5s2p2__neonfma_2x4_acc3(benchmark::State & state,const char * net)446   static void dwconv2d_chw_5x5s2p2__neonfma_2x4_acc3(benchmark::State& state, const char* net) {
447     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc3, 5, 5, 2, 2);
448   }
dwconv2d_chw_5x5s2p2__neonfma_3x4_acc2(benchmark::State & state,const char * net)449   static void dwconv2d_chw_5x5s2p2__neonfma_3x4_acc2(benchmark::State& state, const char* net) {
450     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2, 5, 5, 2, 2);
451   }
453   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_2x4)454   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_2x4)
455   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_3x4)
456   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_4x4)
457   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_5x4)
458   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_6x4)
459   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4_acc2)
460   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4_acc3)
461   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4_acc4)
462   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_2x4_acc2)
463 
464   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4)
465   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_2x4)
466   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_3x4)
467   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_4x4)
468   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4_acc2)
469   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3)
470   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4_acc4)
471   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_2x4_acc2)
472 
473   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4)
474   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_2x4)
475   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_3x4)
476   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_4x4)
477   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_5x4)
478   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc2)
479   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc3)
480   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc4)
481   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc5)
482   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_2x4_acc2)
483   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_2x4_acc3)
484   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_3x4_acc2)
485   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_4x4_acc2)
486 
487   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4)
488   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_2x4)
489   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_3x4)
490   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2)
491   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc3)
492   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc4)
493   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc5)
494   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_2x4_acc2)
495   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_2x4_acc3)
496   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_3x4_acc2)
#endif  // XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
500   static void dwconv2d_chw_3x3p1__sse_1x4(benchmark::State& state, const char* net) {
501     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4, 3, 3, 1, 1);
502   }
dwconv2d_chw_3x3p1__sse_2x4(benchmark::State & state,const char * net)503   static void dwconv2d_chw_3x3p1__sse_2x4(benchmark::State& state, const char* net) {
504     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4, 3, 3, 1, 1);
505   }
dwconv2d_chw_3x3p1__sse_3x4(benchmark::State & state,const char * net)506   static void dwconv2d_chw_3x3p1__sse_3x4(benchmark::State& state, const char* net) {
507     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4, 3, 3, 1, 1);
508   }
dwconv2d_chw_3x3p1__sse_4x4(benchmark::State & state,const char * net)509   static void dwconv2d_chw_3x3p1__sse_4x4(benchmark::State& state, const char* net) {
510     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4, 3, 3, 1, 1);
511   }
dwconv2d_chw_3x3p1__sse_5x4(benchmark::State & state,const char * net)512   static void dwconv2d_chw_3x3p1__sse_5x4(benchmark::State& state, const char* net) {
513     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4, 3, 3, 1, 1);
514   }
dwconv2d_chw_3x3p1__sse_6x4(benchmark::State & state,const char * net)515   static void dwconv2d_chw_3x3p1__sse_6x4(benchmark::State& state, const char* net) {
516     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4, 3, 3, 1, 1);
517   }
dwconv2d_chw_3x3p1__sse_1x4_acc2(benchmark::State & state,const char * net)518   static void dwconv2d_chw_3x3p1__sse_1x4_acc2(benchmark::State& state, const char* net) {
519     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc2, 3, 3, 1, 1);
520   }
dwconv2d_chw_3x3p1__sse_1x4_acc3(benchmark::State & state,const char * net)521   static void dwconv2d_chw_3x3p1__sse_1x4_acc3(benchmark::State& state, const char* net) {
522     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc3, 3, 3, 1, 1);
523   }
dwconv2d_chw_3x3p1__sse_1x4_acc4(benchmark::State & state,const char * net)524   static void dwconv2d_chw_3x3p1__sse_1x4_acc4(benchmark::State& state, const char* net) {
525     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4, 3, 3, 1, 1);
526   }
dwconv2d_chw_3x3p1__sse_2x4_acc2(benchmark::State & state,const char * net)527   static void dwconv2d_chw_3x3p1__sse_2x4_acc2(benchmark::State& state, const char* net) {
528     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2, 3, 3, 1, 1);
529   }
dwconv2d_chw_3x3p1__ssse3_1x4(benchmark::State & state,const char * net)531   static void dwconv2d_chw_3x3p1__ssse3_1x4(benchmark::State& state, const char* net) {
532     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
533   }
dwconv2d_chw_3x3p1__ssse3_2x4(benchmark::State & state,const char * net)534   static void dwconv2d_chw_3x3p1__ssse3_2x4(benchmark::State& state, const char* net) {
535     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
536   }
dwconv2d_chw_3x3p1__ssse3_3x4(benchmark::State & state,const char * net)537   static void dwconv2d_chw_3x3p1__ssse3_3x4(benchmark::State& state, const char* net) {
538     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
539   }
dwconv2d_chw_3x3p1__ssse3_4x4(benchmark::State & state,const char * net)540   static void dwconv2d_chw_3x3p1__ssse3_4x4(benchmark::State& state, const char* net) {
541     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
542   }
dwconv2d_chw_3x3p1__ssse3_5x4(benchmark::State & state,const char * net)543   static void dwconv2d_chw_3x3p1__ssse3_5x4(benchmark::State& state, const char* net) {
544     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
545   }
dwconv2d_chw_3x3p1__ssse3_6x4(benchmark::State & state,const char * net)546   static void dwconv2d_chw_3x3p1__ssse3_6x4(benchmark::State& state, const char* net) {
547     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
548   }
dwconv2d_chw_3x3p1__ssse3_1x4_acc2(benchmark::State & state,const char * net)549   static void dwconv2d_chw_3x3p1__ssse3_1x4_acc2(benchmark::State& state, const char* net) {
550     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
551   }
dwconv2d_chw_3x3p1__ssse3_1x4_acc3(benchmark::State & state,const char * net)552   static void dwconv2d_chw_3x3p1__ssse3_1x4_acc3(benchmark::State& state, const char* net) {
553     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
554   }
dwconv2d_chw_3x3p1__ssse3_1x4_acc4(benchmark::State & state,const char * net)555   static void dwconv2d_chw_3x3p1__ssse3_1x4_acc4(benchmark::State& state, const char* net) {
556     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
557   }
dwconv2d_chw_3x3p1__ssse3_2x4_acc2(benchmark::State & state,const char * net)558   static void dwconv2d_chw_3x3p1__ssse3_2x4_acc2(benchmark::State& state, const char* net) {
559     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
560   }
dwconv2d_chw_3x3s2p1__sse_1x4(benchmark::State & state,const char * net)562   static void dwconv2d_chw_3x3s2p1__sse_1x4(benchmark::State& state, const char* net) {
563     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4, 3, 3, 1, 2);
564   }
dwconv2d_chw_3x3s2p1__sse_2x4(benchmark::State & state,const char * net)565   static void dwconv2d_chw_3x3s2p1__sse_2x4(benchmark::State& state, const char* net) {
566     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_2x4, 3, 3, 1, 2);
567   }
dwconv2d_chw_3x3s2p1__sse_3x4(benchmark::State & state,const char * net)568   static void dwconv2d_chw_3x3s2p1__sse_3x4(benchmark::State& state, const char* net) {
569     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_3x4, 3, 3, 1, 2);
570   }
dwconv2d_chw_3x3s2p1__sse_4x4(benchmark::State & state,const char * net)571   static void dwconv2d_chw_3x3s2p1__sse_4x4(benchmark::State& state, const char* net) {
572     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_4x4, 3, 3, 1, 2);
573   }
dwconv2d_chw_3x3s2p1__sse_1x4_acc2(benchmark::State & state,const char * net)574   static void dwconv2d_chw_3x3s2p1__sse_1x4_acc2(benchmark::State& state, const char* net) {
575     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc2, 3, 3, 1, 2);
576   }
dwconv2d_chw_3x3s2p1__sse_1x4_acc3(benchmark::State & state,const char * net)577   static void dwconv2d_chw_3x3s2p1__sse_1x4_acc3(benchmark::State& state, const char* net) {
578     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3, 3, 3, 1, 2);
579   }
dwconv2d_chw_3x3s2p1__sse_1x4_acc4(benchmark::State & state,const char * net)580   static void dwconv2d_chw_3x3s2p1__sse_1x4_acc4(benchmark::State& state, const char* net) {
581     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc4, 3, 3, 1, 2);
582   }
dwconv2d_chw_3x3s2p1__sse_2x4_acc2(benchmark::State & state,const char * net)583   static void dwconv2d_chw_3x3s2p1__sse_2x4_acc2(benchmark::State& state, const char* net) {
584     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_2x4_acc2, 3, 3, 1, 2);
585   }
dwconv2d_chw_5x5p2__sse_1x4(benchmark::State & state,const char * net)587   static void dwconv2d_chw_5x5p2__sse_1x4(benchmark::State& state, const char* net) {
588     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4, 5, 5, 2, 1);
589   }
dwconv2d_chw_5x5p2__sse_2x4(benchmark::State & state,const char * net)590   static void dwconv2d_chw_5x5p2__sse_2x4(benchmark::State& state, const char* net) {
591     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4, 5, 5, 2, 1);
592   }
dwconv2d_chw_5x5p2__sse_3x4(benchmark::State & state,const char * net)593   static void dwconv2d_chw_5x5p2__sse_3x4(benchmark::State& state, const char* net) {
594     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4, 5, 5, 2, 1);
595   }
dwconv2d_chw_5x5p2__sse_4x4(benchmark::State & state,const char * net)596   static void dwconv2d_chw_5x5p2__sse_4x4(benchmark::State& state, const char* net) {
597     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4, 5, 5, 2, 1);
598   }
dwconv2d_chw_5x5p2__sse_5x4(benchmark::State & state,const char * net)599   static void dwconv2d_chw_5x5p2__sse_5x4(benchmark::State& state, const char* net) {
600     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4, 5, 5, 2, 1);
601   }
dwconv2d_chw_5x5p2__sse_1x4_acc2(benchmark::State & state,const char * net)602   static void dwconv2d_chw_5x5p2__sse_1x4_acc2(benchmark::State& state, const char* net) {
603     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2, 5, 5, 2, 1);
604   }
dwconv2d_chw_5x5p2__sse_1x4_acc3(benchmark::State & state,const char * net)605   static void dwconv2d_chw_5x5p2__sse_1x4_acc3(benchmark::State& state, const char* net) {
606     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3, 5, 5, 2, 1);
607   }
dwconv2d_chw_5x5p2__sse_1x4_acc4(benchmark::State & state,const char * net)608   static void dwconv2d_chw_5x5p2__sse_1x4_acc4(benchmark::State& state, const char* net) {
609     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4, 5, 5, 2, 1);
610   }
dwconv2d_chw_5x5p2__sse_1x4_acc5(benchmark::State & state,const char * net)611   static void dwconv2d_chw_5x5p2__sse_1x4_acc5(benchmark::State& state, const char* net) {
612     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5, 5, 5, 2, 1);
613   }
dwconv2d_chw_5x5p2__sse_2x4_acc2(benchmark::State & state,const char * net)614   static void dwconv2d_chw_5x5p2__sse_2x4_acc2(benchmark::State& state, const char* net) {
615     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2, 5, 5, 2, 1);
616   }
dwconv2d_chw_5x5p2__sse_2x4_acc3(benchmark::State & state,const char * net)617   static void dwconv2d_chw_5x5p2__sse_2x4_acc3(benchmark::State& state, const char* net) {
618     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3, 5, 5, 2, 1);
619   }
dwconv2d_chw_5x5p2__sse_3x4_acc2(benchmark::State & state,const char * net)620   static void dwconv2d_chw_5x5p2__sse_3x4_acc2(benchmark::State& state, const char* net) {
621     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2, 5, 5, 2, 1);
622   }
dwconv2d_chw_5x5p2__sse_4x4_acc2(benchmark::State & state,const char * net)623   static void dwconv2d_chw_5x5p2__sse_4x4_acc2(benchmark::State& state, const char* net) {
624     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2, 5, 5, 2, 1);
625   }
dwconv2d_chw_5x5s2p2__sse_1x4(benchmark::State & state,const char * net)627   static void dwconv2d_chw_5x5s2p2__sse_1x4(benchmark::State& state, const char* net) {
628     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4, 5, 5, 2, 2);
629   }
dwconv2d_chw_5x5s2p2__sse_2x4(benchmark::State & state,const char * net)630   static void dwconv2d_chw_5x5s2p2__sse_2x4(benchmark::State& state, const char* net) {
631     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4, 5, 5, 2, 2);
632   }
dwconv2d_chw_5x5s2p2__sse_3x4(benchmark::State & state,const char * net)633   static void dwconv2d_chw_5x5s2p2__sse_3x4(benchmark::State& state, const char* net) {
634     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4, 5, 5, 2, 2);
635   }
dwconv2d_chw_5x5s2p2__sse_1x4_acc2(benchmark::State & state,const char * net)636   static void dwconv2d_chw_5x5s2p2__sse_1x4_acc2(benchmark::State& state, const char* net) {
637     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc2, 5, 5, 2, 2);
638   }
dwconv2d_chw_5x5s2p2__sse_1x4_acc3(benchmark::State & state,const char * net)639   static void dwconv2d_chw_5x5s2p2__sse_1x4_acc3(benchmark::State& state, const char* net) {
640     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc3, 5, 5, 2, 2);
641   }
dwconv2d_chw_5x5s2p2__sse_1x4_acc4(benchmark::State & state,const char * net)642   static void dwconv2d_chw_5x5s2p2__sse_1x4_acc4(benchmark::State& state, const char* net) {
643     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc4, 5, 5, 2, 2);
644   }
dwconv2d_chw_5x5s2p2__sse_1x4_acc5(benchmark::State & state,const char * net)645   static void dwconv2d_chw_5x5s2p2__sse_1x4_acc5(benchmark::State& state, const char* net) {
646     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc5, 5, 5, 2, 2);
647   }
dwconv2d_chw_5x5s2p2__sse_2x4_acc2(benchmark::State & state,const char * net)648   static void dwconv2d_chw_5x5s2p2__sse_2x4_acc2(benchmark::State& state, const char* net) {
649     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2, 5, 5, 2, 2);
650   }
dwconv2d_chw_5x5s2p2__sse_2x4_acc3(benchmark::State & state,const char * net)651   static void dwconv2d_chw_5x5s2p2__sse_2x4_acc3(benchmark::State& state, const char* net) {
652     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3, 5, 5, 2, 2);
653   }
dwconv2d_chw_5x5s2p2__sse_3x4_acc2(benchmark::State & state,const char * net)654   static void dwconv2d_chw_5x5s2p2__sse_3x4_acc2(benchmark::State& state, const char* net) {
655     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2, 5, 5, 2, 2);
656   }
658   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_2x4)659   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_2x4)
660   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_3x4)
661   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_4x4)
662   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_5x4)
663   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_6x4)
664   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4_acc2)
665   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4_acc3)
666   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4_acc4)
667   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_2x4_acc2)
668 
669   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4)
670   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_2x4)
671   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_3x4)
672   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_4x4)
673   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_5x4)
674   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_6x4)
675   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4_acc2)
676   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4_acc3)
677   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4_acc4)
678   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_2x4_acc2)
679 
680   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4)
681   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_2x4)
682   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_3x4)
683   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_4x4)
684   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4_acc2)
685   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4_acc3)
686   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4_acc4)
687   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_2x4_acc2)
688 
689   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4)
690   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_2x4)
691   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_3x4)
692   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_4x4)
693   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_5x4)
694   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc2)
695   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc3)
696   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc4)
697   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc5)
698   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_2x4_acc2)
699   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_2x4_acc3)
700   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_3x4_acc2)
701   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_4x4_acc2)
702 
703   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4)
704   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_2x4)
705   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_3x4)
706   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc2)
707   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc3)
708   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc4)
709   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc5)
710   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_2x4_acc2)
711   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_2x4_acc3)
712   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_3x4_acc2)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
716   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
717     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4, 3, 3, 1, 1);
718   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)719   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
720     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4, 3, 3, 1, 1);
721   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)722   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
723     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4, 3, 3, 1, 1);
724   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_4x4(benchmark::State & state,const char * net)725   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_4x4(benchmark::State& state, const char* net) {
726     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4, 3, 3, 1, 1);
727   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_5x4(benchmark::State & state,const char * net)728   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_5x4(benchmark::State& state, const char* net) {
729     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4, 3, 3, 1, 1);
730   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_6x4(benchmark::State & state,const char * net)731   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_6x4(benchmark::State& state, const char* net) {
732     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4, 3, 3, 1, 1);
733   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)734   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
735     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2, 3, 3, 1, 1);
736   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)737   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
738     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3, 3, 3, 1, 1);
739   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)740   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
741     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc4, 3, 3, 1, 1);
742   }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)743   static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
744     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2, 3, 3, 1, 1);
745   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)746   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
747     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4, 3, 3, 1, 1);
748   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)749   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
750     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4, 3, 3, 1, 1);
751   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)752   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
753     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4, 3, 3, 1, 1);
754   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_4x4(benchmark::State & state,const char * net)755   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_4x4(benchmark::State& state, const char* net) {
756     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4, 3, 3, 1, 1);
757   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_5x4(benchmark::State & state,const char * net)758   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_5x4(benchmark::State& state, const char* net) {
759     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4, 3, 3, 1, 1);
760   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_6x4(benchmark::State & state,const char * net)761   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_6x4(benchmark::State& state, const char* net) {
762     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4, 3, 3, 1, 1);
763   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)764   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
765     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2, 3, 3, 1, 1);
766   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)767   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
768     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3, 3, 3, 1, 1);
769   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)770   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
771     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4, 3, 3, 1, 1);
772   }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State & state,const char * net)773   static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
774     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2, 3, 3, 1, 1);
775   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4(benchmark::State & state,const char * net)777   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4(benchmark::State& state, const char* net) {
778     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4, 3, 3, 1, 1);
779   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4(benchmark::State & state,const char * net)780   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4(benchmark::State& state, const char* net) {
781     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4, 3, 3, 1, 1);
782   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_3x4(benchmark::State & state,const char * net)783   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_3x4(benchmark::State& state, const char* net) {
784     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4, 3, 3, 1, 1);
785   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_4x4(benchmark::State & state,const char * net)786   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_4x4(benchmark::State& state, const char* net) {
787     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4, 3, 3, 1, 1);
788   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_5x4(benchmark::State & state,const char * net)789   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_5x4(benchmark::State& state, const char* net) {
790     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4, 3, 3, 1, 1);
791   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_6x4(benchmark::State & state,const char * net)792   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_6x4(benchmark::State& state, const char* net) {
793     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4, 3, 3, 1, 1);
794   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State & state,const char * net)795   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State& state, const char* net) {
796     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2, 3, 3, 1, 1);
797   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State & state,const char * net)798   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State& state, const char* net) {
799     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3, 3, 3, 1, 1);
800   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State & state,const char * net)801   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State& state, const char* net) {
802     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4, 3, 3, 1, 1);
803   }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State & state,const char * net)804   static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State& state, const char* net) {
805     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2, 3, 3, 1, 1);
806   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4(benchmark::State & state,const char * net)807   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4(benchmark::State& state, const char* net) {
808     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4, 3, 3, 1, 1);
809   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4(benchmark::State & state,const char * net)810   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4(benchmark::State& state, const char* net) {
811     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4, 3, 3, 1, 1);
812   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_3x4(benchmark::State & state,const char * net)813   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_3x4(benchmark::State& state, const char* net) {
814     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4, 3, 3, 1, 1);
815   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_4x4(benchmark::State & state,const char * net)816   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_4x4(benchmark::State& state, const char* net) {
817     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4, 3, 3, 1, 1);
818   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_5x4(benchmark::State & state,const char * net)819   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_5x4(benchmark::State& state, const char* net) {
820     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4, 3, 3, 1, 1);
821   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_6x4(benchmark::State & state,const char * net)822   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_6x4(benchmark::State& state, const char* net) {
823     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4, 3, 3, 1, 1);
824   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State & state,const char * net)825   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State& state, const char* net) {
826     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2, 3, 3, 1, 1);
827   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State & state,const char * net)828   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State& state, const char* net) {
829     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3, 3, 3, 1, 1);
830   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State & state,const char * net)831   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State& state, const char* net) {
832     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4, 3, 3, 1, 1);
833   }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State & state,const char * net)834   static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State& state, const char* net) {
835     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2, 3, 3, 1, 1);
836   }
837 
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4(benchmark::State & state,const char * net)838   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
839     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4, 3, 3, 1, 2);
840   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)841   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
842     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_2x4, 3, 3, 1, 2);
843   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)844   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
845     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4, 3, 3, 1, 2);
846   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_4x4(benchmark::State & state,const char * net)847   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_4x4(benchmark::State& state, const char* net) {
848     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4, 3, 3, 1, 2);
849   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)850   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
851     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc2, 3, 3, 1, 2);
852   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)853   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
854     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc3, 3, 3, 1, 2);
855   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)856   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
857     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc4, 3, 3, 1, 2);
858   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)859   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
860     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_2x4_acc2, 3, 3, 1, 2);
861   }
862 
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)863   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
864     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4, 3, 3, 1, 2);
865   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)866   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
867     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_2x4, 3, 3, 1, 2);
868   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)869   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
870     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4, 3, 3, 1, 2);
871   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_4x4(benchmark::State & state,const char * net)872   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_4x4(benchmark::State& state, const char* net) {
873     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4, 3, 3, 1, 2);
874   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)875   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
876     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc2, 3, 3, 1, 2);
877   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)878   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
879     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc3, 3, 3, 1, 2);
880   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)881   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
882     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc4, 3, 3, 1, 2);
883   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State & state,const char * net)884   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
885     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_2x4_acc2, 3, 3, 1, 2);
886   }
887 
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4(benchmark::State & state,const char * net)888   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4(benchmark::State& state, const char* net) {
889     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4, 3, 3, 1, 2);
890   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4(benchmark::State & state,const char * net)891   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4(benchmark::State& state, const char* net) {
892     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4, 3, 3, 1, 2);
893   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_3x4(benchmark::State & state,const char * net)894   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_3x4(benchmark::State& state, const char* net) {
895     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4, 3, 3, 1, 2);
896   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_4x4(benchmark::State & state,const char * net)897   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_4x4(benchmark::State& state, const char* net) {
898     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4, 3, 3, 1, 2);
899   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State & state,const char * net)900   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State& state, const char* net) {
901     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc2, 3, 3, 1, 2);
902   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State & state,const char * net)903   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State& state, const char* net) {
904     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc3, 3, 3, 1, 2);
905   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State & state,const char * net)906   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State& state, const char* net) {
907     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4, 3, 3, 1, 2);
908   }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State & state,const char * net)909   static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State& state, const char* net) {
910     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2, 3, 3, 1, 2);
911   }
912 
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4(benchmark::State & state,const char * net)913   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4(benchmark::State& state, const char* net) {
914     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4, 3, 3, 1, 2);
915   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4(benchmark::State & state,const char * net)916   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4(benchmark::State& state, const char* net) {
917     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4, 3, 3, 1, 2);
918   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_3x4(benchmark::State & state,const char * net)919   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_3x4(benchmark::State& state, const char* net) {
920     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4, 3, 3, 1, 2);
921   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_4x4(benchmark::State & state,const char * net)922   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_4x4(benchmark::State& state, const char* net) {
923     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4, 3, 3, 1, 2);
924   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State & state,const char * net)925   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State& state, const char* net) {
926     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2, 3, 3, 1, 2);
927   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State & state,const char * net)928   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State& state, const char* net) {
929     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc3, 3, 3, 1, 2);
930   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State & state,const char * net)931   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State& state, const char* net) {
932     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc4, 3, 3, 1, 2);
933   }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State & state,const char * net)934   static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State& state, const char* net) {
935     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2, 3, 3, 1, 2);
936   }
937 
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4(benchmark::State & state,const char * net)938   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
939     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4, 5, 5, 2, 1);
940   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)941   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
942     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4, 5, 5, 2, 1);
943   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)944   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
945     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4, 5, 5, 2, 1);
946   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4(benchmark::State & state,const char * net)947   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4(benchmark::State& state, const char* net) {
948     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4, 5, 5, 2, 1);
949   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_5x4(benchmark::State & state,const char * net)950   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_5x4(benchmark::State& state, const char* net) {
951     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4, 5, 5, 2, 1);
952   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)953   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
954     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2, 5, 5, 2, 1);
955   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)956   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
957     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3, 5, 5, 2, 1);
958   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)959   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
960     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4, 5, 5, 2, 1);
961   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State & state,const char * net)962   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State& state, const char* net) {
963     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5, 5, 5, 2, 1);
964   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)965   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
966     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2, 5, 5, 2, 1);
967   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State & state,const char * net)968   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State& state, const char* net) {
969     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3, 5, 5, 2, 1);
970   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State & state,const char * net)971   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State& state, const char* net) {
972     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2, 5, 5, 2, 1);
973   }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2(benchmark::State & state,const char * net)974   static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2(benchmark::State& state, const char* net) {
975     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2, 5, 5, 2, 1);
976   }
977 
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)978   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
979     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4, 5, 5, 2, 1);
980   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)981   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
982     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4, 5, 5, 2, 1);
983   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)984   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
985     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4, 5, 5, 2, 1);
986   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4(benchmark::State & state,const char * net)987   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4(benchmark::State& state, const char* net) {
988     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4, 5, 5, 2, 1);
989   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_5x4(benchmark::State & state,const char * net)990   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_5x4(benchmark::State& state, const char* net) {
991     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4, 5, 5, 2, 1);
992   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)993   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
994     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2, 5, 5, 2, 1);
995   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)996   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
997     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3, 5, 5, 2, 1);
998   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)999   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
1000     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4, 5, 5, 2, 1);
1001   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5(benchmark::State & state,const char * net)1002   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5(benchmark::State& state, const char* net) {
1003     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5, 5, 5, 2, 1);
1004   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State & state,const char * net)1005   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
1006     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2, 5, 5, 2, 1);
1007   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3(benchmark::State & state,const char * net)1008   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3(benchmark::State& state, const char* net) {
1009     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3, 5, 5, 2, 1);
1010   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2(benchmark::State & state,const char * net)1011   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2(benchmark::State& state, const char* net) {
1012     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2, 5, 5, 2, 1);
1013   }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2(benchmark::State & state,const char * net)1014   static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2(benchmark::State& state, const char* net) {
1015     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2, 5, 5, 2, 1);
1016   }
1017 
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4(benchmark::State & state,const char * net)1018   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4(benchmark::State& state, const char* net) {
1019     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4, 5, 5, 2, 1);
1020   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4(benchmark::State & state,const char * net)1021   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4(benchmark::State& state, const char* net) {
1022     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4, 5, 5, 2, 1);
1023   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4(benchmark::State & state,const char * net)1024   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4(benchmark::State& state, const char* net) {
1025     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4, 5, 5, 2, 1);
1026   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4(benchmark::State & state,const char * net)1027   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4(benchmark::State& state, const char* net) {
1028     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4, 5, 5, 2, 1);
1029   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_5x4(benchmark::State & state,const char * net)1030   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_5x4(benchmark::State& state, const char* net) {
1031     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4, 5, 5, 2, 1);
1032   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc2(benchmark::State & state,const char * net)1033   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc2(benchmark::State& state, const char* net) {
1034     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2, 5, 5, 2, 1);
1035   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc3(benchmark::State & state,const char * net)1036   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc3(benchmark::State& state, const char* net) {
1037     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3, 5, 5, 2, 1);
1038   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc4(benchmark::State & state,const char * net)1039   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc4(benchmark::State& state, const char* net) {
1040     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4, 5, 5, 2, 1);
1041   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc5(benchmark::State & state,const char * net)1042   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc5(benchmark::State& state, const char* net) {
1043     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc5, 5, 5, 2, 1);
1044   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc2(benchmark::State & state,const char * net)1045   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc2(benchmark::State& state, const char* net) {
1046     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2, 5, 5, 2, 1);
1047   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc3(benchmark::State & state,const char * net)1048   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc3(benchmark::State& state, const char* net) {
1049     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3, 5, 5, 2, 1);
1050   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4_acc2(benchmark::State & state,const char * net)1051   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4_acc2(benchmark::State& state, const char* net) {
1052     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2, 5, 5, 2, 1);
1053   }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4_acc2(benchmark::State & state,const char * net)1054   static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4_acc2(benchmark::State& state, const char* net) {
1055     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2, 5, 5, 2, 1);
1056   }
1057 
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4(benchmark::State & state,const char * net)1058   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4(benchmark::State& state, const char* net) {
1059     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4, 5, 5, 2, 1);
1060   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4(benchmark::State & state,const char * net)1061   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4(benchmark::State& state, const char* net) {
1062     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4, 5, 5, 2, 1);
1063   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4(benchmark::State & state,const char * net)1064   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4(benchmark::State& state, const char* net) {
1065     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4, 5, 5, 2, 1);
1066   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4(benchmark::State & state,const char * net)1067   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4(benchmark::State& state, const char* net) {
1068     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4, 5, 5, 2, 1);
1069   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_5x4(benchmark::State & state,const char * net)1070   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_5x4(benchmark::State& state, const char* net) {
1071     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4, 5, 5, 2, 1);
1072   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc2(benchmark::State & state,const char * net)1073   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc2(benchmark::State& state, const char* net) {
1074     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2, 5, 5, 2, 1);
1075   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc3(benchmark::State & state,const char * net)1076   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc3(benchmark::State& state, const char* net) {
1077     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3, 5, 5, 2, 1);
1078   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc4(benchmark::State & state,const char * net)1079   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc4(benchmark::State& state, const char* net) {
1080     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4, 5, 5, 2, 1);
1081   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc5(benchmark::State & state,const char * net)1082   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc5(benchmark::State& state, const char* net) {
1083     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc5, 5, 5, 2, 1);
1084   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc2(benchmark::State & state,const char * net)1085   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc2(benchmark::State& state, const char* net) {
1086     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2, 5, 5, 2, 1);
1087   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc3(benchmark::State & state,const char * net)1088   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc3(benchmark::State& state, const char* net) {
1089     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3, 5, 5, 2, 1);
1090   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4_acc2(benchmark::State & state,const char * net)1091   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4_acc2(benchmark::State& state, const char* net) {
1092     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2, 5, 5, 2, 1);
1093   }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4_acc2(benchmark::State & state,const char * net)1094   static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4_acc2(benchmark::State& state, const char* net) {
1095     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2, 5, 5, 2, 1);
1096   }
1097 
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4(benchmark::State & state,const char * net)1098   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
1099     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4, 5, 5, 2, 2);
1100   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)1101   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
1102     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4, 5, 5, 2, 2);
1103   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)1104   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
1105     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4, 5, 5, 2, 2);
1106   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)1107   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
1108     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2, 5, 5, 2, 2);
1109   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)1110   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
1111     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3, 5, 5, 2, 2);
1112   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)1113   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
1114     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc4, 5, 5, 2, 2);
1115   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State & state,const char * net)1116   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State& state, const char* net) {
1117     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc5, 5, 5, 2, 2);
1118   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)1119   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
1120     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2, 5, 5, 2, 2);
1121   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State & state,const char * net)1122   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State& state, const char* net) {
1123     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3, 5, 5, 2, 2);
1124   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State & state,const char * net)1125   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State& state, const char* net) {
1126     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2, 5, 5, 2, 2);
1127   }
1128 
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)1129   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
1130     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4, 5, 5, 2, 2);
1131   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)1132   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
1133     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4, 5, 5, 2, 2);
1134   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)1135   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
1136     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4, 5, 5, 2, 2);
1137   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)1138   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
1139     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2, 5, 5, 2, 2);
1140   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)1141   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
1142     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3, 5, 5, 2, 2);
1143   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)1144   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
1145     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc4, 5, 5, 2, 2);
1146   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc5(benchmark::State & state,const char * net)1147   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc5(benchmark::State& state, const char* net) {
1148     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc5, 5, 5, 2, 2);
1149   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State & state,const char * net)1150   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
1151     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2, 5, 5, 2, 2);
1152   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3(benchmark::State & state,const char * net)1153   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3(benchmark::State& state, const char* net) {
1154     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3, 5, 5, 2, 2);
1155   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2(benchmark::State & state,const char * net)1156   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2(benchmark::State& state, const char* net) {
1157     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2, 5, 5, 2, 2);
1158   }
1159 
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4(benchmark::State & state,const char * net)1160   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4(benchmark::State& state, const char* net) {
1161     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4, 5, 5, 2, 2);
1162   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4(benchmark::State & state,const char * net)1163   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4(benchmark::State& state, const char* net) {
1164     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4, 5, 5, 2, 2);
1165   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_3x4(benchmark::State & state,const char * net)1166   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_3x4(benchmark::State& state, const char* net) {
1167     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4, 5, 5, 2, 2);
1168   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc2(benchmark::State & state,const char * net)1169   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc2(benchmark::State& state, const char* net) {
1170     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2, 5, 5, 2, 2);
1171   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc3(benchmark::State & state,const char * net)1172   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc3(benchmark::State& state, const char* net) {
1173     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3, 5, 5, 2, 2);
1174   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc4(benchmark::State & state,const char * net)1175   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc4(benchmark::State& state, const char* net) {
1176     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc4, 5, 5, 2, 2);
1177   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc5(benchmark::State & state,const char * net)1178   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc5(benchmark::State& state, const char* net) {
1179     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc5, 5, 5, 2, 2);
1180   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4_acc2(benchmark::State & state,const char * net)1181   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4_acc2(benchmark::State& state, const char* net) {
1182     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2, 5, 5, 2, 2);
1183   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4_acc3(benchmark::State & state,const char * net)1184   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4_acc3(benchmark::State& state, const char* net) {
1185     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3, 5, 5, 2, 2);
1186   }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_3x4_acc2(benchmark::State & state,const char * net)1187   static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_3x4_acc2(benchmark::State& state, const char* net) {
1188     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2, 5, 5, 2, 2);
1189   }
1190 
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4(benchmark::State & state,const char * net)1191   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4(benchmark::State& state, const char* net) {
1192     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4, 5, 5, 2, 2);
1193   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4(benchmark::State & state,const char * net)1194   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4(benchmark::State& state, const char* net) {
1195     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4, 5, 5, 2, 2);
1196   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_3x4(benchmark::State & state,const char * net)1197   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_3x4(benchmark::State& state, const char* net) {
1198     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4, 5, 5, 2, 2);
1199   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc2(benchmark::State & state,const char * net)1200   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc2(benchmark::State& state, const char* net) {
1201     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2, 5, 5, 2, 2);
1202   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc3(benchmark::State & state,const char * net)1203   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc3(benchmark::State& state, const char* net) {
1204     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3, 5, 5, 2, 2);
1205   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc4(benchmark::State & state,const char * net)1206   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc4(benchmark::State& state, const char* net) {
1207     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc4, 5, 5, 2, 2);
1208   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc5(benchmark::State & state,const char * net)1209   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc5(benchmark::State& state, const char* net) {
1210     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc5, 5, 5, 2, 2);
1211   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4_acc2(benchmark::State & state,const char * net)1212   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4_acc2(benchmark::State& state, const char* net) {
1213     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2, 5, 5, 2, 2);
1214   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4_acc3(benchmark::State & state,const char * net)1215   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4_acc3(benchmark::State& state, const char* net) {
1216     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3, 5, 5, 2, 2);
1217   }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_3x4_acc2(benchmark::State & state,const char * net)1218   static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_3x4_acc2(benchmark::State& state, const char* net) {
1219     f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2, 5, 5, 2, 2);
1220   }
1221 
1222   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4)1223   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4)
1224   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_3x4)
1225   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_4x4)
1226   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_5x4)
1227   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_6x4)
1228   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2)
1229   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3)
1230   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc4)
1231   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2)
1232 
1233   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4)
1234   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4)
1235   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_3x4)
1236   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_4x4)
1237   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_5x4)
1238   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_6x4)
1239   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2)
1240   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3)
1241   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4)
1242   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2)
1243 
1244   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4)
1245   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4)
1246   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_3x4)
1247   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_4x4)
1248   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_5x4)
1249   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_6x4)
1250   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc2)
1251   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc3)
1252   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc4)
1253   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4_acc2)
1254 
1255   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4)
1256   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4)
1257   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_3x4)
1258   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_4x4)
1259   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_5x4)
1260   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_6x4)
1261   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc2)
1262   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc3)
1263   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc4)
1264   BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4_acc2)
1265 
1266   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4)
1267   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4)
1268   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_3x4)
1269   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_4x4)
1270   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc2)
1271   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc3)
1272   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc4)
1273   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4_acc2)
1274 
1275   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4)
1276   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4)
1277   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_3x4)
1278   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_4x4)
1279   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc2)
1280   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc3)
1281   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc4)
1282   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4_acc2)
1283 
1284   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4)
1285   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4)
1286   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_3x4)
1287   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_4x4)
1288   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc2)
1289   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc3)
1290   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc4)
1291   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4_acc2)
1292 
1293   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4)
1294   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4)
1295   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_3x4)
1296   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_4x4)
1297   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc2)
1298   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc3)
1299   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc4)
1300   BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4_acc2)
1301 
1302   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4)
1303   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4)
1304   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4)
1305   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4)
1306   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_5x4)
1307   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2)
1308   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3)
1309   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4)
1310   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5)
1311   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2)
1312   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3)
1313   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2)
1314   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2)
1315 
1316   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4)
1317   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4)
1318   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4)
1319   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4)
1320   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_5x4)
1321   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2)
1322   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3)
1323   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4)
1324   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5)
1325   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2)
1326   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3)
1327   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2)
1328   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2)
1329 
1330   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4)
1331   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4)
1332   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4)
1333   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4)
1334   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_5x4)
1335   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc2)
1336   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc3)
1337   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc4)
1338   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc5)
1339   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc2)
1340   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc3)
1341   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4_acc2)
1342   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4_acc2)
1343 
1344   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4)
1345   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4)
1346   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4)
1347   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4)
1348   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_5x4)
1349   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc2)
1350   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc3)
1351   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc4)
1352   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc5)
1353   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc2)
1354   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc3)
1355   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4_acc2)
1356   BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4_acc2)
1357 
1358   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4)
1359   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4)
1360   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4)
1361   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2)
1362   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3)
1363   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc4)
1364   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc5)
1365   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2)
1366   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3)
1367   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2)
1368 
1369   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4)
1370   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4)
1371   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4)
1372   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2)
1373   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3)
1374   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc4)
1375   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc5)
1376   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2)
1377   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3)
1378   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2)
1379 
1380   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4)
1381   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4)
1382   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_3x4)
1383   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc2)
1384   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc3)
1385   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc4)
1386   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_1x4_acc5)
1387   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4_acc2)
1388   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_2x4_acc3)
1389   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_arm_splat_3x4_acc2)
1390 
1391   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4)
1392   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4)
1393   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_3x4)
1394   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc2)
1395   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc3)
1396   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc4)
1397   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_1x4_acc5)
1398   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4_acc2)
1399   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_2x4_acc3)
1400   BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__wasmsimd_x86_splat_3x4_acc2)
1401 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1402 
1403 static void dwconv2d_chw_3x3p1__scalar_1x1(benchmark::State& state, const char* net) {
1404   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1, 3, 3, 1, 1);
1405 }
dwconv2d_chw_3x3p1__scalar_2x1(benchmark::State & state,const char * net)1406 static void dwconv2d_chw_3x3p1__scalar_2x1(benchmark::State& state, const char* net) {
1407   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1, 3, 3, 1, 1);
1408 }
dwconv2d_chw_3x3p1__scalar_3x1(benchmark::State & state,const char * net)1409 static void dwconv2d_chw_3x3p1__scalar_3x1(benchmark::State& state, const char* net) {
1410   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1, 3, 3, 1, 1);
1411 }
dwconv2d_chw_3x3p1__scalar_4x1(benchmark::State & state,const char * net)1412 static void dwconv2d_chw_3x3p1__scalar_4x1(benchmark::State& state, const char* net) {
1413   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1, 3, 3, 1, 1);
1414 }
dwconv2d_chw_3x3p1__scalar_5x1(benchmark::State & state,const char * net)1415 static void dwconv2d_chw_3x3p1__scalar_5x1(benchmark::State& state, const char* net) {
1416   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1, 3, 3, 1, 1);
1417 }
dwconv2d_chw_3x3p1__scalar_6x1(benchmark::State & state,const char * net)1418 static void dwconv2d_chw_3x3p1__scalar_6x1(benchmark::State& state, const char* net) {
1419   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1, 3, 3, 1, 1);
1420 }
dwconv2d_chw_3x3p1__scalar_1x1_acc2(benchmark::State & state,const char * net)1421 static void dwconv2d_chw_3x3p1__scalar_1x1_acc2(benchmark::State& state, const char* net) {
1422   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2, 3, 3, 1, 1);
1423 }
dwconv2d_chw_3x3p1__scalar_1x1_acc3(benchmark::State & state,const char * net)1424 static void dwconv2d_chw_3x3p1__scalar_1x1_acc3(benchmark::State& state, const char* net) {
1425   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3, 3, 3, 1, 1);
1426 }
dwconv2d_chw_3x3p1__scalar_1x1_acc4(benchmark::State & state,const char * net)1427 static void dwconv2d_chw_3x3p1__scalar_1x1_acc4(benchmark::State& state, const char* net) {
1428   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4, 3, 3, 1, 1);
1429 }
dwconv2d_chw_3x3p1__scalar_2x1_acc2(benchmark::State & state,const char * net)1430 static void dwconv2d_chw_3x3p1__scalar_2x1_acc2(benchmark::State& state, const char* net) {
1431   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2, 3, 3, 1, 1);
1432 }
1433 
dwconv2d_chw_3x3s2p1__scalar_1x1(benchmark::State & state,const char * net)1434 static void dwconv2d_chw_3x3s2p1__scalar_1x1(benchmark::State& state, const char* net) {
1435   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1, 3, 3, 1, 2);
1436 }
dwconv2d_chw_3x3s2p1__scalar_2x1(benchmark::State & state,const char * net)1437 static void dwconv2d_chw_3x3s2p1__scalar_2x1(benchmark::State& state, const char* net) {
1438   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1, 3, 3, 1, 2);
1439 }
dwconv2d_chw_3x3s2p1__scalar_3x1(benchmark::State & state,const char * net)1440 static void dwconv2d_chw_3x3s2p1__scalar_3x1(benchmark::State& state, const char* net) {
1441   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1, 3, 3, 1, 2);
1442 }
dwconv2d_chw_3x3s2p1__scalar_4x1(benchmark::State & state,const char * net)1443 static void dwconv2d_chw_3x3s2p1__scalar_4x1(benchmark::State& state, const char* net) {
1444   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1, 3, 3, 1, 2);
1445 }
dwconv2d_chw_3x3s2p1__scalar_1x1_acc2(benchmark::State & state,const char * net)1446 static void dwconv2d_chw_3x3s2p1__scalar_1x1_acc2(benchmark::State& state, const char* net) {
1447   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2, 3, 3, 1, 2);
1448 }
dwconv2d_chw_3x3s2p1__scalar_1x1_acc3(benchmark::State & state,const char * net)1449 static void dwconv2d_chw_3x3s2p1__scalar_1x1_acc3(benchmark::State& state, const char* net) {
1450   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc3, 3, 3, 1, 2);
1451 }
dwconv2d_chw_3x3s2p1__scalar_1x1_acc4(benchmark::State & state,const char * net)1452 static void dwconv2d_chw_3x3s2p1__scalar_1x1_acc4(benchmark::State& state, const char* net) {
1453   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc4, 3, 3, 1, 2);
1454 }
dwconv2d_chw_3x3s2p1__scalar_2x1_acc2(benchmark::State & state,const char * net)1455 static void dwconv2d_chw_3x3s2p1__scalar_2x1_acc2(benchmark::State& state, const char* net) {
1456   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2, 3, 3, 1, 2);
1457 }
1458 
dwconv2d_chw_5x5p2__scalar_1x1(benchmark::State & state,const char * net)1459 static void dwconv2d_chw_5x5p2__scalar_1x1(benchmark::State& state, const char* net) {
1460   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1, 5, 5, 2, 1);
1461 }
dwconv2d_chw_5x5p2__scalar_2x1(benchmark::State & state,const char * net)1462 static void dwconv2d_chw_5x5p2__scalar_2x1(benchmark::State& state, const char* net) {
1463   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1, 5, 5, 2, 1);
1464 }
dwconv2d_chw_5x5p2__scalar_3x1(benchmark::State & state,const char * net)1465 static void dwconv2d_chw_5x5p2__scalar_3x1(benchmark::State& state, const char* net) {
1466   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1, 5, 5, 2, 1);
1467 }
dwconv2d_chw_5x5p2__scalar_1x1_acc2(benchmark::State & state,const char * net)1468 static void dwconv2d_chw_5x5p2__scalar_1x1_acc2(benchmark::State& state, const char* net) {
1469   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2, 5, 5, 2, 1);
1470 }
dwconv2d_chw_5x5p2__scalar_1x1_acc3(benchmark::State & state,const char * net)1471 static void dwconv2d_chw_5x5p2__scalar_1x1_acc3(benchmark::State& state, const char* net) {
1472   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3, 5, 5, 2, 1);
1473 }
dwconv2d_chw_5x5p2__scalar_1x1_acc4(benchmark::State & state,const char * net)1474 static void dwconv2d_chw_5x5p2__scalar_1x1_acc4(benchmark::State& state, const char* net) {
1475   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4, 5, 5, 2, 1);
1476 }
dwconv2d_chw_5x5p2__scalar_1x1_acc5(benchmark::State & state,const char * net)1477 static void dwconv2d_chw_5x5p2__scalar_1x1_acc5(benchmark::State& state, const char* net) {
1478   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5, 5, 5, 2, 1);
1479 }
dwconv2d_chw_5x5p2__scalar_2x1_acc2(benchmark::State & state,const char * net)1480 static void dwconv2d_chw_5x5p2__scalar_2x1_acc2(benchmark::State& state, const char* net) {
1481   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2, 5, 5, 2, 1);
1482 }
dwconv2d_chw_5x5p2__scalar_2x1_acc3(benchmark::State & state,const char * net)1483 static void dwconv2d_chw_5x5p2__scalar_2x1_acc3(benchmark::State& state, const char* net) {
1484   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3, 5, 5, 2, 1);
1485 }
dwconv2d_chw_5x5p2__scalar_3x1_acc2(benchmark::State & state,const char * net)1486 static void dwconv2d_chw_5x5p2__scalar_3x1_acc2(benchmark::State& state, const char* net) {
1487   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2, 5, 5, 2, 1);
1488 }
1489 
dwconv2d_chw_5x5s2p2__scalar_1x1(benchmark::State & state,const char * net)1490 static void dwconv2d_chw_5x5s2p2__scalar_1x1(benchmark::State& state, const char* net) {
1491   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1, 5, 5, 2, 2);
1492 }
dwconv2d_chw_5x5s2p2__scalar_2x1(benchmark::State & state,const char * net)1493 static void dwconv2d_chw_5x5s2p2__scalar_2x1(benchmark::State& state, const char* net) {
1494   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1, 5, 5, 2, 2);
1495 }
dwconv2d_chw_5x5s2p2__scalar_3x1(benchmark::State & state,const char * net)1496 static void dwconv2d_chw_5x5s2p2__scalar_3x1(benchmark::State& state, const char* net) {
1497   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1, 5, 5, 2, 2);
1498 }
dwconv2d_chw_5x5s2p2__scalar_1x1_acc2(benchmark::State & state,const char * net)1499 static void dwconv2d_chw_5x5s2p2__scalar_1x1_acc2(benchmark::State& state, const char* net) {
1500   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2, 5, 5, 2, 2);
1501 }
dwconv2d_chw_5x5s2p2__scalar_1x1_acc3(benchmark::State & state,const char * net)1502 static void dwconv2d_chw_5x5s2p2__scalar_1x1_acc3(benchmark::State& state, const char* net) {
1503   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3, 5, 5, 2, 2);
1504 }
dwconv2d_chw_5x5s2p2__scalar_1x1_acc4(benchmark::State & state,const char * net)1505 static void dwconv2d_chw_5x5s2p2__scalar_1x1_acc4(benchmark::State& state, const char* net) {
1506   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4, 5, 5, 2, 2);
1507 }
dwconv2d_chw_5x5s2p2__scalar_1x1_acc5(benchmark::State & state,const char * net)1508 static void dwconv2d_chw_5x5s2p2__scalar_1x1_acc5(benchmark::State& state, const char* net) {
1509   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5, 5, 5, 2, 2);
1510 }
dwconv2d_chw_5x5s2p2__scalar_2x1_acc2(benchmark::State & state,const char * net)1511 static void dwconv2d_chw_5x5s2p2__scalar_2x1_acc2(benchmark::State& state, const char* net) {
1512   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2, 5, 5, 2, 2);
1513 }
dwconv2d_chw_5x5s2p2__scalar_2x1_acc3(benchmark::State & state,const char * net)1514 static void dwconv2d_chw_5x5s2p2__scalar_2x1_acc3(benchmark::State& state, const char* net) {
1515   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3, 5, 5, 2, 2);
1516 }
dwconv2d_chw_5x5s2p2__scalar_3x1_acc2(benchmark::State & state,const char * net)1517 static void dwconv2d_chw_5x5s2p2__scalar_3x1_acc2(benchmark::State& state, const char* net) {
1518   f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2, 5, 5, 2, 2);
1519 }
1520 
1521 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_1x1)
1522 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_2x1)
1523 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_3x1)
1524 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_4x1)
1525 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_5x1)
1526 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_6x1)
1527 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_1x1_acc2)
1528 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_1x1_acc3)
1529 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_1x1_acc4)
1530 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_2x1_acc2)
1531 
1532 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_1x1)
1533 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_2x1)
1534 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_3x1)
1535 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_4x1)
1536 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_1x1_acc2)
1537 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_1x1_acc3)
1538 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_1x1_acc4)
1539 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_2x1_acc2)
1540 
1541 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_1x1)
1542 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_2x1)
1543 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_3x1)
1544 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_1x1_acc2)
1545 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_1x1_acc3)
1546 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_1x1_acc4)
1547 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_1x1_acc5)
1548 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_2x1_acc2)
1549 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_2x1_acc3)
1550 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_3x1_acc2)
1551 
1552 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_1x1)
1553 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_2x1)
1554 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_3x1)
1555 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_1x1_acc2)
1556 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_1x1_acc3)
1557 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_1x1_acc4)
1558 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_1x1_acc5)
1559 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_2x1_acc2)
1560 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_2x1_acc3)
1561 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_3x1_acc2)
1562 
1563 #ifndef XNNPACK_BENCHMARK_NO_MAIN
1564 BENCHMARK_MAIN();
1565 #endif
1566