1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/dwconv.h"
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/indirection.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
26
27
f32_dwconv2d_chw(benchmark::State & state,xnn_f32_dwconv2d_chw_ukernel_function dwconv,uint32_t kh,uint32_t kw,uint32_t pw,uint32_t s,benchmark::utils::IsaCheckFunction isa_check=nullptr)28 static void f32_dwconv2d_chw(benchmark::State& state,
29 xnn_f32_dwconv2d_chw_ukernel_function dwconv,
30 uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s,
31 benchmark::utils::IsaCheckFunction isa_check = nullptr)
32 {
33 if (isa_check && !isa_check(state)) {
34 return;
35 }
36
37 const size_t input_height = state.range(0);
38 const size_t input_width = state.range(1);
39 const size_t kernel_height = state.range(2);
40 const size_t kernel_width = state.range(3);
41 const size_t padding_height = state.range(4);
42 const size_t padding_width = state.range(5);
43 const size_t subsampling = state.range(6);
44 const size_t dilation = state.range(7);
45 const size_t channels = state.range(8);
46
47 if (kernel_height != kh) {
48 state.SkipWithError("kernel height mismatch");
49 return;
50 }
51
52 if (kernel_width != kw) {
53 state.SkipWithError("kernel width mismatch");
54 return;
55 }
56
57 if (subsampling != s) {
58 state.SkipWithError("subsampling mismatch");
59 return;
60 }
61
62 if (padding_width % 2 != 0 || padding_width / 2 != pw) {
63 state.SkipWithError("padding width mismatch");
64 return;
65 }
66
67 if (dilation != 1) {
68 state.SkipWithError("unsupported dilation");
69 return;
70 }
71
72 std::random_device random_device;
73 auto rng = std::mt19937(random_device());
74 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
75
76 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
77 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
78 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
79 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
80
81 const size_t inputSize = (input_height + padding_height) * input_width;
82 const size_t kernel_size = kernel_height * kernel_width;
83 const size_t output_size = output_height * output_width;
84
85 std::vector<float> input(inputSize * channels + 2 * XNN_EXTRA_BYTES);
86 std::generate(input.begin(), input.end(), std::ref(f32rng));
87 std::vector<float> bias(channels);
88 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
89 std::vector<float> kernel(channels * kernel_size);
90 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
91 std::vector<float> zero(input_width + padding_width);
92
93 const size_t w_elements = (kernel_size + 1) * channels;
94 const size_t o_elements = output_size * channels;
95 const size_t num_buffers = 1 +
96 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
97 sizeof(float) * (w_elements + o_elements));
98
99 std::vector<float, AlignedAllocator<float, 64>> packed_weights(w_elements * num_buffers);
100 std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
101 for (size_t c = 0; c < channels; c++) {
102 packed_weights[c * kernel_size + c] = bias[c];
103 for (size_t i = 0; i < kernel_size; i++) {
104 packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
105 }
106 }
107 for (size_t n = 1; n < num_buffers; n++) {
108 std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
109 }
110
111 std::vector<float> output(o_elements * num_buffers);
112 std::fill(output.begin(), output.end(), std::nanf(""));
113
114 xnn_f32_chw_params chw_params;
115 xnn_init_f32_chw_params(
116 &chw_params, input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
117
118 size_t buffer_index = 0;
119 for (auto _ : state) {
120 state.PauseTiming();
121 benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
122 buffer_index = (buffer_index + 1) % num_buffers;
123 state.ResumeTiming();
124
125 for (uint32_t channel = 0; channel < channels; channel++) {
126 dwconv(
127 input_height, input_width * sizeof(float),
128 input.data() + channel * inputSize,
129 packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
130 zero.data(),
131 output.data() + channel * output_size + buffer_index * o_elements,
132 padding_height / 2, // padding_top
133 &chw_params);
134 }
135 }
136
137 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
138 if (cpu_frequency != 0) {
139 state.counters["cpufreq"] = cpu_frequency;
140 }
141
142 state.counters["FLOPS"] = benchmark::Counter(
143 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
144 benchmark::Counter::kIsRate);
145
146 state.counters["bytes"] = benchmark::Counter(
147 uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(float),
148 benchmark::Counter::kIsRate);
149 }
150
151 #if XNN_ARCH_ARM
dwconv2d_chw_3x3p1__neon_1x4(benchmark::State & state,const char * net)152 static void dwconv2d_chw_3x3p1__neon_1x4(benchmark::State& state, const char* net) {
153 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
154 }
dwconv2d_chw_3x3p1__neon_2x4(benchmark::State & state,const char * net)155 static void dwconv2d_chw_3x3p1__neon_2x4(benchmark::State& state, const char* net) {
156 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
157 }
dwconv2d_chw_3x3p1__neon_3x4(benchmark::State & state,const char * net)158 static void dwconv2d_chw_3x3p1__neon_3x4(benchmark::State& state, const char* net) {
159 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
160 }
dwconv2d_chw_3x3p1__neon_4x4(benchmark::State & state,const char * net)161 static void dwconv2d_chw_3x3p1__neon_4x4(benchmark::State& state, const char* net) {
162 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
163 }
dwconv2d_chw_3x3p1__neon_5x4(benchmark::State & state,const char * net)164 static void dwconv2d_chw_3x3p1__neon_5x4(benchmark::State& state, const char* net) {
165 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
166 }
dwconv2d_chw_3x3p1__neon_6x4(benchmark::State & state,const char * net)167 static void dwconv2d_chw_3x3p1__neon_6x4(benchmark::State& state, const char* net) {
168 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
169 }
dwconv2d_chw_3x3p1__neon_1x4_acc2(benchmark::State & state,const char * net)170 static void dwconv2d_chw_3x3p1__neon_1x4_acc2(benchmark::State& state, const char* net) {
171 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckNEON);
172 }
dwconv2d_chw_3x3p1__neon_1x4_acc3(benchmark::State & state,const char * net)173 static void dwconv2d_chw_3x3p1__neon_1x4_acc3(benchmark::State& state, const char* net) {
174 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc3, 3, 3, 1, 1, benchmark::utils::CheckNEON);
175 }
dwconv2d_chw_3x3p1__neon_1x4_acc4(benchmark::State & state,const char * net)176 static void dwconv2d_chw_3x3p1__neon_1x4_acc4(benchmark::State& state, const char* net) {
177 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_1x4_acc4, 3, 3, 1, 1, benchmark::utils::CheckNEON);
178 }
dwconv2d_chw_3x3p1__neon_2x4_acc2(benchmark::State & state,const char * net)179 static void dwconv2d_chw_3x3p1__neon_2x4_acc2(benchmark::State& state, const char* net) {
180 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckNEON);
181 }
182
dwconv2d_chw_3x3s2p1__neon_1x4(benchmark::State & state,const char * net)183 static void dwconv2d_chw_3x3s2p1__neon_1x4(benchmark::State& state, const char* net) {
184 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
185 }
dwconv2d_chw_3x3s2p1__neon_2x4(benchmark::State & state,const char * net)186 static void dwconv2d_chw_3x3s2p1__neon_2x4(benchmark::State& state, const char* net) {
187 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_2x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
188 }
dwconv2d_chw_3x3s2p1__neon_3x4(benchmark::State & state,const char * net)189 static void dwconv2d_chw_3x3s2p1__neon_3x4(benchmark::State& state, const char* net) {
190 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_3x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
191 }
dwconv2d_chw_3x3s2p1__neon_4x4(benchmark::State & state,const char * net)192 static void dwconv2d_chw_3x3s2p1__neon_4x4(benchmark::State& state, const char* net) {
193 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_4x4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
194 }
dwconv2d_chw_3x3s2p1__neon_1x4_acc2(benchmark::State & state,const char * net)195 static void dwconv2d_chw_3x3s2p1__neon_1x4_acc2(benchmark::State& state, const char* net) {
196 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc2, 3, 3, 1, 2, benchmark::utils::CheckNEON);
197 }
dwconv2d_chw_3x3s2p1__neon_1x4_acc3(benchmark::State & state,const char * net)198 static void dwconv2d_chw_3x3s2p1__neon_1x4_acc3(benchmark::State& state, const char* net) {
199 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc3, 3, 3, 1, 2, benchmark::utils::CheckNEON);
200 }
dwconv2d_chw_3x3s2p1__neon_1x4_acc4(benchmark::State & state,const char * net)201 static void dwconv2d_chw_3x3s2p1__neon_1x4_acc4(benchmark::State& state, const char* net) {
202 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4_acc4, 3, 3, 1, 2, benchmark::utils::CheckNEON);
203 }
dwconv2d_chw_3x3s2p1__neon_2x4_acc2(benchmark::State & state,const char * net)204 static void dwconv2d_chw_3x3s2p1__neon_2x4_acc2(benchmark::State& state, const char* net) {
205 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_2x4_acc2, 3, 3, 1, 2, benchmark::utils::CheckNEON);
206 }
207
dwconv2d_chw_5x5p2__neon_1x4(benchmark::State & state,const char * net)208 static void dwconv2d_chw_5x5p2__neon_1x4(benchmark::State& state, const char* net) {
209 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
210 }
dwconv2d_chw_5x5p2__neon_2x4(benchmark::State & state,const char * net)211 static void dwconv2d_chw_5x5p2__neon_2x4(benchmark::State& state, const char* net) {
212 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
213 }
dwconv2d_chw_5x5p2__neon_3x4(benchmark::State & state,const char * net)214 static void dwconv2d_chw_5x5p2__neon_3x4(benchmark::State& state, const char* net) {
215 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
216 }
dwconv2d_chw_5x5p2__neon_4x4(benchmark::State & state,const char * net)217 static void dwconv2d_chw_5x5p2__neon_4x4(benchmark::State& state, const char* net) {
218 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
219 }
dwconv2d_chw_5x5p2__neon_5x4(benchmark::State & state,const char * net)220 static void dwconv2d_chw_5x5p2__neon_5x4(benchmark::State& state, const char* net) {
221 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
222 }
dwconv2d_chw_5x5p2__neon_1x4_acc2(benchmark::State & state,const char * net)223 static void dwconv2d_chw_5x5p2__neon_1x4_acc2(benchmark::State& state, const char* net) {
224 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
225 }
dwconv2d_chw_5x5p2__neon_1x4_acc3(benchmark::State & state,const char * net)226 static void dwconv2d_chw_5x5p2__neon_1x4_acc3(benchmark::State& state, const char* net) {
227 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3, 5, 5, 2, 1, benchmark::utils::CheckNEON);
228 }
dwconv2d_chw_5x5p2__neon_1x4_acc4(benchmark::State & state,const char * net)229 static void dwconv2d_chw_5x5p2__neon_1x4_acc4(benchmark::State& state, const char* net) {
230 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4, 5, 5, 2, 1, benchmark::utils::CheckNEON);
231 }
dwconv2d_chw_5x5p2__neon_1x4_acc5(benchmark::State & state,const char * net)232 static void dwconv2d_chw_5x5p2__neon_1x4_acc5(benchmark::State& state, const char* net) {
233 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5, 5, 5, 2, 1, benchmark::utils::CheckNEON);
234 }
dwconv2d_chw_5x5p2__neon_2x4_acc2(benchmark::State & state,const char * net)235 static void dwconv2d_chw_5x5p2__neon_2x4_acc2(benchmark::State& state, const char* net) {
236 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
237 }
dwconv2d_chw_5x5p2__neon_2x4_acc3(benchmark::State & state,const char * net)238 static void dwconv2d_chw_5x5p2__neon_2x4_acc3(benchmark::State& state, const char* net) {
239 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc3, 5, 5, 2, 1, benchmark::utils::CheckNEON);
240 }
dwconv2d_chw_5x5p2__neon_3x4_acc2(benchmark::State & state,const char * net)241 static void dwconv2d_chw_5x5p2__neon_3x4_acc2(benchmark::State& state, const char* net) {
242 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
243 }
dwconv2d_chw_5x5p2__neon_4x4_acc2(benchmark::State & state,const char * net)244 static void dwconv2d_chw_5x5p2__neon_4x4_acc2(benchmark::State& state, const char* net) {
245 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2, 5, 5, 2, 1, benchmark::utils::CheckNEON);
246 }
247
dwconv2d_chw_5x5s2p2__neon_1x4(benchmark::State & state,const char * net)248 static void dwconv2d_chw_5x5s2p2__neon_1x4(benchmark::State& state, const char* net) {
249 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
250 }
dwconv2d_chw_5x5s2p2__neon_2x4(benchmark::State & state,const char * net)251 static void dwconv2d_chw_5x5s2p2__neon_2x4(benchmark::State& state, const char* net) {
252 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
253 }
dwconv2d_chw_5x5s2p2__neon_3x4(benchmark::State & state,const char * net)254 static void dwconv2d_chw_5x5s2p2__neon_3x4(benchmark::State& state, const char* net) {
255 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
256 }
dwconv2d_chw_5x5s2p2__neon_1x4_acc2(benchmark::State & state,const char * net)257 static void dwconv2d_chw_5x5s2p2__neon_1x4_acc2(benchmark::State& state, const char* net) {
258 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc2, 5, 5, 2, 2, benchmark::utils::CheckNEON);
259 }
dwconv2d_chw_5x5s2p2__neon_1x4_acc3(benchmark::State & state,const char * net)260 static void dwconv2d_chw_5x5s2p2__neon_1x4_acc3(benchmark::State& state, const char* net) {
261 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3, 5, 5, 2, 2, benchmark::utils::CheckNEON);
262 }
dwconv2d_chw_5x5s2p2__neon_1x4_acc4(benchmark::State & state,const char * net)263 static void dwconv2d_chw_5x5s2p2__neon_1x4_acc4(benchmark::State& state, const char* net) {
264 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4, 5, 5, 2, 2, benchmark::utils::CheckNEON);
265 }
dwconv2d_chw_5x5s2p2__neon_1x4_acc5(benchmark::State & state,const char * net)266 static void dwconv2d_chw_5x5s2p2__neon_1x4_acc5(benchmark::State& state, const char* net) {
267 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc5, 5, 5, 2, 2, benchmark::utils::CheckNEON);
268 }
dwconv2d_chw_5x5s2p2__neon_2x4_acc2(benchmark::State & state,const char * net)269 static void dwconv2d_chw_5x5s2p2__neon_2x4_acc2(benchmark::State& state, const char* net) {
270 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc2, 5, 5, 2, 2, benchmark::utils::CheckNEON);
271 }
dwconv2d_chw_5x5s2p2__neon_2x4_acc3(benchmark::State & state,const char * net)272 static void dwconv2d_chw_5x5s2p2__neon_2x4_acc3(benchmark::State& state, const char* net) {
273 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4_acc3, 5, 5, 2, 2, benchmark::utils::CheckNEON);
274 }
dwconv2d_chw_5x5s2p2__neon_3x4_acc2(benchmark::State & state,const char * net)275 static void dwconv2d_chw_5x5s2p2__neon_3x4_acc2(benchmark::State& state, const char* net) {
276 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4_acc2, 5, 5, 2, 2, benchmark::utils::CheckNEON);
277 }
278
// BENCHMARK_DWCONV invocations for the NEON wrappers, grouped by kernel family
// (3x3p1, 3x3s2p1, 5x5p2, 5x5s2p2).
// NOTE(review): BENCHMARK_DWCONV is not defined in this file; presumably it comes
// from bench/dwconv.h and registers each function with the benchmark framework
// (supplying the `net` argument) -- confirm.
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_4x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_5x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_6x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neon_2x4_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_4x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neon_2x4_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_4x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_5x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_1x4_acc5)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_2x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_2x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_3x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neon_4x4_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_1x4_acc5)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_2x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_2x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neon_3x4_acc2)
323 #endif // XNN_ARCH_ARM
324
325 #if XNN_ARCH_ARM64
326 static void dwconv2d_chw_3x3p1__neonfma_1x4(benchmark::State& state, const char* net) {
327 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4, 3, 3, 1, 1);
328 }
dwconv2d_chw_3x3p1__neonfma_2x4(benchmark::State & state,const char * net)329 static void dwconv2d_chw_3x3p1__neonfma_2x4(benchmark::State& state, const char* net) {
330 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_2x4, 3, 3, 1, 1);
331 }
dwconv2d_chw_3x3p1__neonfma_3x4(benchmark::State & state,const char * net)332 static void dwconv2d_chw_3x3p1__neonfma_3x4(benchmark::State& state, const char* net) {
333 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4, 3, 3, 1, 1);
334 }
dwconv2d_chw_3x3p1__neonfma_4x4(benchmark::State & state,const char * net)335 static void dwconv2d_chw_3x3p1__neonfma_4x4(benchmark::State& state, const char* net) {
336 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4, 3, 3, 1, 1);
337 }
dwconv2d_chw_3x3p1__neonfma_5x4(benchmark::State & state,const char * net)338 static void dwconv2d_chw_3x3p1__neonfma_5x4(benchmark::State& state, const char* net) {
339 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4, 3, 3, 1, 1);
340 }
dwconv2d_chw_3x3p1__neonfma_6x4(benchmark::State & state,const char * net)341 static void dwconv2d_chw_3x3p1__neonfma_6x4(benchmark::State& state, const char* net) {
342 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4, 3, 3, 1, 1);
343 }
dwconv2d_chw_3x3p1__neonfma_1x4_acc2(benchmark::State & state,const char * net)344 static void dwconv2d_chw_3x3p1__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
345 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc2, 3, 3, 1, 1);
346 }
dwconv2d_chw_3x3p1__neonfma_1x4_acc3(benchmark::State & state,const char * net)347 static void dwconv2d_chw_3x3p1__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
348 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc3, 3, 3, 1, 1);
349 }
dwconv2d_chw_3x3p1__neonfma_1x4_acc4(benchmark::State & state,const char * net)350 static void dwconv2d_chw_3x3p1__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
351 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_1x4_acc4, 3, 3, 1, 1);
352 }
dwconv2d_chw_3x3p1__neonfma_2x4_acc2(benchmark::State & state,const char * net)353 static void dwconv2d_chw_3x3p1__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
354 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_2x4_acc2, 3, 3, 1, 1);
355 }
356
dwconv2d_chw_3x3s2p1__neonfma_1x4(benchmark::State & state,const char * net)357 static void dwconv2d_chw_3x3s2p1__neonfma_1x4(benchmark::State& state, const char* net) {
358 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4, 3, 3, 1, 2);
359 }
dwconv2d_chw_3x3s2p1__neonfma_2x4(benchmark::State & state,const char * net)360 static void dwconv2d_chw_3x3s2p1__neonfma_2x4(benchmark::State& state, const char* net) {
361 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4, 3, 3, 1, 2);
362 }
dwconv2d_chw_3x3s2p1__neonfma_3x4(benchmark::State & state,const char * net)363 static void dwconv2d_chw_3x3s2p1__neonfma_3x4(benchmark::State& state, const char* net) {
364 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_3x4, 3, 3, 1, 2);
365 }
dwconv2d_chw_3x3s2p1__neonfma_4x4(benchmark::State & state,const char * net)366 static void dwconv2d_chw_3x3s2p1__neonfma_4x4(benchmark::State& state, const char* net) {
367 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_4x4, 3, 3, 1, 2);
368 }
dwconv2d_chw_3x3s2p1__neonfma_1x4_acc2(benchmark::State & state,const char * net)369 static void dwconv2d_chw_3x3s2p1__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
370 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc2, 3, 3, 1, 2);
371 }
dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3(benchmark::State & state,const char * net)372 static void dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
373 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc3, 3, 3, 1, 2);
374 }
dwconv2d_chw_3x3s2p1__neonfma_1x4_acc4(benchmark::State & state,const char * net)375 static void dwconv2d_chw_3x3s2p1__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
376 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc4, 3, 3, 1, 2);
377 }
dwconv2d_chw_3x3s2p1__neonfma_2x4_acc2(benchmark::State & state,const char * net)378 static void dwconv2d_chw_3x3s2p1__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
379 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2, 3, 3, 1, 2);
380 }
381
dwconv2d_chw_5x5p2__neonfma_1x4(benchmark::State & state,const char * net)382 static void dwconv2d_chw_5x5p2__neonfma_1x4(benchmark::State& state, const char* net) {
383 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4, 5, 5, 2, 1);
384 }
dwconv2d_chw_5x5p2__neonfma_2x4(benchmark::State & state,const char * net)385 static void dwconv2d_chw_5x5p2__neonfma_2x4(benchmark::State& state, const char* net) {
386 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4, 5, 5, 2, 1);
387 }
dwconv2d_chw_5x5p2__neonfma_3x4(benchmark::State & state,const char * net)388 static void dwconv2d_chw_5x5p2__neonfma_3x4(benchmark::State& state, const char* net) {
389 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4, 5, 5, 2, 1);
390 }
dwconv2d_chw_5x5p2__neonfma_4x4(benchmark::State & state,const char * net)391 static void dwconv2d_chw_5x5p2__neonfma_4x4(benchmark::State& state, const char* net) {
392 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4, 5, 5, 2, 1);
393 }
dwconv2d_chw_5x5p2__neonfma_5x4(benchmark::State & state,const char * net)394 static void dwconv2d_chw_5x5p2__neonfma_5x4(benchmark::State& state, const char* net) {
395 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4, 5, 5, 2, 1);
396 }
dwconv2d_chw_5x5p2__neonfma_1x4_acc2(benchmark::State & state,const char * net)397 static void dwconv2d_chw_5x5p2__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
398 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2, 5, 5, 2, 1);
399 }
dwconv2d_chw_5x5p2__neonfma_1x4_acc3(benchmark::State & state,const char * net)400 static void dwconv2d_chw_5x5p2__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
401 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3, 5, 5, 2, 1);
402 }
dwconv2d_chw_5x5p2__neonfma_1x4_acc4(benchmark::State & state,const char * net)403 static void dwconv2d_chw_5x5p2__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
404 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4, 5, 5, 2, 1);
405 }
dwconv2d_chw_5x5p2__neonfma_1x4_acc5(benchmark::State & state,const char * net)406 static void dwconv2d_chw_5x5p2__neonfma_1x4_acc5(benchmark::State& state, const char* net) {
407 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5, 5, 5, 2, 1);
408 }
dwconv2d_chw_5x5p2__neonfma_2x4_acc2(benchmark::State & state,const char * net)409 static void dwconv2d_chw_5x5p2__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
410 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2, 5, 5, 2, 1);
411 }
dwconv2d_chw_5x5p2__neonfma_2x4_acc3(benchmark::State & state,const char * net)412 static void dwconv2d_chw_5x5p2__neonfma_2x4_acc3(benchmark::State& state, const char* net) {
413 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc3, 5, 5, 2, 1);
414 }
dwconv2d_chw_5x5p2__neonfma_3x4_acc2(benchmark::State & state,const char * net)415 static void dwconv2d_chw_5x5p2__neonfma_3x4_acc2(benchmark::State& state, const char* net) {
416 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2, 5, 5, 2, 1);
417 }
dwconv2d_chw_5x5p2__neonfma_4x4_acc2(benchmark::State & state,const char * net)418 static void dwconv2d_chw_5x5p2__neonfma_4x4_acc2(benchmark::State& state, const char* net) {
419 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2, 5, 5, 2, 1);
420 }
421
dwconv2d_chw_5x5s2p2__neonfma_1x4(benchmark::State & state,const char * net)422 static void dwconv2d_chw_5x5s2p2__neonfma_1x4(benchmark::State& state, const char* net) {
423 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4, 5, 5, 2, 2);
424 }
dwconv2d_chw_5x5s2p2__neonfma_2x4(benchmark::State & state,const char * net)425 static void dwconv2d_chw_5x5s2p2__neonfma_2x4(benchmark::State& state, const char* net) {
426 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4, 5, 5, 2, 2);
427 }
dwconv2d_chw_5x5s2p2__neonfma_3x4(benchmark::State & state,const char * net)428 static void dwconv2d_chw_5x5s2p2__neonfma_3x4(benchmark::State& state, const char* net) {
429 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4, 5, 5, 2, 2);
430 }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2(benchmark::State & state,const char * net)431 static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
432 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2, 5, 5, 2, 2);
433 }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc3(benchmark::State & state,const char * net)434 static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
435 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3, 5, 5, 2, 2);
436 }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc4(benchmark::State & state,const char * net)437 static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc4(benchmark::State& state, const char* net) {
438 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4, 5, 5, 2, 2);
439 }
dwconv2d_chw_5x5s2p2__neonfma_1x4_acc5(benchmark::State & state,const char * net)440 static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc5(benchmark::State& state, const char* net) {
441 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc5, 5, 5, 2, 2);
442 }
dwconv2d_chw_5x5s2p2__neonfma_2x4_acc2(benchmark::State & state,const char * net)443 static void dwconv2d_chw_5x5s2p2__neonfma_2x4_acc2(benchmark::State& state, const char* net) {
444 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc2, 5, 5, 2, 2);
445 }
dwconv2d_chw_5x5s2p2__neonfma_2x4_acc3(benchmark::State & state,const char * net)446 static void dwconv2d_chw_5x5s2p2__neonfma_2x4_acc3(benchmark::State& state, const char* net) {
447 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4_acc3, 5, 5, 2, 2);
448 }
dwconv2d_chw_5x5s2p2__neonfma_3x4_acc2(benchmark::State & state,const char * net)449 static void dwconv2d_chw_5x5s2p2__neonfma_3x4_acc2(benchmark::State& state, const char* net) {
450 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4_acc2, 5, 5, 2, 2);
451 }
452
// BENCHMARK_DWCONV invocations for the NEONFMA wrappers, grouped by kernel family
// (3x3p1, 3x3s2p1, 5x5p2, 5x5s2p2).
// NOTE(review): BENCHMARK_DWCONV is not defined in this file; presumably it comes
// from bench/dwconv.h and registers each function with the benchmark framework
// (supplying the `net` argument) -- confirm.
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_4x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_5x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_6x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_2x4_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_4x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_2x4_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_4x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_5x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_1x4_acc5)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_2x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_2x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_3x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_4x4_acc2)

BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_2x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc5)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_2x4_acc2)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_2x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_3x4_acc2)
497 #endif // XNN_ARCH_ARM64
498
499 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
500 static void dwconv2d_chw_3x3p1__sse_1x4(benchmark::State& state, const char* net) {
501 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4, 3, 3, 1, 1);
502 }
dwconv2d_chw_3x3p1__sse_2x4(benchmark::State & state,const char * net)503 static void dwconv2d_chw_3x3p1__sse_2x4(benchmark::State& state, const char* net) {
504 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4, 3, 3, 1, 1);
505 }
dwconv2d_chw_3x3p1__sse_3x4(benchmark::State & state,const char * net)506 static void dwconv2d_chw_3x3p1__sse_3x4(benchmark::State& state, const char* net) {
507 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4, 3, 3, 1, 1);
508 }
dwconv2d_chw_3x3p1__sse_4x4(benchmark::State & state,const char * net)509 static void dwconv2d_chw_3x3p1__sse_4x4(benchmark::State& state, const char* net) {
510 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4, 3, 3, 1, 1);
511 }
dwconv2d_chw_3x3p1__sse_5x4(benchmark::State & state,const char * net)512 static void dwconv2d_chw_3x3p1__sse_5x4(benchmark::State& state, const char* net) {
513 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4, 3, 3, 1, 1);
514 }
dwconv2d_chw_3x3p1__sse_6x4(benchmark::State & state,const char * net)515 static void dwconv2d_chw_3x3p1__sse_6x4(benchmark::State& state, const char* net) {
516 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4, 3, 3, 1, 1);
517 }
dwconv2d_chw_3x3p1__sse_1x4_acc2(benchmark::State & state,const char * net)518 static void dwconv2d_chw_3x3p1__sse_1x4_acc2(benchmark::State& state, const char* net) {
519 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc2, 3, 3, 1, 1);
520 }
dwconv2d_chw_3x3p1__sse_1x4_acc3(benchmark::State & state,const char * net)521 static void dwconv2d_chw_3x3p1__sse_1x4_acc3(benchmark::State& state, const char* net) {
522 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc3, 3, 3, 1, 1);
523 }
dwconv2d_chw_3x3p1__sse_1x4_acc4(benchmark::State & state,const char * net)524 static void dwconv2d_chw_3x3p1__sse_1x4_acc4(benchmark::State& state, const char* net) {
525 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc4, 3, 3, 1, 1);
526 }
dwconv2d_chw_3x3p1__sse_2x4_acc2(benchmark::State & state,const char * net)527 static void dwconv2d_chw_3x3p1__sse_2x4_acc2(benchmark::State& state, const char* net) {
528 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2, 3, 3, 1, 1);
529 }
530
dwconv2d_chw_3x3p1__ssse3_1x4(benchmark::State & state,const char * net)531 static void dwconv2d_chw_3x3p1__ssse3_1x4(benchmark::State& state, const char* net) {
532 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
533 }
dwconv2d_chw_3x3p1__ssse3_2x4(benchmark::State & state,const char * net)534 static void dwconv2d_chw_3x3p1__ssse3_2x4(benchmark::State& state, const char* net) {
535 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
536 }
dwconv2d_chw_3x3p1__ssse3_3x4(benchmark::State & state,const char * net)537 static void dwconv2d_chw_3x3p1__ssse3_3x4(benchmark::State& state, const char* net) {
538 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
539 }
dwconv2d_chw_3x3p1__ssse3_4x4(benchmark::State & state,const char * net)540 static void dwconv2d_chw_3x3p1__ssse3_4x4(benchmark::State& state, const char* net) {
541 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
542 }
dwconv2d_chw_3x3p1__ssse3_5x4(benchmark::State & state,const char * net)543 static void dwconv2d_chw_3x3p1__ssse3_5x4(benchmark::State& state, const char* net) {
544 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
545 }
dwconv2d_chw_3x3p1__ssse3_6x4(benchmark::State & state,const char * net)546 static void dwconv2d_chw_3x3p1__ssse3_6x4(benchmark::State& state, const char* net) {
547 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
548 }
dwconv2d_chw_3x3p1__ssse3_1x4_acc2(benchmark::State & state,const char * net)549 static void dwconv2d_chw_3x3p1__ssse3_1x4_acc2(benchmark::State& state, const char* net) {
550 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
551 }
dwconv2d_chw_3x3p1__ssse3_1x4_acc3(benchmark::State & state,const char * net)552 static void dwconv2d_chw_3x3p1__ssse3_1x4_acc3(benchmark::State& state, const char* net) {
553 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc3, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
554 }
dwconv2d_chw_3x3p1__ssse3_1x4_acc4(benchmark::State & state,const char * net)555 static void dwconv2d_chw_3x3p1__ssse3_1x4_acc4(benchmark::State& state, const char* net) {
556 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_1x4_acc4, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
557 }
dwconv2d_chw_3x3p1__ssse3_2x4_acc2(benchmark::State & state,const char * net)558 static void dwconv2d_chw_3x3p1__ssse3_2x4_acc2(benchmark::State& state, const char* net) {
559 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2, 3, 3, 1, 1, benchmark::utils::CheckSSSE3);
560 }
561
dwconv2d_chw_3x3s2p1__sse_1x4(benchmark::State & state,const char * net)562 static void dwconv2d_chw_3x3s2p1__sse_1x4(benchmark::State& state, const char* net) {
563 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4, 3, 3, 1, 2);
564 }
dwconv2d_chw_3x3s2p1__sse_2x4(benchmark::State & state,const char * net)565 static void dwconv2d_chw_3x3s2p1__sse_2x4(benchmark::State& state, const char* net) {
566 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_2x4, 3, 3, 1, 2);
567 }
dwconv2d_chw_3x3s2p1__sse_3x4(benchmark::State & state,const char * net)568 static void dwconv2d_chw_3x3s2p1__sse_3x4(benchmark::State& state, const char* net) {
569 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_3x4, 3, 3, 1, 2);
570 }
dwconv2d_chw_3x3s2p1__sse_4x4(benchmark::State & state,const char * net)571 static void dwconv2d_chw_3x3s2p1__sse_4x4(benchmark::State& state, const char* net) {
572 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_4x4, 3, 3, 1, 2);
573 }
dwconv2d_chw_3x3s2p1__sse_1x4_acc2(benchmark::State & state,const char * net)574 static void dwconv2d_chw_3x3s2p1__sse_1x4_acc2(benchmark::State& state, const char* net) {
575 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc2, 3, 3, 1, 2);
576 }
dwconv2d_chw_3x3s2p1__sse_1x4_acc3(benchmark::State & state,const char * net)577 static void dwconv2d_chw_3x3s2p1__sse_1x4_acc3(benchmark::State& state, const char* net) {
578 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3, 3, 3, 1, 2);
579 }
dwconv2d_chw_3x3s2p1__sse_1x4_acc4(benchmark::State & state,const char * net)580 static void dwconv2d_chw_3x3s2p1__sse_1x4_acc4(benchmark::State& state, const char* net) {
581 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc4, 3, 3, 1, 2);
582 }
dwconv2d_chw_3x3s2p1__sse_2x4_acc2(benchmark::State & state,const char * net)583 static void dwconv2d_chw_3x3s2p1__sse_2x4_acc2(benchmark::State& state, const char* net) {
584 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_2x4_acc2, 3, 3, 1, 2);
585 }
586
dwconv2d_chw_5x5p2__sse_1x4(benchmark::State & state,const char * net)587 static void dwconv2d_chw_5x5p2__sse_1x4(benchmark::State& state, const char* net) {
588 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4, 5, 5, 2, 1);
589 }
dwconv2d_chw_5x5p2__sse_2x4(benchmark::State & state,const char * net)590 static void dwconv2d_chw_5x5p2__sse_2x4(benchmark::State& state, const char* net) {
591 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4, 5, 5, 2, 1);
592 }
dwconv2d_chw_5x5p2__sse_3x4(benchmark::State & state,const char * net)593 static void dwconv2d_chw_5x5p2__sse_3x4(benchmark::State& state, const char* net) {
594 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4, 5, 5, 2, 1);
595 }
dwconv2d_chw_5x5p2__sse_4x4(benchmark::State & state,const char * net)596 static void dwconv2d_chw_5x5p2__sse_4x4(benchmark::State& state, const char* net) {
597 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4, 5, 5, 2, 1);
598 }
dwconv2d_chw_5x5p2__sse_5x4(benchmark::State & state,const char * net)599 static void dwconv2d_chw_5x5p2__sse_5x4(benchmark::State& state, const char* net) {
600 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4, 5, 5, 2, 1);
601 }
dwconv2d_chw_5x5p2__sse_1x4_acc2(benchmark::State & state,const char * net)602 static void dwconv2d_chw_5x5p2__sse_1x4_acc2(benchmark::State& state, const char* net) {
603 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2, 5, 5, 2, 1);
604 }
dwconv2d_chw_5x5p2__sse_1x4_acc3(benchmark::State & state,const char * net)605 static void dwconv2d_chw_5x5p2__sse_1x4_acc3(benchmark::State& state, const char* net) {
606 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3, 5, 5, 2, 1);
607 }
dwconv2d_chw_5x5p2__sse_1x4_acc4(benchmark::State & state,const char * net)608 static void dwconv2d_chw_5x5p2__sse_1x4_acc4(benchmark::State& state, const char* net) {
609 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4, 5, 5, 2, 1);
610 }
dwconv2d_chw_5x5p2__sse_1x4_acc5(benchmark::State & state,const char * net)611 static void dwconv2d_chw_5x5p2__sse_1x4_acc5(benchmark::State& state, const char* net) {
612 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5, 5, 5, 2, 1);
613 }
dwconv2d_chw_5x5p2__sse_2x4_acc2(benchmark::State & state,const char * net)614 static void dwconv2d_chw_5x5p2__sse_2x4_acc2(benchmark::State& state, const char* net) {
615 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2, 5, 5, 2, 1);
616 }
dwconv2d_chw_5x5p2__sse_2x4_acc3(benchmark::State & state,const char * net)617 static void dwconv2d_chw_5x5p2__sse_2x4_acc3(benchmark::State& state, const char* net) {
618 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc3, 5, 5, 2, 1);
619 }
dwconv2d_chw_5x5p2__sse_3x4_acc2(benchmark::State & state,const char * net)620 static void dwconv2d_chw_5x5p2__sse_3x4_acc2(benchmark::State& state, const char* net) {
621 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2, 5, 5, 2, 1);
622 }
dwconv2d_chw_5x5p2__sse_4x4_acc2(benchmark::State & state,const char * net)623 static void dwconv2d_chw_5x5p2__sse_4x4_acc2(benchmark::State& state, const char* net) {
624 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2, 5, 5, 2, 1);
625 }
626
dwconv2d_chw_5x5s2p2__sse_1x4(benchmark::State & state,const char * net)627 static void dwconv2d_chw_5x5s2p2__sse_1x4(benchmark::State& state, const char* net) {
628 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4, 5, 5, 2, 2);
629 }
dwconv2d_chw_5x5s2p2__sse_2x4(benchmark::State & state,const char * net)630 static void dwconv2d_chw_5x5s2p2__sse_2x4(benchmark::State& state, const char* net) {
631 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4, 5, 5, 2, 2);
632 }
dwconv2d_chw_5x5s2p2__sse_3x4(benchmark::State & state,const char * net)633 static void dwconv2d_chw_5x5s2p2__sse_3x4(benchmark::State& state, const char* net) {
634 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4, 5, 5, 2, 2);
635 }
dwconv2d_chw_5x5s2p2__sse_1x4_acc2(benchmark::State & state,const char * net)636 static void dwconv2d_chw_5x5s2p2__sse_1x4_acc2(benchmark::State& state, const char* net) {
637 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc2, 5, 5, 2, 2);
638 }
dwconv2d_chw_5x5s2p2__sse_1x4_acc3(benchmark::State & state,const char * net)639 static void dwconv2d_chw_5x5s2p2__sse_1x4_acc3(benchmark::State& state, const char* net) {
640 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc3, 5, 5, 2, 2);
641 }
dwconv2d_chw_5x5s2p2__sse_1x4_acc4(benchmark::State & state,const char * net)642 static void dwconv2d_chw_5x5s2p2__sse_1x4_acc4(benchmark::State& state, const char* net) {
643 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc4, 5, 5, 2, 2);
644 }
dwconv2d_chw_5x5s2p2__sse_1x4_acc5(benchmark::State & state,const char * net)645 static void dwconv2d_chw_5x5s2p2__sse_1x4_acc5(benchmark::State& state, const char* net) {
646 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_1x4_acc5, 5, 5, 2, 2);
647 }
dwconv2d_chw_5x5s2p2__sse_2x4_acc2(benchmark::State & state,const char * net)648 static void dwconv2d_chw_5x5s2p2__sse_2x4_acc2(benchmark::State& state, const char* net) {
649 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2, 5, 5, 2, 2);
650 }
dwconv2d_chw_5x5s2p2__sse_2x4_acc3(benchmark::State & state,const char * net)651 static void dwconv2d_chw_5x5s2p2__sse_2x4_acc3(benchmark::State& state, const char* net) {
652 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3, 5, 5, 2, 2);
653 }
dwconv2d_chw_5x5s2p2__sse_3x4_acc2(benchmark::State & state,const char * net)654 static void dwconv2d_chw_5x5s2p2__sse_3x4_acc2(benchmark::State& state, const char* net) {
655 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2, 5, 5, 2, 2);
656 }
657
658 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_2x4)659 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_2x4)
660 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_3x4)
661 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_4x4)
662 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_5x4)
663 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_6x4)
664 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4_acc2)
665 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4_acc3)
666 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4_acc4)
667 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_2x4_acc2)
668
669 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4)
670 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_2x4)
671 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_3x4)
672 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_4x4)
673 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_5x4)
674 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_6x4)
675 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4_acc2)
676 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4_acc3)
677 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_1x4_acc4)
678 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__ssse3_2x4_acc2)
679
680 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4)
681 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_2x4)
682 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_3x4)
683 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_4x4)
684 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4_acc2)
685 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4_acc3)
686 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4_acc4)
687 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_2x4_acc2)
688
689 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4)
690 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_2x4)
691 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_3x4)
692 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_4x4)
693 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_5x4)
694 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc2)
695 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc3)
696 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc4)
697 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_1x4_acc5)
698 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_2x4_acc2)
699 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_2x4_acc3)
700 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_3x4_acc2)
701 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__sse_4x4_acc2)
702
703 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4)
704 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_2x4)
705 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_3x4)
706 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc2)
707 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc3)
708 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc4)
709 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_1x4_acc5)
710 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_2x4_acc2)
711 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_2x4_acc3)
712 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__sse_3x4_acc2)
713 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
714
715 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
716 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
717 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4, 3, 3, 1, 1);
718 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)719 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
720 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4, 3, 3, 1, 1);
721 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)722 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
723 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4, 3, 3, 1, 1);
724 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_4x4(benchmark::State & state,const char * net)725 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_4x4(benchmark::State& state, const char* net) {
726 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4, 3, 3, 1, 1);
727 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_5x4(benchmark::State & state,const char * net)728 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_5x4(benchmark::State& state, const char* net) {
729 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4, 3, 3, 1, 1);
730 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_6x4(benchmark::State & state,const char * net)731 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_6x4(benchmark::State& state, const char* net) {
732 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4, 3, 3, 1, 1);
733 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)734 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
735 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc2, 3, 3, 1, 1);
736 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)737 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
738 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc3, 3, 3, 1, 1);
739 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)740 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
741 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_1x4_acc4, 3, 3, 1, 1);
742 }
dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)743 static void dwconv2d_chw_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
744 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4_acc2, 3, 3, 1, 1);
745 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)746 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
747 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4, 3, 3, 1, 1);
748 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)749 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
750 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4, 3, 3, 1, 1);
751 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)752 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
753 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4, 3, 3, 1, 1);
754 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_4x4(benchmark::State & state,const char * net)755 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_4x4(benchmark::State& state, const char* net) {
756 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4, 3, 3, 1, 1);
757 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_5x4(benchmark::State & state,const char * net)758 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_5x4(benchmark::State& state, const char* net) {
759 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4, 3, 3, 1, 1);
760 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_6x4(benchmark::State & state,const char * net)761 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_6x4(benchmark::State& state, const char* net) {
762 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4, 3, 3, 1, 1);
763 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)764 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
765 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc2, 3, 3, 1, 1);
766 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)767 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
768 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc3, 3, 3, 1, 1);
769 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)770 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
771 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_1x4_acc4, 3, 3, 1, 1);
772 }
dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State & state,const char * net)773 static void dwconv2d_chw_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
774 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4_acc2, 3, 3, 1, 1);
775 }
776
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4(benchmark::State & state,const char * net)777 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4(benchmark::State& state, const char* net) {
778 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4, 3, 3, 1, 1);
779 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4(benchmark::State & state,const char * net)780 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4(benchmark::State& state, const char* net) {
781 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4, 3, 3, 1, 1);
782 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_3x4(benchmark::State & state,const char * net)783 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_3x4(benchmark::State& state, const char* net) {
784 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4, 3, 3, 1, 1);
785 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_4x4(benchmark::State & state,const char * net)786 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_4x4(benchmark::State& state, const char* net) {
787 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4, 3, 3, 1, 1);
788 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_5x4(benchmark::State & state,const char * net)789 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_5x4(benchmark::State& state, const char* net) {
790 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4, 3, 3, 1, 1);
791 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_6x4(benchmark::State & state,const char * net)792 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_6x4(benchmark::State& state, const char* net) {
793 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4, 3, 3, 1, 1);
794 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State & state,const char * net)795 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State& state, const char* net) {
796 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc2, 3, 3, 1, 1);
797 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State & state,const char * net)798 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State& state, const char* net) {
799 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc3, 3, 3, 1, 1);
800 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State & state,const char * net)801 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State& state, const char* net) {
802 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_1x4_acc4, 3, 3, 1, 1);
803 }
dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State & state,const char * net)804 static void dwconv2d_chw_3x3p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State& state, const char* net) {
805 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2, 3, 3, 1, 1);
806 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4(benchmark::State & state,const char * net)807 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4(benchmark::State& state, const char* net) {
808 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4, 3, 3, 1, 1);
809 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4(benchmark::State & state,const char * net)810 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4(benchmark::State& state, const char* net) {
811 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4, 3, 3, 1, 1);
812 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_3x4(benchmark::State & state,const char * net)813 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_3x4(benchmark::State& state, const char* net) {
814 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4, 3, 3, 1, 1);
815 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_4x4(benchmark::State & state,const char * net)816 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_4x4(benchmark::State& state, const char* net) {
817 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4, 3, 3, 1, 1);
818 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_5x4(benchmark::State & state,const char * net)819 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_5x4(benchmark::State& state, const char* net) {
820 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4, 3, 3, 1, 1);
821 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_6x4(benchmark::State & state,const char * net)822 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_6x4(benchmark::State& state, const char* net) {
823 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4, 3, 3, 1, 1);
824 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State & state,const char * net)825 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State& state, const char* net) {
826 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2, 3, 3, 1, 1);
827 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State & state,const char * net)828 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State& state, const char* net) {
829 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc3, 3, 3, 1, 1);
830 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State & state,const char * net)831 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State& state, const char* net) {
832 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc4, 3, 3, 1, 1);
833 }
dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State & state,const char * net)834 static void dwconv2d_chw_3x3p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State& state, const char* net) {
835 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2, 3, 3, 1, 1);
836 }
837
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4(benchmark::State & state,const char * net)838 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
839 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4, 3, 3, 1, 2);
840 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)841 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
842 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_2x4, 3, 3, 1, 2);
843 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)844 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
845 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4, 3, 3, 1, 2);
846 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_4x4(benchmark::State & state,const char * net)847 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_4x4(benchmark::State& state, const char* net) {
848 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4, 3, 3, 1, 2);
849 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)850 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
851 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc2, 3, 3, 1, 2);
852 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)853 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
854 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc3, 3, 3, 1, 2);
855 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)856 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
857 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_1x4_acc4, 3, 3, 1, 2);
858 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)859 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
860 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_2x4_acc2, 3, 3, 1, 2);
861 }
862
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)863 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
864 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4, 3, 3, 1, 2);
865 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)866 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
867 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_2x4, 3, 3, 1, 2);
868 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)869 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
870 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4, 3, 3, 1, 2);
871 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_4x4(benchmark::State & state,const char * net)872 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_4x4(benchmark::State& state, const char* net) {
873 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4, 3, 3, 1, 2);
874 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)875 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
876 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc2, 3, 3, 1, 2);
877 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)878 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
879 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc3, 3, 3, 1, 2);
880 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)881 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
882 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_1x4_acc4, 3, 3, 1, 2);
883 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State & state,const char * net)884 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
885 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_2x4_acc2, 3, 3, 1, 2);
886 }
887
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4(benchmark::State & state,const char * net)888 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4(benchmark::State& state, const char* net) {
889 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4, 3, 3, 1, 2);
890 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4(benchmark::State & state,const char * net)891 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4(benchmark::State& state, const char* net) {
892 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4, 3, 3, 1, 2);
893 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_3x4(benchmark::State & state,const char * net)894 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_3x4(benchmark::State& state, const char* net) {
895 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4, 3, 3, 1, 2);
896 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_4x4(benchmark::State & state,const char * net)897 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_4x4(benchmark::State& state, const char* net) {
898 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4, 3, 3, 1, 2);
899 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State & state,const char * net)900 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc2(benchmark::State& state, const char* net) {
901 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc2, 3, 3, 1, 2);
902 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State & state,const char * net)903 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc3(benchmark::State& state, const char* net) {
904 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc3, 3, 3, 1, 2);
905 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State & state,const char * net)906 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_1x4_acc4(benchmark::State& state, const char* net) {
907 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4, 3, 3, 1, 2);
908 }
dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State & state,const char * net)909 static void dwconv2d_chw_3x3s2p1__wasmsimd_arm_splat_2x4_acc2(benchmark::State& state, const char* net) {
910 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2, 3, 3, 1, 2);
911 }
912
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4(benchmark::State & state,const char * net)913 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4(benchmark::State& state, const char* net) {
914 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4, 3, 3, 1, 2);
915 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4(benchmark::State & state,const char * net)916 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4(benchmark::State& state, const char* net) {
917 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4, 3, 3, 1, 2);
918 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_3x4(benchmark::State & state,const char * net)919 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_3x4(benchmark::State& state, const char* net) {
920 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4, 3, 3, 1, 2);
921 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_4x4(benchmark::State & state,const char * net)922 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_4x4(benchmark::State& state, const char* net) {
923 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4, 3, 3, 1, 2);
924 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State & state,const char * net)925 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc2(benchmark::State& state, const char* net) {
926 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2, 3, 3, 1, 2);
927 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State & state,const char * net)928 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc3(benchmark::State& state, const char* net) {
929 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc3, 3, 3, 1, 2);
930 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State & state,const char * net)931 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_1x4_acc4(benchmark::State& state, const char* net) {
932 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc4, 3, 3, 1, 2);
933 }
dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State & state,const char * net)934 static void dwconv2d_chw_3x3s2p1__wasmsimd_x86_splat_2x4_acc2(benchmark::State& state, const char* net) {
935 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2, 3, 3, 1, 2);
936 }
937
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4(benchmark::State & state,const char * net)938 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
939 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4, 5, 5, 2, 1);
940 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)941 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
942 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4, 5, 5, 2, 1);
943 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)944 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
945 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4, 5, 5, 2, 1);
946 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4(benchmark::State & state,const char * net)947 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4(benchmark::State& state, const char* net) {
948 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4, 5, 5, 2, 1);
949 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_5x4(benchmark::State & state,const char * net)950 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_5x4(benchmark::State& state, const char* net) {
951 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4, 5, 5, 2, 1);
952 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)953 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
954 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2, 5, 5, 2, 1);
955 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)956 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
957 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3, 5, 5, 2, 1);
958 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)959 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
960 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4, 5, 5, 2, 1);
961 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State & state,const char * net)962 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State& state, const char* net) {
963 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc5, 5, 5, 2, 1);
964 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)965 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
966 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc2, 5, 5, 2, 1);
967 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State & state,const char * net)968 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State& state, const char* net) {
969 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4_acc3, 5, 5, 2, 1);
970 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State & state,const char * net)971 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State& state, const char* net) {
972 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2, 5, 5, 2, 1);
973 }
dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2(benchmark::State & state,const char * net)974 static void dwconv2d_chw_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2(benchmark::State& state, const char* net) {
975 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2, 5, 5, 2, 1);
976 }
977
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)978 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
979 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4, 5, 5, 2, 1);
980 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)981 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
982 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4, 5, 5, 2, 1);
983 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)984 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
985 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4, 5, 5, 2, 1);
986 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4(benchmark::State & state,const char * net)987 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4(benchmark::State& state, const char* net) {
988 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4, 5, 5, 2, 1);
989 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_5x4(benchmark::State & state,const char * net)990 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_5x4(benchmark::State& state, const char* net) {
991 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4, 5, 5, 2, 1);
992 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)993 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
994 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2, 5, 5, 2, 1);
995 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)996 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
997 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3, 5, 5, 2, 1);
998 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)999 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
1000 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4, 5, 5, 2, 1);
1001 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5(benchmark::State & state,const char * net)1002 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5(benchmark::State& state, const char* net) {
1003 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5, 5, 5, 2, 1);
1004 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State & state,const char * net)1005 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
1006 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc2, 5, 5, 2, 1);
1007 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3(benchmark::State & state,const char * net)1008 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3(benchmark::State& state, const char* net) {
1009 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4_acc3, 5, 5, 2, 1);
1010 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2(benchmark::State & state,const char * net)1011 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2(benchmark::State& state, const char* net) {
1012 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2, 5, 5, 2, 1);
1013 }
dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2(benchmark::State & state,const char * net)1014 static void dwconv2d_chw_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2(benchmark::State& state, const char* net) {
1015 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2, 5, 5, 2, 1);
1016 }
1017
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4(benchmark::State & state,const char * net)1018 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4(benchmark::State& state, const char* net) {
1019 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4, 5, 5, 2, 1);
1020 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4(benchmark::State & state,const char * net)1021 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4(benchmark::State& state, const char* net) {
1022 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4, 5, 5, 2, 1);
1023 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4(benchmark::State & state,const char * net)1024 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4(benchmark::State& state, const char* net) {
1025 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4, 5, 5, 2, 1);
1026 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4(benchmark::State & state,const char * net)1027 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4(benchmark::State& state, const char* net) {
1028 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4, 5, 5, 2, 1);
1029 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_5x4(benchmark::State & state,const char * net)1030 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_5x4(benchmark::State& state, const char* net) {
1031 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4, 5, 5, 2, 1);
1032 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc2(benchmark::State & state,const char * net)1033 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc2(benchmark::State& state, const char* net) {
1034 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2, 5, 5, 2, 1);
1035 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc3(benchmark::State & state,const char * net)1036 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc3(benchmark::State& state, const char* net) {
1037 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3, 5, 5, 2, 1);
1038 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc4(benchmark::State & state,const char * net)1039 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc4(benchmark::State& state, const char* net) {
1040 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4, 5, 5, 2, 1);
1041 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc5(benchmark::State & state,const char * net)1042 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_1x4_acc5(benchmark::State& state, const char* net) {
1043 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc5, 5, 5, 2, 1);
1044 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc2(benchmark::State & state,const char * net)1045 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc2(benchmark::State& state, const char* net) {
1046 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc2, 5, 5, 2, 1);
1047 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc3(benchmark::State & state,const char * net)1048 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_2x4_acc3(benchmark::State& state, const char* net) {
1049 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4_acc3, 5, 5, 2, 1);
1050 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4_acc2(benchmark::State & state,const char * net)1051 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_3x4_acc2(benchmark::State& state, const char* net) {
1052 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2, 5, 5, 2, 1);
1053 }
dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4_acc2(benchmark::State & state,const char * net)1054 static void dwconv2d_chw_5x5p2__wasmsimd_arm_splat_4x4_acc2(benchmark::State& state, const char* net) {
1055 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2, 5, 5, 2, 1);
1056 }
1057
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4(benchmark::State & state,const char * net)1058 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4(benchmark::State& state, const char* net) {
1059 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4, 5, 5, 2, 1);
1060 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4(benchmark::State & state,const char * net)1061 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4(benchmark::State& state, const char* net) {
1062 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4, 5, 5, 2, 1);
1063 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4(benchmark::State & state,const char * net)1064 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4(benchmark::State& state, const char* net) {
1065 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4, 5, 5, 2, 1);
1066 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4(benchmark::State & state,const char * net)1067 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4(benchmark::State& state, const char* net) {
1068 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4, 5, 5, 2, 1);
1069 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_5x4(benchmark::State & state,const char * net)1070 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_5x4(benchmark::State& state, const char* net) {
1071 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4, 5, 5, 2, 1);
1072 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc2(benchmark::State & state,const char * net)1073 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc2(benchmark::State& state, const char* net) {
1074 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2, 5, 5, 2, 1);
1075 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc3(benchmark::State & state,const char * net)1076 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc3(benchmark::State& state, const char* net) {
1077 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3, 5, 5, 2, 1);
1078 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc4(benchmark::State & state,const char * net)1079 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc4(benchmark::State& state, const char* net) {
1080 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4, 5, 5, 2, 1);
1081 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc5(benchmark::State & state,const char * net)1082 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_1x4_acc5(benchmark::State& state, const char* net) {
1083 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc5, 5, 5, 2, 1);
1084 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc2(benchmark::State & state,const char * net)1085 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc2(benchmark::State& state, const char* net) {
1086 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc2, 5, 5, 2, 1);
1087 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc3(benchmark::State & state,const char * net)1088 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_2x4_acc3(benchmark::State& state, const char* net) {
1089 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4_acc3, 5, 5, 2, 1);
1090 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4_acc2(benchmark::State & state,const char * net)1091 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_3x4_acc2(benchmark::State& state, const char* net) {
1092 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2, 5, 5, 2, 1);
1093 }
dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4_acc2(benchmark::State & state,const char * net)1094 static void dwconv2d_chw_5x5p2__wasmsimd_x86_splat_4x4_acc2(benchmark::State& state, const char* net) {
1095 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2, 5, 5, 2, 1);
1096 }
1097
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4(benchmark::State & state,const char * net)1098 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4(benchmark::State& state, const char* net) {
1099 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4, 5, 5, 2, 2);
1100 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4(benchmark::State & state,const char * net)1101 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4(benchmark::State& state, const char* net) {
1102 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4, 5, 5, 2, 2);
1103 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4(benchmark::State & state,const char * net)1104 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4(benchmark::State& state, const char* net) {
1105 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4, 5, 5, 2, 2);
1106 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State & state,const char * net)1107 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
1108 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc2, 5, 5, 2, 2);
1109 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State & state,const char * net)1110 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
1111 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3, 5, 5, 2, 2);
1112 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State & state,const char * net)1113 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
1114 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc4, 5, 5, 2, 2);
1115 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State & state,const char * net)1116 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc5(benchmark::State& state, const char* net) {
1117 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc5, 5, 5, 2, 2);
1118 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State & state,const char * net)1119 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2(benchmark::State& state, const char* net) {
1120 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2, 5, 5, 2, 2);
1121 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State & state,const char * net)1122 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3(benchmark::State& state, const char* net) {
1123 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3, 5, 5, 2, 2);
1124 }
dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State & state,const char * net)1125 static void dwconv2d_chw_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2(benchmark::State& state, const char* net) {
1126 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2, 5, 5, 2, 2);
1127 }
1128
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4(benchmark::State & state,const char * net)1129 static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4(benchmark::State& state, const char* net) {
1130 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4, 5, 5, 2, 2);
1131 }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4(benchmark::State & state,const char * net)1132 static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_2x4(benchmark::State& state, const char* net) {
1133 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4, 5, 5, 2, 2);
1134 }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4(benchmark::State & state,const char * net)1135 static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_3x4(benchmark::State& state, const char* net) {
1136 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4, 5, 5, 2, 2);
1137 }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State & state,const char * net)1138 static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2(benchmark::State& state, const char* net) {
1139 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc2, 5, 5, 2, 2);
1140 }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State & state,const char * net)1141 static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3(benchmark::State& state, const char* net) {
1142 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3, 5, 5, 2, 2);
1143 }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State & state,const char * net)1144 static void dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc4(benchmark::State& state, const char* net) {
1145 f32_dwconv2d_chw(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc4, 5, 5, 2, 2);
1146 }
dwconv2d_chw_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc5(benchmark::State & state,const char * net)1147