// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/conv.h"
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/igemm.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/pack.h>

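// Benchmarks one f32 IGEMM (indirect GEMM) minmax microkernel on a convolution-shaped problem:
// packs the filter with xnn_pack_f32_conv_goki_w, builds the indirection buffer through a stub
// xnn_operator via xnn_indirection_init_conv2d, and rotates among enough copies of the working
// buffers to keep the working set out of the last-level cache between iterations.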
static void f32_igemm(benchmark::State& state,
  xnn_f32_igemm_minmax_ukernel_function igemm,
  uint32_t mr, uint32_t nr, uint32_t kr, uint32_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<float> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<float> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(float));

  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = input_pixel_stride;
  convolution_op.zero_buffer = z.data();
  convolution_op.groups = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size = 1;
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

#if XNN_PLATFORM_JIT
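// JIT variant of the benchmark above: instead of a precompiled microkernel, it takes a code
// generator, emits the IGEMM microkernel into an executable code buffer at runtime, and then
// times the freshly generated kernel with the same setup and buffer rotation.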
static void f32_igemm(benchmark::State& state,
  xnn_jit_igemm_code_generator_function generator,
  size_t mr, size_t nr, size_t kr, size_t sr,
  xnn_init_f32_minmax_params_fn init_params,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t input_height = state.range(0);
  const size_t input_width = state.range(1);
  const size_t kernel_height = state.range(2);
  const size_t kernel_width = state.range(3);
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t padding_height = state.range(4);
  const size_t padding_width = state.range(5);
  const size_t subsampling = state.range(6);
  const size_t dilation = state.range(7);
  const size_t group_input_channels = state.range(8);
  const size_t group_output_channels = state.range(9);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = group_output_channels;
  const size_t input_pixel_stride = group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t output_size = output_height * output_width;

  const size_t mc_stride = benchmark::utils::RoundUp<size_t>(output_size, mr);
  const size_t nc_stride = benchmark::utils::RoundUp<size_t>(group_output_channels, nr);
  const size_t kc_stride = benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

  std::vector<float> a(input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(a.begin(), a.end(), std::ref(f32rng));
  std::vector<float> k(group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(k.begin(), k.end(), std::ref(f32rng));
  std::vector<float> b(group_output_channels);
  std::generate(b.begin(), b.end(), std::ref(f32rng));

  std::vector<float> z(group_input_channels + XNN_EXTRA_BYTES / sizeof(float));

  const size_t w_elements = kernel_size * kc_stride * nc_stride + nc_stride;
  const size_t i_elements = mc_stride * kernel_size;
  const size_t c_elements = output_height * output_width * output_pixel_stride;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (w_elements + c_elements) + sizeof(void*) * i_elements);

  std::vector<float, AlignedAllocator<float, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0.0f);
  xnn_pack_f32_conv_goki_w(
    1 /* groups */, group_output_channels, kernel_size, group_input_channels,
    nr, kr, sr, k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
  }

  std::vector<const float*> i(i_elements * num_buffers);
  xnn_operator convolution_op = { };
  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
  convolution_op.input = a.data();
  convolution_op.input_pixel_stride = input_pixel_stride;
  convolution_op.zero_buffer = z.data();
  convolution_op.groups = 1;
  convolution_op.group_input_channels = group_input_channels;
  convolution_op.batch_size = 1;
  convolution_op.input_height = input_height;
  convolution_op.input_width = input_width;
  convolution_op.output_height = output_height;
  convolution_op.output_width = output_width;
  convolution_op.kernel_height = kernel_height;
  convolution_op.kernel_width = kernel_width;
  convolution_op.stride_height = subsampling;
  convolution_op.stride_width = subsampling;
  convolution_op.dilation_height = dilation;
  convolution_op.dilation_width = dilation;
  convolution_op.padding_top = padding_top;
  convolution_op.padding_left = padding_left;
  xnn_indirection_init_conv2d(&convolution_op, mr, 2 /* log2(sizeof(float)) */);
  for (size_t n = 1; n < num_buffers; n++) {
    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
  }

  std::vector<float> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), std::nanf(""));

  xnn_f32_minmax_params params;
  init_params(&params,
    -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());

  jit_gemm_params jit_params = {
    .f32_minmax = {
      .min = -std::numeric_limits<float>::infinity(),
      .max = +std::numeric_limits<float>::infinity()
    }
  };

  xnn_initialize(/*allocator=*/nullptr);
  xnn_code_buffer code_buffer;
  xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
  generator(&code_buffer,
    mr,
    group_output_channels % nr,
    group_input_channels * sizeof(float),
    kernel_size * mr * sizeof(void *),
    &jit_params);
  xnn_finalize_code_memory(&code_buffer);
  auto igemm = reinterpret_cast<xnn_f32_igemm_minmax_ukernel_function>(code_buffer.start);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < output_size; m += mr) {
      const uint32_t mb = min(output_size - m, mr);
      igemm(
        mb, group_output_channels, group_input_channels * sizeof(float), kernel_size * mr * sizeof(void*),
        i.data() + buffer_index * i_elements + m,
        w.data() + buffer_index * w_elements,
        c.data() + buffer_index * c_elements + m * group_output_channels, group_output_channels * sizeof(float), nr * sizeof(float),
        0, z.data(), &params);
    }
  }
  xnn_release_code_memory(&code_buffer);

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      output_height * output_width *
      group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}
#endif // XNN_PLATFORM_JIT

#if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
static void jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_6x8__aarch64_neonfma_ld128)

#define BENCHMARK_UPTO_MR_IGEMM(name, max_mr, nr)                               \
  static void name(benchmark::State &state, const char *net) {                  \
    f32_igemm(                                                                   \
      state,                                                                     \
      xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,   \
      max_mr, nr, 1, 1, xnn_init_f32_minmax_scalar_params);                      \
  }                                                                              \
  BENCHMARK_CONV(name)
BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8);
BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_2x8__aarch64_neonfma_prfm_cortex_a75, 2, 8);
BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_3x8__aarch64_neonfma_prfm_cortex_a75, 3, 8);
BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8);
BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8);
BENCHMARK_UPTO_MR_IGEMM(jit_f32_igemm_upto6x8_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8);
#undef BENCHMARK_UPTO_MR_IGEMM

#endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_PLATFORM_JIT
static void jit_f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void jit_f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_ld64)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a7)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
BENCHMARK_CONV(jit_f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif // XNN_ARCH_ARM && XNN_PLATFORM_JIT

#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a7(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_prfm_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a7)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_prfm_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
static void f32_igemm_1x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53, 1, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x2__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x2__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x2__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_5x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x12__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53, 4, 12, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a53(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a55(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a73(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x2__neonfma_lane_ld64, 6, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x4__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x4__neonfma_lane_ld64, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}
static void f32_igemm_6x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a53)
BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_1x12__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x2__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x2__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x2__aarch64_neonfma_ld64)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a53)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a55)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld64)
BENCHMARK_CONV(f32_igemm_4x8__aarch64_neonfma_ld128)
BENCHMARK_CONV(f32_igemm_4x12__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_5x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a53)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a53)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a55)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_prfm_cortex_a75)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld64)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ld128)
BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x2__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x2__neon_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x2__neon_lane_ld64, 6, 2, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_lane_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8__neon_dup_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neon_dup_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_dup_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8__neon_dup_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neon_dup_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_dup_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8__neon_dup_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neon_dup_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__neonfma_dup_ld64, 1, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld128, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__neonfma_dup_ld64, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8__neonfma_dup_ld64(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld64, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8__neonfma_dup_ld128(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__neonfma_dup_ld128, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neon, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_4x8s4__neon(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neon, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_6x8s4__neon(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neon, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_8x8s4__neon(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neon, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEON);
}
static void f32_igemm_1x8s4__neonfma(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma, 1, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_4x8s4__neonfma(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__neonfma, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_6x8s4__neonfma(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}
static void f32_igemm_8x8s4__neonfma(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma, 8, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckNEONFMA);
}

BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x2__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x2__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x4__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld128)
BENCHMARK_CONV(f32_igemm_1x8__neon_dup_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neon_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neon_dup_ld128)
BENCHMARK_CONV(f32_igemm_1x8__neonfma_dup_ld64)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld128)
BENCHMARK_CONV(f32_igemm_4x8__neonfma_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld64)
BENCHMARK_CONV(f32_igemm_6x8__neonfma_dup_ld128)

BENCHMARK_CONV(f32_igemm_1x8s4__neon)
BENCHMARK_CONV(f32_igemm_4x8s4__neon)
BENCHMARK_CONV(f32_igemm_6x8s4__neon)
BENCHMARK_CONV(f32_igemm_8x8s4__neon)
BENCHMARK_CONV(f32_igemm_1x8s4__neonfma)
BENCHMARK_CONV(f32_igemm_4x8s4__neonfma)
BENCHMARK_CONV(f32_igemm_6x8s4__neonfma)
BENCHMARK_CONV(f32_igemm_8x8s4__neonfma)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_load1, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8__sse_load1(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_load1, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8__sse_load1(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_load1, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8__sse_load1(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_load1, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8__sse_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__sse_dup, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8__sse_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__sse_dup, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8__sse_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__sse_dup, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8__sse_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__sse_dup, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8s4__sse(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8s4__sse, 1, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8s4__sse(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8s4__sse, 3, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8s4__sse(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__sse, 4, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8s4__sse(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8s4__sse, 5, 8, 1, 4,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8__sse2_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__sse2_dup, 1, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_3x8__sse2_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__sse2_dup, 3, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_4x8__sse2_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__sse2_dup, 4, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}
static void f32_igemm_5x8__sse2_dup(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__sse2_dup, 5, 8, 1, 1,
    xnn_init_f32_minmax_sse_params);
}

static void f32_igemm_1x8__avx_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__avx_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_4x8__avx_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__avx_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_5x8__avx_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_6x8__avx_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}
static void f32_igemm_7x8__avx_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckAVX);
}

static void f32_igemm_1x8__fma3_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x8__fma3_broadcast, 1, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_4x8__fma3_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__fma3_broadcast, 4, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_5x8__fma3_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast, 5, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_6x8__fma3_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast, 6, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_7x8__fma3_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast, 7, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_8x8__fma3_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast, 8, 8, 1, 1,
    xnn_init_f32_minmax_avx_params, benchmark::utils::CheckFMA3);
}

static void f32_igemm_1x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast, 1, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_4x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x16__avx512f_broadcast, 4, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_5x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x16__avx512f_broadcast, 5, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_6x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x16__avx512f_broadcast, 6, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_7x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast, 7, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}
static void f32_igemm_8x16__avx512f_broadcast(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_8x16__avx512f_broadcast, 8, 16, 1, 1,
    xnn_init_f32_minmax_scalar_params, benchmark::utils::CheckFMA3);
}

BENCHMARK_CONV(f32_igemm_1x8__sse_load1)
BENCHMARK_CONV(f32_igemm_3x8__sse_load1)
BENCHMARK_CONV(f32_igemm_4x8__sse_load1)
BENCHMARK_CONV(f32_igemm_5x8__sse_load1)

BENCHMARK_CONV(f32_igemm_1x8__sse_dup)
BENCHMARK_CONV(f32_igemm_3x8__sse_dup)
BENCHMARK_CONV(f32_igemm_4x8__sse_dup)
BENCHMARK_CONV(f32_igemm_5x8__sse_dup)

BENCHMARK_CONV(f32_igemm_1x8s4__sse)
BENCHMARK_CONV(f32_igemm_3x8s4__sse)
BENCHMARK_CONV(f32_igemm_4x8s4__sse)
BENCHMARK_CONV(f32_igemm_5x8s4__sse)

BENCHMARK_CONV(f32_igemm_1x8__sse2_dup)
BENCHMARK_CONV(f32_igemm_3x8__sse2_dup)
BENCHMARK_CONV(f32_igemm_4x8__sse2_dup)
BENCHMARK_CONV(f32_igemm_5x8__sse2_dup)

BENCHMARK_CONV(f32_igemm_1x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_4x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_5x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_6x8__avx_broadcast)
BENCHMARK_CONV(f32_igemm_7x8__avx_broadcast)

BENCHMARK_CONV(f32_igemm_1x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_4x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_5x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_6x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_7x8__fma3_broadcast)
BENCHMARK_CONV(f32_igemm_8x8__fma3_broadcast)

BENCHMARK_CONV(f32_igemm_1x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_4x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_5x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_6x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_7x16__avx512f_broadcast)
BENCHMARK_CONV(f32_igemm_8x16__avx512f_broadcast)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
921
922
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
static void f32_igemm_3x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_arm_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_loadsplat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_x86_loadsplat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_arm_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_arm_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_arm_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_arm_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8__wasmsimd_x86_splat, 3, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat, 4, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_x86_splat, 5, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8__wasmsimd_x86_splat(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8__wasmsimd_x86_splat, 6, 8, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

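// The s4 variants consume a packed-weight layout with a shift-rotate factor of 4,
// hence sr = 4 as the fourth tile parameter in the wrapper calls below.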
static void f32_igemm_3x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm, 3, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_arm, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_arm, 5, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8s4__wasmsimd_arm(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_3x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_x86, 3, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x8s4__wasmsimd_x86, 4, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_5x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_5x8s4__wasmsimd_x86, 5, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_6x8s4__wasmsimd_x86(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86, 6, 8, 1, 4,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_loadsplat)
BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_loadsplat)
BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_arm_splat)
BENCHMARK_CONV(f32_igemm_3x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_4x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_5x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_6x8__wasmsimd_x86_splat)
BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_arm)
BENCHMARK_CONV(f32_igemm_3x8s4__wasmsimd_x86)
BENCHMARK_CONV(f32_igemm_4x8s4__wasmsimd_x86)
BENCHMARK_CONV(f32_igemm_5x8s4__wasmsimd_x86)
BENCHMARK_CONV(f32_igemm_6x8s4__wasmsimd_x86)
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


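// Portable scalar micro-kernels: built on every architecture, need no ISA check, and
// serve as a baseline for the vectorized variants above.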
static void f32_igemm_1x4__scalar(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_1x4__scalar, 1, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_2x4__scalar(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_2x4__scalar, 2, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

static void f32_igemm_4x4__scalar(benchmark::State& state, const char* net) {
  f32_igemm(state, xnn_f32_igemm_minmax_ukernel_4x4__scalar, 4, 4, 1, 1,
    xnn_init_f32_minmax_scalar_params);
}

BENCHMARK_CONV(f32_igemm_1x4__scalar)
BENCHMARK_CONV(f32_igemm_2x4__scalar)
BENCHMARK_CONV(f32_igemm_4x4__scalar)

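// Defining XNNPACK_BENCHMARK_NO_MAIN omits the default benchmark entry point so this
// file can be linked into a combined benchmark binary that provides its own main().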
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif