xref: /aosp_15_r20/external/XNNPACK/bench/convolution.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <algorithm>
10 #include <cfloat>
11 #include <cmath>
12 #include <functional>
13 #include <limits>
14 #include <ostream>
15 #include <random>
16 #include <string>
17 #include <vector>
18 
19 #include <xnnpack.h>
20 
21 #include <benchmark/benchmark.h>
22 #include <fp16.h>
23 #ifdef BENCHMARK_TENSORFLOW_LITE
24 #include "flatbuffers/include/flatbuffers/flatbuffers.h"
25 #include "tensorflow/lite/interpreter.h"
26 #include "tensorflow/lite/kernels/register.h"
27 #include "tensorflow/lite/model.h"
28 #include "tensorflow/lite/schema/schema_generated.h"
29 #include "tensorflow/lite/version.h"
30 #endif  // BENCHMARK_TENSORFLOW_LITE
31 #include "bench/utils.h"
32 
33 #ifndef XNN_NO_QU8_OPERATORS
xnnpack_convolution_qu8(benchmark::State & state,const char * net)34 void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
35   const size_t batch_size = state.range(0);
36   const size_t input_height = state.range(1);
37   const size_t input_width = state.range(2);
38   const size_t kernel_height = state.range(3);
39   const size_t kernel_width = state.range(4);
40   const size_t padding_height = state.range(5);
41   const size_t padding_width = state.range(6);
42   const size_t subsampling = state.range(7);
43   const size_t dilation = state.range(8);
44   const size_t groups = state.range(9);
45   const size_t group_input_channels = state.range(10);
46   const size_t group_output_channels = state.range(11);
47 
48   std::random_device random_device;
49   auto rng = std::mt19937(random_device());
50   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
51   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
52 
53   const size_t output_pixel_stride = groups * group_output_channels;
54   const size_t input_pixel_stride = groups * group_input_channels;
55   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
56   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
57   const size_t padding_left = padding_width / 2;
58   const size_t padding_top = padding_height / 2;
59   const size_t padding_right = padding_width - padding_left;
60   const size_t padding_bottom = padding_height - padding_top;
61   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
62   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
63 
64   std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
65   std::generate(input.begin(), input.end(), std::ref(u8rng));
66   std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
67   std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
68   std::vector<int32_t> bias(groups * group_output_channels);
69   std::generate(bias.begin(), bias.end(), std::ref(i32rng));
70   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
71 
72   xnn_status status = xnn_initialize(nullptr /* allocator */);
73   if (status != xnn_status_success) {
74     state.SkipWithError("failed to initialize XNNPACK");
75     return;
76   }
77 
78   const size_t num_buffers = 1 +
79     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
80       sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
81   std::vector<uint8_t> output(output_elements * num_buffers);
82 
83   std::vector<xnn_operator_t> convolution_operators(num_buffers);
84   for (xnn_operator_t& convolution_op : convolution_operators) {
85     status = xnn_create_convolution2d_nhwc_qu8(
86       padding_top, padding_right, padding_bottom, padding_left,
87       kernel_height, kernel_width,
88       subsampling, subsampling,
89       dilation, dilation,
90       groups, group_input_channels, group_output_channels,
91       input_pixel_stride, output_pixel_stride,
92       127, 0.5f,
93       127, 0.5f,
94       kernel.data(), bias.data(),
95       127, 0.5f, 0, 255,
96       0 /* flags */, NULL, &convolution_op);
97     if (status != xnn_status_success) {
98       state.SkipWithError("failed to create QUINT8 Convolution operator");
99       return;
100     }
101   }
102 
103   for (size_t i = 0; i < convolution_operators.size(); i++) {
104     status = xnn_setup_convolution2d_nhwc_qu8(
105       convolution_operators[i],
106       batch_size, input_height, input_width,
107       input.data(), output.data() + i * output_elements,
108       nullptr /* thread pool */);
109     if (status != xnn_status_success) {
110       state.SkipWithError("failed to setup QUINT8 Convolution operator");
111       return;
112     }
113   }
114 
115   size_t buffer_index = 0;
116   for (auto _ : state) {
117     state.PauseTiming();
118     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
119     buffer_index = (buffer_index + 1) % num_buffers;
120     state.ResumeTiming();
121 
122     status = xnn_run_operator(convolution_operators[buffer_index],
123       nullptr /* thread pool */);
124     if (status != xnn_status_success) {
125       state.SkipWithError("failed to run QUINT8 Convolution operator");
126       return;
127     }
128   }
129 
130   for (xnn_operator_t& convolution_op : convolution_operators) {
131     status = xnn_delete_operator(convolution_op);
132     if (status != xnn_status_success) {
133       state.SkipWithError("failed to delete QUINT8 Convolution operator");
134       return;
135     }
136     convolution_op = nullptr;
137   }
138 
139   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
140   if (cpu_frequency != 0) {
141     state.counters["cpufreq"] = cpu_frequency;
142   }
143 
144   state.counters["OPS"] = benchmark::Counter(
145     uint64_t(state.iterations()) * 2 *
146       batch_size * output_height * output_width *
147       groups * group_input_channels * group_output_channels *
148       kernel_height * kernel_width,
149     benchmark::Counter::kIsRate);
150 }
151 #endif  // XNN_NO_QU8_OPERATORS
152 
153 #ifndef XNN_NO_QS8_OPERATORS
xnnpack_convolution_qs8(benchmark::State & state,const char * net)154 void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
155   const size_t batch_size = state.range(0);
156   const size_t input_height = state.range(1);
157   const size_t input_width = state.range(2);
158   const size_t kernel_height = state.range(3);
159   const size_t kernel_width = state.range(4);
160   const size_t padding_height = state.range(5);
161   const size_t padding_width = state.range(6);
162   const size_t subsampling = state.range(7);
163   const size_t dilation = state.range(8);
164   const size_t groups = state.range(9);
165   const size_t group_input_channels = state.range(10);
166   const size_t group_output_channels = state.range(11);
167 
168   std::random_device random_device;
169   auto rng = std::mt19937(random_device());
170   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
171   auto i8rng = std::bind(
172     std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
173 
174   const size_t output_pixel_stride = groups * group_output_channels;
175   const size_t input_pixel_stride = groups * group_input_channels;
176   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
177   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
178   const size_t padding_left = padding_width / 2;
179   const size_t padding_top = padding_height / 2;
180   const size_t padding_right = padding_width - padding_left;
181   const size_t padding_bottom = padding_height - padding_top;
182   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
183   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
184 
185   std::vector<int8_t> input(batch_size * input_height * input_width * input_pixel_stride);
186   std::generate(input.begin(), input.end(), std::ref(i8rng));
187   std::vector<int8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
188   std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
189   std::vector<int32_t> bias(groups * group_output_channels);
190   std::generate(bias.begin(), bias.end(), std::ref(i32rng));
191   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
192 
193   xnn_status status = xnn_initialize(nullptr /* allocator */);
194   if (status != xnn_status_success) {
195     state.SkipWithError("failed to initialize XNNPACK");
196     return;
197   }
198 
199   const size_t num_buffers = 1 +
200     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
201       sizeof(int8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(int8_t) * output_elements);
202   std::vector<int8_t> output(output_elements * num_buffers);
203 
204   std::vector<xnn_operator_t> convolution_operators(num_buffers);
205   for (xnn_operator_t& convolution_op : convolution_operators) {
206     status = xnn_create_convolution2d_nhwc_qs8(
207       padding_top, padding_right, padding_bottom, padding_left,
208       kernel_height, kernel_width,
209       subsampling, subsampling,
210       dilation, dilation,
211       groups, group_input_channels, group_output_channels,
212       input_pixel_stride, output_pixel_stride,
213       127, 0.5f, 0.5f,
214       kernel.data(), bias.data(),
215       127, 0.5f, -128, 127,
216       0 /* flags */, NULL, &convolution_op);
217     if (status != xnn_status_success) {
218       state.SkipWithError("failed to create QINT8 Convolution operator");
219       return;
220     }
221   }
222 
223   for (size_t i = 0; i < convolution_operators.size(); i++) {
224     status = xnn_setup_convolution2d_nhwc_qs8(
225       convolution_operators[i],
226       batch_size, input_height, input_width,
227       input.data(), output.data() + i * output_elements,
228       nullptr /* thread pool */);
229     if (status != xnn_status_success) {
230       state.SkipWithError("failed to setup QINT8 Convolution operator");
231       return;
232     }
233   }
234 
235   size_t buffer_index = 0;
236   for (auto _ : state) {
237     state.PauseTiming();
238     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
239     buffer_index = (buffer_index + 1) % num_buffers;
240     state.ResumeTiming();
241 
242     status = xnn_run_operator(convolution_operators[buffer_index],
243       nullptr /* thread pool */);
244     if (status != xnn_status_success) {
245       state.SkipWithError("failed to run QINT8 Convolution operator");
246       return;
247     }
248   }
249 
250   for (xnn_operator_t& convolution_op : convolution_operators) {
251     status = xnn_delete_operator(convolution_op);
252     if (status != xnn_status_success) {
253       state.SkipWithError("failed to delete QINT8 Convolution operator");
254       return;
255     }
256     convolution_op = nullptr;
257   }
258 
259   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
260   if (cpu_frequency != 0) {
261     state.counters["cpufreq"] = cpu_frequency;
262   }
263 
264   state.counters["OPS"] = benchmark::Counter(
265     uint64_t(state.iterations()) * 2 *
266       batch_size * output_height * output_width *
267       groups * group_input_channels * group_output_channels *
268       kernel_height * kernel_width,
269     benchmark::Counter::kIsRate);
270 }
271 #endif  // XNN_NO_QS8_OPERATORS
272 
273 #ifndef XNN_NO_F16_OPERATORS
xnnpack_convolution_f16(benchmark::State & state,const char * net)274 void xnnpack_convolution_f16(benchmark::State& state, const char* net) {
275   if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
276     return;
277   }
278   const size_t batch_size = state.range(0);
279   const size_t input_height = state.range(1);
280   const size_t input_width = state.range(2);
281   const size_t kernel_height = state.range(3);
282   const size_t kernel_width = state.range(4);
283   const size_t padding_height = state.range(5);
284   const size_t padding_width = state.range(6);
285   const size_t subsampling = state.range(7);
286   const size_t dilation = state.range(8);
287   const size_t groups = state.range(9);
288   const size_t group_input_channels = state.range(10);
289   const size_t group_output_channels = state.range(11);
290 
291   std::random_device random_device;
292   auto rng = std::mt19937(random_device());
293   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));
294   auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
295 
296   const size_t output_pixel_stride = groups * group_output_channels;
297   const size_t input_pixel_stride = groups * group_input_channels;
298   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
299   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
300   const size_t padding_left = padding_width / 2;
301   const size_t padding_top = padding_height / 2;
302   const size_t padding_right = padding_width - padding_left;
303   const size_t padding_bottom = padding_height - padding_top;
304   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
305   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
306 
307   std::vector<uint16_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
308   std::generate(input.begin(), input.end(), std::ref(f16rng));
309   std::vector<uint16_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
310   std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
311   std::vector<uint16_t> bias(groups * group_output_channels);
312   std::generate(bias.begin(), bias.end(), std::ref(f16rng));
313   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
314 
315   xnn_status status = xnn_initialize(nullptr /* allocator */);
316   if (status != xnn_status_success) {
317     state.SkipWithError("failed to initialize XNNPACK");
318     return;
319   }
320 
321   const size_t num_buffers = 1 +
322     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
323       sizeof(uint16_t) * (kernel.size() + bias.size() + output_elements));
324   std::vector<uint16_t> output(output_elements * num_buffers);
325 
326   std::vector<xnn_operator_t> convolution_operators(num_buffers);
327   for (xnn_operator_t& convolution_op : convolution_operators) {
328     status = xnn_create_convolution2d_nhwc_f16(
329       padding_top, padding_right, padding_bottom, padding_left,
330       kernel_height, kernel_width,
331       subsampling, subsampling,
332       dilation, dilation,
333       groups, group_input_channels, group_output_channels,
334       input_pixel_stride, output_pixel_stride,
335       kernel.data(), bias.data(),
336       -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
337       0 /* flags */, NULL, &convolution_op);
338     if (status != xnn_status_success) {
339       state.SkipWithError("failed to create FP16 Convolution operator");
340       return;
341     }
342   }
343 
344   for (size_t i = 0; i < convolution_operators.size(); i++) {
345     status = xnn_setup_convolution2d_nhwc_f16(
346       convolution_operators[i],
347       batch_size, input_height, input_width,
348       input.data(), output.data() + i * output_elements,
349       nullptr /* thread pool */);
350     if (status != xnn_status_success) {
351       state.SkipWithError("failed to setup FP16 Convolution operator");
352       return;
353     }
354   }
355 
356   size_t buffer_index = 0;
357   for (auto _ : state) {
358     state.PauseTiming();
359     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
360     buffer_index = (buffer_index + 1) % num_buffers;
361     state.ResumeTiming();
362 
363     status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
364     if (status != xnn_status_success) {
365       state.SkipWithError("failed to run FP16 Convolution operator");
366       return;
367     }
368   }
369 
370   for (xnn_operator_t& convolution_op : convolution_operators) {
371     status = xnn_delete_operator(convolution_op);
372     if (status != xnn_status_success) {
373       state.SkipWithError("failed to delete FP16 Convolution operator");
374       return;
375     }
376     convolution_op = nullptr;
377   }
378 
379   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
380   if (cpu_frequency != 0) {
381     state.counters["cpufreq"] = cpu_frequency;
382   }
383 
384   state.counters["FLOPS"] = benchmark::Counter(
385     uint64_t(state.iterations()) * 2 *
386       batch_size * output_height * output_width *
387       groups * group_input_channels * group_output_channels *
388       kernel_height * kernel_width,
389     benchmark::Counter::kIsRate);
390 }
391 #endif  // XNN_NO_F16_OPERATORS
392 
xnnpack_convolution_f32(benchmark::State & state,const char * net)393 void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
394   const size_t batch_size = state.range(0);
395   const size_t input_height = state.range(1);
396   const size_t input_width = state.range(2);
397   const size_t kernel_height = state.range(3);
398   const size_t kernel_width = state.range(4);
399   const size_t padding_height = state.range(5);
400   const size_t padding_width = state.range(6);
401   const size_t subsampling = state.range(7);
402   const size_t dilation = state.range(8);
403   const size_t groups = state.range(9);
404   const size_t group_input_channels = state.range(10);
405   const size_t group_output_channels = state.range(11);
406 
407   std::random_device random_device;
408   auto rng = std::mt19937(random_device());
409   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
410 
411   const size_t output_pixel_stride = groups * group_output_channels;
412   const size_t input_pixel_stride = groups * group_input_channels;
413   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
414   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
415   const size_t padding_left = padding_width / 2;
416   const size_t padding_top = padding_height / 2;
417   const size_t padding_right = padding_width - padding_left;
418   const size_t padding_bottom = padding_height - padding_top;
419   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
420   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
421 
422   std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
423   std::generate(input.begin(), input.end(), std::ref(f32rng));
424   std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
425   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
426   std::vector<float> bias(groups * group_output_channels);
427   std::generate(bias.begin(), bias.end(), std::ref(f32rng));
428   const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
429 
430   xnn_status status = xnn_initialize(nullptr /* allocator */);
431   if (status != xnn_status_success) {
432     state.SkipWithError("failed to initialize XNNPACK");
433     return;
434   }
435 
436   const size_t num_buffers = 1 +
437     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
438       sizeof(float) * (kernel.size() + bias.size() + output_elements));
439   std::vector<float> output(output_elements * num_buffers);
440 
441   std::vector<xnn_operator_t> convolution_operators(num_buffers);
442   for (xnn_operator_t& convolution_op : convolution_operators) {
443     status = xnn_create_convolution2d_nhwc_f32(
444       padding_top, padding_right, padding_bottom, padding_left,
445       kernel_height, kernel_width,
446       subsampling, subsampling,
447       dilation, dilation,
448       groups, group_input_channels, group_output_channels,
449       input_pixel_stride, output_pixel_stride,
450       kernel.data(), bias.data(),
451       -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
452       0 /* flags */, NULL, &convolution_op);
453     if (status != xnn_status_success) {
454       state.SkipWithError("failed to create FP32 Convolution operator");
455       return;
456     }
457   }
458 
459   for (size_t i = 0; i < convolution_operators.size(); i++) {
460     status = xnn_setup_convolution2d_nhwc_f32(
461       convolution_operators[i],
462       batch_size, input_height, input_width,
463       input.data(), output.data() + i * output_elements,
464       nullptr /* thread pool */);
465     if (status != xnn_status_success) {
466       state.SkipWithError("failed to setup FP32 Convolution operator");
467       return;
468     }
469   }
470 
471   size_t buffer_index = 0;
472   for (auto _ : state) {
473     state.PauseTiming();
474     benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
475     buffer_index = (buffer_index + 1) % num_buffers;
476     state.ResumeTiming();
477 
478     status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
479     if (status != xnn_status_success) {
480       state.SkipWithError("failed to run FP32 Convolution operator");
481       return;
482     }
483   }
484 
485   for (xnn_operator_t& convolution_op : convolution_operators) {
486     status = xnn_delete_operator(convolution_op);
487     if (status != xnn_status_success) {
488       state.SkipWithError("failed to delete FP32 Convolution operator");
489       return;
490     }
491     convolution_op = nullptr;
492   }
493 
494   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
495   if (cpu_frequency != 0) {
496     state.counters["cpufreq"] = cpu_frequency;
497   }
498 
499   state.counters["FLOPS"] = benchmark::Counter(
500     uint64_t(state.iterations()) * 2 *
501       batch_size * output_height * output_width *
502       groups * group_input_channels * group_output_channels *
503       kernel_height * kernel_width,
504     benchmark::Counter::kIsRate);
505 }
506 
507 #ifdef BENCHMARK_TENSORFLOW_LITE
// Benchmarks TensorFlow Lite's float32 convolution for comparison with the
// XNNPACK benchmarks above: builds a single-operator CONV_2D (or
// DEPTHWISE_CONV_2D) flatbuffer model in memory and times Invoke() on it.
// Takes the same state.range() arguments as the XNNPACK variants.
tflite_convolution_f32(benchmark::State & state,const char * net)508 void tflite_convolution_f32(benchmark::State& state, const char* net) {
509   const size_t batch_size = state.range(0);
510   const size_t input_height = state.range(1);
511   const size_t input_width = state.range(2);
512   const size_t kernel_height = state.range(3);
513   const size_t kernel_width = state.range(4);
514   const size_t padding_height = state.range(5);
515   const size_t padding_width = state.range(6);
516   const size_t subsampling = state.range(7);
517   const size_t dilation = state.range(8);
518   const size_t groups = state.range(9);
519   const size_t group_input_channels = state.range(10);
520   const size_t group_output_channels = state.range(11);
521 
  // groups != 1 is only expressible in TFLite as a depthwise convolution
  // (one input channel per group); other grouped cases are skipped.
522   bool is_depthwise = false;
523   if (groups != 1) {
524     if (group_input_channels == 1) {
525       is_depthwise = true;
526     } else {
527       state.SkipWithError("grouped convolution is not supported");
528       return;
529     }
530   }
531 
532   std::random_device random_device;
533   auto rng = std::mt19937(random_device());
534   auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
535 
  // Kernel extent once dilation is applied.
536   const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
537   const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
538 
  // TFLite only exposes SAME/VALID padding: map the explicit padding amounts
  // onto one of those, or skip the benchmark if neither matches.
539   tflite::Padding padding = tflite::Padding_VALID;
540   if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {
541     padding = tflite::Padding_SAME;
542   } else if (padding_width == 0 && padding_height == 0) {
543     padding = tflite::Padding_VALID;
544   } else {
545     state.SkipWithError("unsupported padding");
546     return;
547   }
548 
549   const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
550   const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
551 
552   std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
553   std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
554   std::vector<float> bias(groups * group_output_channels);
555   std::generate(bias.begin(), bias.end(), std::ref(f32rng));
556 
  // Assemble the single-op model flatbuffer. Child offsets (op codes,
  // options, buffers, tensors) are created before the tables that refer to
  // them, so the construction order below matters.
557   flatbuffers::FlatBufferBuilder builder;
558   flatbuffers::Offset<tflite::OperatorCode> operator_code =
559       CreateOperatorCode(
560         builder,
561         is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,
562         0);
563 
564   flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(
565       builder,
566       padding,
567       static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
568       tflite::ActivationFunctionType_NONE,
569       static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
570 
571   flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(
572       builder,
573       padding,
574       static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
575       static_cast<int32_t>(group_output_channels),
576       tflite::ActivationFunctionType_NONE,
577       static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));
578 
  // Buffer 0 is the empty sentinel; buffers 1 and 2 hold filter and bias data.
579   flatbuffers::Offset<tflite::Buffer> buffers[3] = {
580     tflite::CreateBuffer(builder, builder.CreateVector({})),
581     tflite::CreateBuffer(builder, builder.CreateVector(
582       reinterpret_cast<const uint8_t*>(kernel.data()),
583       sizeof(float) * kernel.size())),
584     tflite::CreateBuffer(builder, builder.CreateVector(
585       reinterpret_cast<const uint8_t*>(bias.data()),
586       sizeof(float) * bias.size())),
587   };
588 
589   const int32_t input_shape[4] = {
590     static_cast<int32_t>(batch_size),
591     static_cast<int32_t>(input_height),
592     static_cast<int32_t>(input_width),
593     static_cast<int32_t>(groups * group_input_channels)
594   };
595   const int32_t output_shape[4] = {
596     static_cast<int32_t>(batch_size),
597     static_cast<int32_t>(output_height),
598     static_cast<int32_t>(output_width),
599     static_cast<int32_t>(groups * group_output_channels)
600   };
601   const int32_t filter_shape[4] = {
602     static_cast<int32_t>(group_output_channels),
603     static_cast<int32_t>(kernel_height),
604     static_cast<int32_t>(kernel_width),
605     static_cast<int32_t>(groups * group_input_channels)
606   };
607   const int32_t bias_shape[1] = {
608     static_cast<int32_t>(groups * group_output_channels)
609   };
610 
  // Tensors: 0 = input, 1 = filter, 2 = bias, 3 = output. Input and output
  // reference buffer 0 (no constant data).
611   flatbuffers::Offset<tflite::Tensor> tensors[4] = {
612     tflite::CreateTensor(builder,
613                          builder.CreateVector<int32_t>(input_shape, 4),
614                          tflite::TensorType_FLOAT32,
615                          0 /* buffer id */,
616                          builder.CreateString("input")),
617     tflite::CreateTensor(builder,
618                          builder.CreateVector<int32_t>(filter_shape, 4),
619                          tflite::TensorType_FLOAT32,
620                          1 /* buffer id */,
621                          builder.CreateString("filter")),
622     tflite::CreateTensor(builder,
623                          builder.CreateVector<int32_t>(bias_shape, 1),
624                          tflite::TensorType_FLOAT32,
625                          2 /* buffer id */,
626                          builder.CreateString("bias")),
627     tflite::CreateTensor(builder,
628                          builder.CreateVector<int32_t>(output_shape, 4),
629                          tflite::TensorType_FLOAT32,
630                          0 /* buffer id */,
631                          builder.CreateString("output")),
632   };
633 
634   const int32_t op_inputs[3] = { 0, 1, 2 };
635   const int32_t op_outputs[1] = { 3 };
636   flatbuffers::Offset<tflite::Operator> op = CreateOperator(
637       builder,
638       0 /* opcode_index */,
639       builder.CreateVector<int32_t>(op_inputs, 3),
640       builder.CreateVector<int32_t>(op_outputs, 1),
641       is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,
642       is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),
643       /*custom_options */ 0,
644       tflite::CustomOptionsFormat_FLEXBUFFERS);
645 
646   const int32_t graph_inputs[1] = { 0 };
647   const int32_t graph_outputs[1] = { 3 };
648   flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
649       builder,
650       builder.CreateVector(tensors, 4),
651       builder.CreateVector<int32_t>(graph_inputs, 1),
652       builder.CreateVector<int32_t>(graph_outputs, 1),
653       builder.CreateVector(&op, 1),
654       builder.CreateString("Conv2D subgraph"));
655 
656   flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");
657 
658   flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
659       TFLITE_SCHEMA_VERSION,
660       builder.CreateVector(&operator_code, 1),
661       builder.CreateVector(&subgraph, 1),
662       description,
663       builder.CreateVector(buffers, 3));
664 
665   builder.Finish(model_buffer);
666 
  // Single-threaded interpreter; the resolver name indicates default
  // delegates are excluded, so the builtin kernels are what gets measured.
667   const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
668   tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
669   tflite::InterpreterBuilder interpreterBuilder(model, resolver);
670   std::unique_ptr<tflite::Interpreter> interpreter;
671   if (interpreterBuilder(&interpreter) != kTfLiteOk) {
672     state.SkipWithError("failed to create TFLite interpreter");
673     return;
674   }
675   if (interpreter == nullptr) {
676     state.SkipWithError("TFLite interpreter is null");
677     return;
678   }
679   interpreter->SetNumThreads(1);
680 
681   if (interpreter->AllocateTensors() != kTfLiteOk) {
682     state.SkipWithError("failed to allocate tensors");
683     return;
684   }
685 
  // Fill the input tensor (tensor index 0) with random data once, up front.
686   std::generate(
687     interpreter->typed_tensor<float>(0),
688     interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,
689     std::ref(f32rng));
690 
  // Wipe the caches before each measured Invoke() so runs start cold, then
  // re-warm only the input tensor.
691   for (auto _ : state) {
692     state.PauseTiming();
693     benchmark::utils::WipeCache();
694     benchmark::utils::PrefetchToL1(
695       interpreter->typed_tensor<float>(0),
696       batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
697     state.ResumeTiming();
698 
699     if (interpreter->Invoke() != kTfLiteOk) {
700       state.SkipWithError("failed to invoke TFLite interpreter");
701       return;
702     }
703   }
704 
705   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
706   if (cpu_frequency != 0) {
707     state.counters["cpufreq"] = cpu_frequency;
708   }
709 
  // 2 FLOPs (multiply + add) per MAC.
710   state.counters["FLOPS"] = benchmark::Counter(
711     uint64_t(state.iterations()) * 2 *
712       batch_size * output_height * output_width *
713       groups * group_input_channels * group_output_channels *
714       kernel_height * kernel_width,
715     benchmark::Counter::kIsRate);
716 
717   interpreter.reset();
718 }
719 #endif  // BENCHMARK_TENSORFLOW_LITE
720 
721 // ShuffleNet v1 with 1 group.
ShuffleNetV1G1(benchmark::internal::Benchmark * b)722 static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
723   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
724 
725   /*************************** Conv 1 **************************/
726   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
727   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
728   /******************* Stage 2: stride-2 unit ******************/
729   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
730   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   36});
731   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  36,    1,    1});
732   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   36,  120});
733   /******************* Stage 2: stride-1 units *****************/
734   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
735   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   36});
736   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  36,    1,    1});
737   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   36,  144});
738   /******************* Stage 3: stride-2 unit ******************/
739   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
740   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   72});
741   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  72,    1,    1});
742   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   72,  144});
743   /******************* Stage 3: stride-1 units *****************/
744   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
745   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  288,   72});
746   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  72,    1,    1});
747   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   72,  288});
748   /******************* Stage 4: stride-2 unit ******************/
749   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
750   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  288,  144});
751   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 144,    1,    1});
752   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  144,  288});
753   /******************* Stage 4: stride-1 units *****************/
754   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
755   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,  144});
756   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 144,    1,    1});
757   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  144,  576});
758 }
759 
760 // ShuffleNet v1 with 2 groups.
ShuffleNetV1G2(benchmark::internal::Benchmark * b)761 static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
762   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
763 
764   /*************************** Conv 1 **************************/
765   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
766   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
767   /******************* Stage 2: stride-2 unit ******************/
768   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
769   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   50});
770   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  50,    1,    1});
771   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,   25,   88});
772   /******************* Stage 2: stride-1 units *****************/
773   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
774   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,  100,   25});
775   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  50,    1,    1});
776   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,   25,  100});
777   /******************* Stage 3: stride-2 unit ******************/
778   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
779   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   2,  100,   50});
780   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 100,    1,    1});
781   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,   50,  100});
782   /******************* Stage 3: stride-1 units *****************/
783   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
784   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,  200,   50});
785   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 100,    1,    1});
786   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,   50,  200});
787   /******************* Stage 4: stride-2 unit ******************/
788   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
789   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   2,  200,  100});
790   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 200,    1,    1});
791   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  100,  200});
792   /******************* Stage 4: stride-1 units *****************/
793   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
794   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  400,  100});
795   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 200,    1,    1});
796   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   2,  100,  400});
797 }
798 
799 // ShuffleNet v1 with 3 groups.
ShuffleNetV1G3(benchmark::internal::Benchmark * b)800 static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
801   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
802 
803   /*************************** Conv 1 **************************/
804   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
805   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
806   /******************* Stage 2: stride-2 unit ******************/
807   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
808   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   60});
809   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  60,    1,    1});
810   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   20,   72});
811   /******************* Stage 2: stride-1 units *****************/
812   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
813   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   80,   20});
814   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  60,    1,    1});
815   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   20,   80});
816   /******************* Stage 3: stride-2 unit ******************/
817   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
818   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   3,   80,   40});
819   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 120,    1,    1});
820   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,   40,   80});
821   /******************* Stage 3: stride-1 units *****************/
822   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
823   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,  160,   40});
824   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 120,    1,    1});
825   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,   40,  160});
826   /******************* Stage 4: stride-2 unit ******************/
827   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
828   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   3,  160,   80});
829   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 240,    1,    1});
830   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,   80,  160});
831   /******************* Stage 4: stride-1 units *****************/
832   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
833   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,  320,   80});
834   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 240,    1,    1});
835   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   3,   80,  320});
836 }
837 
838 // ShuffleNet v1 with 4 groups.
ShuffleNetV1G4(benchmark::internal::Benchmark * b)839 static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
840   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
841 
842   /*************************** Conv 1 **************************/
843   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
844   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
845   /******************* Stage 2: stride-2 unit ******************/
846   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
847   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   68});
848   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  68,    1,    1});
849   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   17,   62});
850   /******************* Stage 2: stride-1 units *****************/
851   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
852   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   68,   17});
853   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  68,    1,    1});
854   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   17,   68});
855   /******************* Stage 3: stride-2 unit ******************/
856   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
857   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   4,   68,   34});
858   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 136,    1,    1});
859   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,   34,   68});
860   /******************* Stage 3: stride-1 units *****************/
861   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
862   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,  136,   34});
863   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 136,    1,    1});
864   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,   34,  136});
865   /******************* Stage 4: stride-2 unit ******************/
866   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
867   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   4,  136,   68});
868   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 272,    1,    1});
869   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,   68,  136});
870   /******************* Stage 4: stride-1 units *****************/
871   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
872   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,  272,   68});
873   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 272,    1,    1});
874   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   4,   68,  272});
875 }
876 
877 // ShuffleNet v1 with 8 groups.
ShuffleNetV1G8(benchmark::internal::Benchmark * b)878 static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
879   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
880 
881   /*************************** Conv 1 **************************/
882   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
883   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
884   /******************* Stage 2: stride-2 unit ******************/
885   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
886   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   96});
887   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  96,    1,    1});
888   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   12,   45});
889   /******************* Stage 2: stride-1 units *****************/
890   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
891   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   48,   12});
892   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  96,    1,    1});
893   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   12,   48});
894   /******************* Stage 3: stride-2 unit ******************/
895   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
896   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   8,   48,   24});
897   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 192,    1,    1});
898   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   24,   48});
899   /******************* Stage 3: stride-1 units *****************/
900   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
901   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   96,   24});
902   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 192,    1,    1});
903   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   24,   96});
904   /******************* Stage 4: stride-2 unit ******************/
905   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
906   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   8,   96,   48});
907   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 384,    1,    1});
908   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,   48,   96});
909   /******************* Stage 4: stride-1 units *****************/
910   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
911   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,  192,   48});
912   b->Args({1,   7,   7,  3,  3,  2,  2, 2, 1, 384,    1,    1});
913   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   8,   48,  192});
914 }
915 
916 // ShuffleNet v2 (0.5X scale)
ShuffleNetV2X05(benchmark::internal::Benchmark * b)917 static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
918   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
919 
920   /*************************** Conv 1 **************************/
921   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
922   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
923   /************************** Stage 2 **************************/
924   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
925   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
926   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   24});
927   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   24});
928   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  24,    1,    1});
929   /************************** Stage 3 **************************/
930   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
931   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  48,    1,    1});
932   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,   48});
933   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   48,   48});
934   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1,  48,    1,    1});
935   /************************** Stage 4 **************************/
936   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
937   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  96,    1,    1});
938   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,   96});
939   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,   96});
940   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1,  96,    1,    1});
941   /*************************** Conv 5 **************************/
942   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
943   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  192, 1024});
944 }
945 
946 // ShuffleNet v2 (1.0X scale)
ShuffleNetV2X10(benchmark::internal::Benchmark * b)947 static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
948   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
949 
950   /*************************** Conv 1 **************************/
951   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
952   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
953   /************************** Stage 2 **************************/
954   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
955   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
956   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   58});
957   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   58});
958   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  58,    1,    1});
959   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   58,   58});
960   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  58,    1,    1});
961   /************************** Stage 3 **************************/
962   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
963   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 116,    1,    1});
964   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  116,  116});
965   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  116,  116});
966   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 116,    1,    1});
967   /************************** Stage 4 **************************/
968   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
969   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 232,    1,    1});
970   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  232,  232});
971   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  232,  232});
972   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 232,    1,    1});
973   /*************************** Conv 5 **************************/
974   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
975   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  464, 1024});
976 }
977 
978 // ShuffleNet v2 (1.5X scale)
ShuffleNetV2X15(benchmark::internal::Benchmark * b)979 static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
980   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
981 
982   /*************************** Conv 1 **************************/
983   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
984   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
985   /************************** Stage 2 **************************/
986   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
987   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
988   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   88});
989   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   88});
990   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  88,    1,    1});
991   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   88,   88});
992   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  88,    1,    1});
993   /************************** Stage 3 **************************/
994   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
995   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 176,    1,    1});
996   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  176,  176});
997   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  176,  176});
998   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 176,    1,    1});
999   /************************** Stage 4 **************************/
1000   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1001   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 352,    1,    1});
1002   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  352,  352});
1003   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  352,  352});
1004   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 352,    1,    1});
1005   /*************************** Conv 5 **************************/
1006   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1007   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  704, 1024});
1008 }
1009 
1010 // ShuffleNet v2 (2.0X scale)
ShuffleNetV2X20(benchmark::internal::Benchmark * b)1011 static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
1012   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1013 
1014   /*************************** Conv 1 **************************/
1015   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1016   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   24});
1017   /************************** Stage 2 **************************/
1018   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1019   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  24,    1,    1});
1020   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,  122});
1021   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  122});
1022   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 122,    1,    1});
1023   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  122,  122});
1024   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 122,    1,    1});
1025   /************************** Stage 3 **************************/
1026   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1027   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 244,    1,    1});
1028   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  244,  244});
1029   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  244,  244});
1030   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 244,    1,    1});
1031   /************************** Stage 4 **************************/
1032   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1033   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 488,    1,    1});
1034   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  488,  488});
1035   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  488,  488});
1036   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 488,    1,    1});
1037   /*************************** Conv 5 **************************/
1038   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1039   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  976, 2048});
1040 }
1041 
MobileNetV1(benchmark::internal::Benchmark * b)1042 static void MobileNetV1(benchmark::internal::Benchmark* b) {
1043   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1044 
1045   /*       N   H    W   KH  KW  PH  PW  S  D    G   GCin  GCout */
1046   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,    1,    3,   32});
1047   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,   32,    1,    1});
1048   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,    1,   32,   64});
1049   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,   64,    1,    1});
1050   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,    1,   64,  128});
1051   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1,  128,    1,    1});
1052   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,    1,  128,  128});
1053   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  128,    1,    1});
1054   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,    1,  128,  256});
1055   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  256,    1,    1});
1056   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,    1,  256,  256});
1057   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1,  256,    1,    1});
1058   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,    1,  256,  512});
1059   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1,  512,    1,    1});
1060   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,    1,  512,  512});
1061   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1,  512,    1,    1});
1062   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,    1,  512, 1024});
1063   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1024,    1,    1});
1064   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,    1, 1024, 1024});
1065 }
1066 
MobileNetV2(benchmark::internal::Benchmark * b)1067 static void MobileNetV2(benchmark::internal::Benchmark* b) {
1068   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1069 
1070   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1071   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   32});
1072 
1073   /************************ Bottleneck 1 ***********************/
1074   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1075   b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,  32,    1,    1});
1076   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   32,   16});
1077 
1078   /************************ Bottleneck 2 ***********************/
1079   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1080   b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   96});
1081   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  96,    1,    1});
1082   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   96,   24});
1083   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  144});
1084   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 144,    1,    1});
1085   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,  144,   24});
1086 
1087   /************************ Bottleneck 3 ***********************/
1088   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1089 //b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,  144});
1090   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 144,    1,    1});
1091   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  144,   32});
1092   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
1093   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 192,    1,    1});
1094   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  192,   32});
1095 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
1096 //b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 192,    1,    1});
1097 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  192,   32});
1098 
1099   /************************ Bottleneck 4 ***********************/
1100   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1101 //b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   32,  192});
1102   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 192,    1,    1});
1103   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  192,   64});
1104   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1105   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1106   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
1107 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1108 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1109 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
1110 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1111 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1112 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   64});
1113 
1114   /************************ Bottleneck 5 ***********************/
1115   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1116 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   64,  384});
1117 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 384,    1,    1});
1118   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  384,   96});
1119   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1120   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 576,    1,    1});
1121   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1122 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1123 //b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 576,    1,    1});
1124 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1125 
1126   /************************ Bottleneck 6 ***********************/
1127   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1128 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1129   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 576,    1,    1});
1130   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,  160});
1131   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1132   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
1133   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
1134 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1135 //b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
1136 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
1137 
1138   /************************ Bottleneck 7 ***********************/
1139   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1140 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
1141 //b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 960,    1,    1});
1142   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  320});
1143 
1144   /******************** Pre-pooling Conv2D *********************/
1145   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1146   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  320, 1280});
1147   /******************** Post-pooling Conv2D ********************/
1148   /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
1149   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1280, 1000});
1150 }
1151 
MobileNetV3Small(benchmark::internal::Benchmark * b)1152 static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
1153   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1154 
1155   /*********************** Initial Stage ***********************/
1156   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1157   b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   16});
1158   /*********************** Bottleneck 1 ************************/
1159   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1160   b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  16,    1,    1});
1161   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   16,    8});
1162   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,    8,   16});
1163   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   16,   16});
1164   /*********************** Bottleneck 2 ************************/
1165   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1166   b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   16,   72});
1167   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1,  72,    1,    1});
1168   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   72,   24});
1169   /*********************** Bottleneck 3 ************************/
1170   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1171   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   88});
1172   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1,  88,    1,    1});
1173   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   88,   24});
1174   /*********************** Bottleneck 4 ************************/
1175   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1176   b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   24,   96});
1177   b->Args({1,  28,  28,  5,  5,  4,  4, 2, 1,  96,    1,    1});
1178   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   96,   24});
1179   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   24,   96});
1180   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   96,   40});
1181   /*********************** Bottleneck 5 ************************/
1182   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1183   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  240});
1184   b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 240,    1,    1});
1185   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,   64});
1186   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   64,  240});
1187   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   40});
1188   /*********************** Bottleneck 6 ************************/
1189   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1190 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  240});
1191 //b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 240,    1,    1});
1192 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,   64});
1193 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   64,  240});
1194 //b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   40});
1195   /*********************** Bottleneck 7 ************************/
1196   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1197   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   40,  120});
1198   b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 120,    1,    1});
1199   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
1200   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
1201   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  120,   48});
1202   /*********************** Bottleneck 8 ************************/
1203   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1204   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,  144});
1205   b->Args({1,  14,  14,  5,  5,  4,  4, 1, 1, 144,    1,    1});
1206   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,   40});
1207   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   40,  144});
1208   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  144,   48});
1209   /*********************** Bottleneck 9 ************************/
1210   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1211   b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   48,  288});
1212   b->Args({1,  14,  14,  5,  5,  4,  4, 2, 1, 288,    1,    1});
1213   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  288,   72});
1214   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   72,  288});
1215   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  288,   96});
1216   /*********************** Bottleneck 10 ***********************/
1217   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1218   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1219   b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 576,    1,    1});
1220   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576,  144});
1221   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,  576});
1222   b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1223   /*********************** Bottleneck 11 ***********************/
1224   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1225 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1226 //b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 576,    1,    1});
1227 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576,  144});
1228 //b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  144,  576});
1229 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  576,   96});
1230   /************************ Last Stage  ************************/
1231   /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
1232 //b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,   96,  576});
1233   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  576, 1024});
1234   b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1024, 1001});
1235 }
1236 
// Registers the distinct convolution shapes of the MobileNet v3 Large model,
// one b->Args() row per convolution. Columns: batch N, input H x W, kernel
// KH x KW, padding PH x PW, stride S, dilation D, groups G, and per-group
// input/output channels GCin/GCout. Rows with G > 1 and GCin == GCout == 1
// are the depthwise convolutions; the 1x1 rows at H == W == 1 are the
// squeeze-and-excitation convolutions.
// NOTE(review): PH/PW appear to be the *total* padding along each axis
// (e.g. 2 for a "same" 3x3 convolution) — confirm against the harness code
// that consumes state.range(5)/state.range(6).
// Commented-out rows duplicate a shape already registered in an earlier
// bottleneck and are kept only to document the full network structure.
static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*********************** Initial Stage ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1,   1,    3,   16});
  /*********************** Bottleneck 1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1,  16,    1,    1});
  b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   16});
  /*********************** Bottleneck 2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1, 112, 112,  1,  1,  0,  0, 1, 1,   1,   16,   64});
  b->Args({1, 112, 112,  3,  3,  2,  2, 2, 1,  64,    1,    1});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   64,   24});
  /*********************** Bottleneck 3 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   72});
  b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1,  72,    1,    1});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   72,   24});
  /*********************** Bottleneck 4 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1,   1,   24,   72});
  b->Args({1,  56,  56,  5,  5,  4,  4, 2, 1,  72,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   72,   24});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   24,   72});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   72,   40});
  /*********************** Bottleneck 5 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  120});
  b->Args({1,  28,  28,  5,  5,  4,  4, 1, 1, 120,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  120,   40});
  /*********************** Bottleneck 6 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  120});
//b->Args({1,  28,  28,  5,  5,  4,  4, 1, 1, 120,    1,    1});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,   32});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,   32,  120});
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,  120,   40});
  /*********************** Bottleneck 7 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1,   1,   40,  240});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 240,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  240,   80});
  /*********************** Bottleneck 8 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  200});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 200,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  200,   80});
  /*********************** Bottleneck 9 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  184});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 184,    1,    1});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  184,   80});
  /********************** Bottleneck 10 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  184});
//b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 184,    1,    1});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  184,   80});
  /********************** Bottleneck 11 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,   80,  480});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 480,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  480,  120});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  120,  480});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  480,  112});
  /********************** Bottleneck 12 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  112,  672});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 672,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  672,  168});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  168,  672});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  672,  112});
  /********************** Bottleneck 13 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1,   1,  112,  672});
  b->Args({1,  14,  14,  5,  5,  4,  4, 2, 1, 672,    1,    1});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  672,  160});
  /********************** Bottleneck 14 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
  b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 960,    1,    1});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960,  240});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,  960});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
  /********************** Bottleneck 15 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
//b->Args({1,   7,   7,  5,  5,  4,  4, 1, 1, 960,    1,    1});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960,  240});
//b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  240,  960});
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  960,  160});
  /************************ Last Stage  ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D   G   GCin  GCout */
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1,   1,  160,  960});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1,  960, 1280});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1,   1, 1280, 1001});
}
1337 
1338 // SqueezeNet 1.0
SqueezeNetV10(benchmark::internal::Benchmark * b)1339 static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
1340   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1341 
1342   /************************** Conv 1 *************************/
1343   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1344   b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   96});
1345   /************************** Fire 2 *************************/
1346   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1347   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   96,   16});
1348   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1349   b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1350   /************************** Fire 3 *************************/
1351   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1352   b->Args({1,  56,  55,  1,  1,  0,  0, 1, 1, 1,  128,   16});
1353 //b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
1354 //b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
1355   /************************** Fire 4 *************************/
1356   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1357   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   32});
1358   b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1359   b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1360   /************************** Fire 5 *************************/
1361   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1362   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   32});
1363   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
1364   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
1365   /************************** Fire 6 *************************/
1366   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1367   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   48});
1368   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1369   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1370   /************************** Fire 7 *************************/
1371   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1372   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   48});
1373 //b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
1374 //b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
1375   /************************** Fire 8 *************************/
1376   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1377   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   64});
1378   b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1379   b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1380   /************************** Fire 9 *************************/
1381   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1382   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512,   64});
1383   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
1384   b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
1385   /************************* Conv 10 *************************/
1386   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1387   b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512, 1000});
1388 }
1389 
// SqueezeNet 1.1
// Registers the distinct convolution shapes of SqueezeNet 1.1 (the reduced
// variant with a 3x3/64-channel first convolution and earlier downsampling).
// Columns: batch N, input H x W, kernel KH x KW, padding PH x PW, stride S,
// dilation D, groups G, per-group input/output channels GCin/GCout.
// Commented-out rows duplicate a shape registered by an earlier Fire module.
static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************** Conv 1 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 2, 1, 1,    3,   64});
  /************************** Fire 2 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   64,   16});
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
  b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
  /************************** Fire 3 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   16});
//b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
//b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
  /************************** Fire 4 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  128,   32});
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
  b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
  /************************** Fire 5 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   32});
//b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
//b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
  /************************** Fire 6 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  256,   48});
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   48,  192});
  b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   48,  192});
  /************************** Fire 7 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  384,   48});
//b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   48,  192});
//b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   48,  192});
  /************************** Fire 8 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  384,   64});
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
  /************************** Fire 9 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512,   64});
//b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
//b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
  /************************* Conv 10 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512, 1000});
}
1441 
// Registers the distinct convolution shapes of the Inception v3 model
// (299x299 input), one b->Args() row per unique convolution; duplicate
// shapes across Inception modules appear only once. Columns: batch N,
// input H x W, kernel KH x KW, padding PH x PW, stride S, dilation D,
// groups G, per-group input/output channels GCin/GCout. The 1x7/7x1 and
// 1x3/3x1 rows are the factorized asymmetric convolutions of the later
// Inception modules; the final 1x1 row at H == W == 1 is the classifier.
static void InceptionV3(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 299, 299,  3,  3,  0,  0, 2, 1, 1,    3,   32});
  b->Args({1, 149, 149,  3,  3,  0,  0, 1, 1, 1,   32,   32});
  b->Args({1, 147, 147,  3,  3,  2,  2, 1, 1, 1,   32,   64});
  b->Args({1,  73,  73,  1,  1,  0,  0, 1, 1, 1,   64,   80});
  b->Args({1,  73,  73,  3,  3,  0,  0, 1, 1, 1,   80,  192});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   64});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   48});
  b->Args({1,  35,  35,  5,  5,  4,  4, 1, 1, 1,   48,   64});
  b->Args({1,  35,  35,  3,  3,  2,  2, 1, 1, 1,   64,   96});
  b->Args({1,  35,  35,  3,  3,  2,  2, 1, 1, 1,   96,   96});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  192,   32});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  256,   64});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  256,   48});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  288,   64});
  b->Args({1,  35,  35,  1,  1,  0,  0, 1, 1, 1,  288,   48});
  b->Args({1,  35,  35,  3,  3,  0,  0, 2, 1, 1,  288,  384});
  b->Args({1,  35,  35,  3,  3,  0,  0, 2, 1, 1,   96,   96});
  b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  192});
  b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  128});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  128,  128});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  128,  192});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  128,  128});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  128,  192});
  b->Args({1,  17,  17,  1,  1,  0,  0, 1, 1, 1,  768,  160});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  160,  160});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  160,  192});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  160,  160});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  160,  192});
  b->Args({1,  17,  17,  1,  7,  0,  6, 1, 1, 1,  192,  192});
  b->Args({1,  17,  17,  7,  1,  6,  0, 1, 1, 1,  192,  192});
  b->Args({1,  17,  17,  3,  3,  0,  0, 2, 1, 1,  192,  320});
  b->Args({1,  17,  17,  3,  3,  0,  0, 2, 1, 1,  192,  192});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  320});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  384});
  b->Args({1,   8,   8,  1,  3,  0,  2, 1, 1, 1,  384,  384});
  b->Args({1,   8,   8,  3,  1,  2,  0, 1, 1, 1,  384,  384});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  448});
  b->Args({1,   8,   8,  3,  3,  2,  2, 1, 1, 1,  448,  384});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 1280,  192});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  320});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  384});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  448});
  b->Args({1,   8,   8,  1,  1,  0,  0, 1, 1, 1, 2048,  192});
  b->Args({1,   1,   1,  1,  1,  0,  0, 1, 1, 1, 2048, 1001});
}
1491 
ResNet18(benchmark::internal::Benchmark * b)1492 static void ResNet18(benchmark::internal::Benchmark* b) {
1493   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1494 
1495   /************************* Conv 1 *************************/
1496   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1497   b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   64});
1498   /************************ Conv 2.X ************************/
1499   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1500   b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
1501   /************************ Conv 3.X ************************/
1502   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1503   b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 1,   64,  128});
1504   b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  128,  128});
1505   b->Args({1,  56,  56,  1,  1,  0,  0, 2, 1, 1,   64,  128});
1506   /************************ Conv 4.X ************************/
1507   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1508   b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 1,  128,  256});
1509   b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  256,  256});
1510   b->Args({1,  28,  28,  1,  1,  0,  0, 2, 1, 1,  128,  256});
1511   /************************ Conv 5.X ************************/
1512   /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
1513   b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 1,  256,  512});
1514   b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1,  512,  512});
1515   b->Args({1,  14,  14,  1,  1,  0,  0, 2, 1, 1,  256,  512});
1516 }
1517 
// Registers the distinct convolution shapes of the ResNet-50 model: the
// initial 7x7 convolution, then the 1x1-3x3-1x1 bottleneck (and 1x1
// projection-shortcut) convolutions of stages 2-5. For each stage, the
// "X.1" section lists the first (downsampling/projecting) block and the
// "X.X" section the shape that differs in the remaining blocks. Columns:
// batch N, input H x W, kernel KH x KW, padding PH x PW, stride S,
// dilation D, groups G, per-group input/output channels GCin/GCout.
// Commented-out rows duplicate a shape already registered above.
static void ResNet50(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************* Conv 1 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   64});
  /************************ Conv 2.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,   64});
  b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
//b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  /************************ Conv 2.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,   64});
//b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,   64,   64});
//b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  /************************ Conv 3.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,  128});
  b->Args({1,  56,  56,  3,  3,  2,  2, 2, 1, 1,  128,  128});
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  128,  512});
  b->Args({1,  56,  56,  1,  1,  0,  0, 2, 1, 1,  256,  512});
  /************************ Conv 3.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  128});
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  128,  128});
//b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  128,  512});
  /************************ Conv 4.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  256});
  b->Args({1,  28,  28,  3,  3,  2,  2, 2, 1, 1,  256,  256});
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  256, 1024});
  b->Args({1,  28,  28,  1,  1,  0,  0, 2, 1, 1,  512, 1024});
  /************************ Conv 4.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1, 1024,  256});
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  256,  256});
//b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  256, 1024});
  /************************ Conv 5.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1, 1024,  512});
  b->Args({1,  14,  14,  3,  3,  2,  2, 2, 1, 1,  512,  512});
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1,  512, 2048});
  b->Args({1,  14,  14,  1,  1,  0,  0, 2, 1, 1, 1024, 2048});
  /************************ Conv 5.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1, 2048,  512});
  b->Args({1,   7,   7,  3,  3,  2,  2, 1, 1, 1,  512,  512});
//b->Args({1,   7,   7,  1,  1,  0,  0, 1, 1, 1,  512, 2048});
}
1569 
// Registers the distinct convolution shapes of a VGG-style model (stacked
// 3x3 convolutions with five 2x downsampling stages; the 1x1 rows at the
// end of stages 3-5 suggest the VGG-16 "C" configuration — confirm against
// the model this table was derived from). Columns: batch N, input H x W,
// kernel KH x KW, padding PH x PW, stride S, dilation D, groups G,
// per-group input/output channels GCin/GCout.
static void VGG(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************* Conv 1.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 1, 1, 1,    3,   64});
  /************************* Conv 1.2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 224, 224,  3,  3,  2,  2, 1, 1, 1,   64,   64});

  /************************* Conv 2.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1, 1,   64,  128});
  /************************* Conv 2.2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 112, 112,  3,  3,  2,  2, 1, 1, 1,  128,  128});

  /************************* Conv 3.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,  128,  256});
  /************************* Conv 3.2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  56,  56,  3,  3,  2,  2, 1, 1, 1,  256,  256});
  /************************* Conv 3.3 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  56,  56,  1,  1,  0,  0, 1, 1, 1,  256,  256});

  /************************* Conv 4.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  256,  512});
  /************************* Conv 4.2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  28,  28,  3,  3,  2,  2, 1, 1, 1,  512,  512});
  /************************* Conv 4.3 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  28,  28,  1,  1,  0,  0, 1, 1, 1,  512,  512});

  /************************* Conv 5.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  14,  14,  3,  3,  2,  2, 1, 1, 1,  512,  512});
  /************************* Conv 5.3 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1,  14,  14,  1,  1,  0,  0, 1, 1, 1,  512,  512});
}
1614 
1615 // SRCNN (9-1-5)
SRCNN915(benchmark::internal::Benchmark * b)1616 static void SRCNN915(benchmark::internal::Benchmark* b) {
1617   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1618 
1619   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1620   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1621   b->Args({1, 376, 376,  1,  1,  0,  0, 1, 1, 1,   64,   32});
1622   b->Args({1, 376, 376,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1623 }
1624 
1625 // SRCNN (9-3-5)
SRCNN935(benchmark::internal::Benchmark * b)1626 static void SRCNN935(benchmark::internal::Benchmark* b) {
1627   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1628 
1629   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1630   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1631   b->Args({1, 376, 376,  3,  3,  0,  0, 1, 1, 1,   64,   32});
1632   b->Args({1, 374, 374,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1633 }
1634 
1635 // SRCNN (9-5-5)
SRCNN955(benchmark::internal::Benchmark * b)1636 static void SRCNN955(benchmark::internal::Benchmark* b) {
1637   b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1638 
1639   /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
1640   b->Args({1, 384, 384,  9,  9,  0,  0, 1, 1, 1,    1,   64});
1641   b->Args({1, 376, 376,  5,  5,  0,  0, 1, 1, 1,   64,   32});
1642   b->Args({1, 372, 372,  5,  5,  0,  0, 1, 1, 1,   32,    1});
1643 }
1644 
#ifndef XNN_NO_F16_OPERATORS
  // Instantiate the half-precision (F16) convolution benchmark once per model
  // shape table defined above. UseRealTime() reports wall-clock rather than
  // CPU time, so multi-threaded operator execution is measured correctly.
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, vgg, "VGG")->Apply(VGG)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
  BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
#endif  // XNN_NO_F16_OPERATORS
1669 
// FP32 (single-precision) convolution benchmarks. Compiled out when XNNPACK
// is built without F32 operator support. The model set mirrors the other
// datatype sections so per-datatype results are directly comparable.
1670 #ifndef XNN_NO_F32_OPERATORS
1671   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
1672   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
1673   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
1674   BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
1675   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
1676   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
1677   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
1678   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
1679   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
1680   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
1681   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
1682   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
1683   BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
1684   BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
1685   BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
1686   BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
1687   BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
1688   BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
1689   BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
1690   BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
1691   BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
1692   BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
1693 #endif  // XNN_NO_F32_OPERATORS
1694 
// QS8 (signed 8-bit quantized) convolution benchmarks. Compiled out when
// XNNPACK is built without QS8 operator support. Same model set as the
// float sections above, to allow datatype-vs-datatype comparison.
1695 #ifndef XNN_NO_QS8_OPERATORS
1696   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
1697   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
1698   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
1699   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
1700   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
1701   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
1702   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
1703   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
1704   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
1705   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
1706   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
1707   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
1708   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
1709   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
1710   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
1711   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
1712   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
1713   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
1714   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, vgg, "VGG")->Apply(VGG)->UseRealTime();
1715   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
1716   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
1717   BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
1718 #endif  // XNN_NO_QS8_OPERATORS
1719 
// QU8 (unsigned 8-bit quantized) convolution benchmarks. Compiled out when
// XNNPACK is built without QU8 operator support. Same model set as the
// sections above, to allow datatype-vs-datatype comparison.
1720 #ifndef XNN_NO_QU8_OPERATORS
1721   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
1722   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
1723   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
1724   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
1725   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
1726   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
1727   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
1728   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
1729   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
1730   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
1731   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
1732   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
1733   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
1734   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
1735   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
1736   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
1737   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
1738   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
1739   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, vgg, "VGG")->Apply(VGG)->UseRealTime();
1740   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
1741   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
1742   BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
1743 #endif  // XNN_NO_QU8_OPERATORS
1744 
// TensorFlow Lite comparison baselines: the same model/shape generators run
// through the tflite_convolution_f32 harness so XNNPACK FP32 results can be
// compared against TFLite on identical convolution workloads. Only built
// when the benchmark is configured with BENCHMARK_TENSORFLOW_LITE; note
// only the F32 path has a TFLite counterpart here.
1745 #ifdef BENCHMARK_TENSORFLOW_LITE
1746   BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
1747   BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
1748   BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
1749   BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
1750   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
1751   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
1752   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
1753   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
1754   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
1755   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
1756   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
1757   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
1758   BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
1759   BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
1760   BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
1761   BENCHMARK_CAPTURE(tflite_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
1762   BENCHMARK_CAPTURE(tflite_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
1763   BENCHMARK_CAPTURE(tflite_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
1764   BENCHMARK_CAPTURE(tflite_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
1765   BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
1766   BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
1767   BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
1768 #endif  // BENCHMARK_TENSORFLOW_LITE
1769 
// Emit google-benchmark's standard main() unless the build embeds these
// benchmarks into a binary that supplies its own entry point (that build
// defines XNNPACK_BENCHMARK_NO_MAIN to suppress this one).
1770 #ifndef XNNPACK_BENCHMARK_NO_MAIN
1771 BENCHMARK_MAIN();
1772 #endif
1773