1 /*
2 * Copyright (c) 2021-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "arm_gemm_local.hpp"
26
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 #include "depthwise_planar.hpp"
32
33 #include "depthwise_implementation_constraints.hpp"
34
35 #include "interleaves/list.hpp"
36
37 #if defined(__aarch64__)
38 #if defined(ARM_COMPUTE_ENABLE_SVE)
39 #if defined(ARM_COMPUTE_ENABLE_SME2)
40 #include "kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp"
41 #include "kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp"
42 #include "kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp"
43 #include "kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp"
44
45 #include "kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp"
46 #include "kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp"
47 #include "kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp"
48 #include "kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp"
49
50 #include "kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
51 #include "kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
52 #include "kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
53 #include "kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
54 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
55
56 #include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
57 #include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
58 #include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
59 #include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
60 #include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
61 #include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
62 #include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
63 #include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
64 #include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
65 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
66 #include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
67 #include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
68 #include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
69 #include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
70 #include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
71 #include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
72 #include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
73 #include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
74 #include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
75 #endif // defined(__aarch64__)
76
77 namespace arm_conv {
78 namespace depthwise {
79
80 namespace
81 {
82 template <class Strategy>
cycle_estimate(const DepthwiseArgs & args,const Nothing &)83 unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
84 {
85 // First-pass: compute the number of output pixels which will be computed.
86 return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
87 arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
88 arm_gemm::iceildiv(
89 (long unsigned) args.input_channels * args.channel_multiplier,
90 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
91 );
92 }
93
94 template <class Strategy>
planar_cycle_estimate(const DepthwiseArgs & args,const Nothing &)95 unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
96 {
97 // First-pass: compute the number of output pixels which will be computed.
98 return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
99 args.output_cols *
100 arm_gemm::iceildiv(
101 (long unsigned) args.input_channels * args.channel_multiplier,
102 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
103 );
104 }
105
106 #if defined(__aarch64__)
/**
 * Cycle estimate which always reports the largest representable value.
 *
 * Both arguments are ignored; the result is constant.
 */
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
{
  constexpr unsigned int worst_estimate = std::numeric_limits<unsigned int>::max();
  return worst_estimate;
}
111
112 bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
fast_mode_enabled(const DepthwiseArgs & args,const void *)113 bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
114 {
115 return args.fast_mode;
116 }
117 #endif // defined(__aarch64__)
118 }
119
120 static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
121 #if defined(__aarch64__)
122 #if defined(ARM_COMPUTE_ENABLE_SVE)
123 #if defined(ARM_COMPUTE_ENABLE_SME2)
124 {
125 DepthwiseMethod::PLANAR,
126 "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
127 constraint(fast_mode_enabled,
128 cpu_has_sme, cpu_has_sme2,
129 is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
130 has_no_channel_multiplier, no_prime_right_pad),
131 nullptr,
__anon360745110202() 132 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
133 auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
134 return new DepthwisePlanar<float>(strat, args);
135 },
136 },
137 {
138 DepthwiseMethod::PLANAR,
139 "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
140 constraint(fast_mode_enabled,
141 cpu_has_sme, cpu_has_sme2,
142 is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
143 has_no_channel_multiplier, no_prime_right_pad),
144 nullptr,
__anon360745110302() 145 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
146 auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
147 return new DepthwisePlanar<float>(strat, args);
148 },
149 },
150 {
151 DepthwiseMethod::PLANAR,
152 "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
153 constraint(fast_mode_enabled,
154 cpu_has_sme, cpu_has_sme2,
155 is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
156 has_no_channel_multiplier, no_prime_right_pad),
157 nullptr,
__anon360745110402() 158 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
159 auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
160 return new DepthwisePlanar<float>(strat, args);
161 },
162 },
163 {
164 DepthwiseMethod::PLANAR,
165 "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
166 constraint(fast_mode_enabled,
167 cpu_has_sme, cpu_has_sme2,
168 is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
169 has_no_channel_multiplier, no_prime_right_pad),
170 nullptr,
__anon360745110502() 171 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
172 auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
173 return new DepthwisePlanar<float>(strat, args);
174 },
175 },
176
177 {
178 DepthwiseMethod::PLANAR,
179 "sme2_fp32_planar_3x3_s1_4rows_mla_za",
180 constraint(cpu_has_sme, cpu_has_sme2,
181 is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
182 has_no_channel_multiplier, no_prime_right_pad),
__anon360745110602() 183 [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
184 // Heuristic, don't prefer this kernel unless the input plane is greater
185 // than the number of channels.
186 if (args.input_rows * args.input_cols < args.input_channels)
187 return UINT32_MAX;
188
189 return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
190 },
__anon360745110702() 191 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
192 auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
193 return new DepthwisePlanar<float>(strat, args);
194 },
195 },
196 {
197 DepthwiseMethod::PLANAR,
198 "sme2_fp32_planar_3x3_s2_4rows_mla_za",
199 constraint(cpu_has_sme, cpu_has_sme2,
200 is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
201 has_no_channel_multiplier, no_prime_right_pad),
202 planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
__anon360745110802() 203 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
204 auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
205 return new DepthwisePlanar<float>(strat, args);
206 },
207 },
208 {
209 DepthwiseMethod::PLANAR,
210 "sme2_fp32_planar_5x5_s1_4rows_mla_za",
211 constraint(cpu_has_sme, cpu_has_sme2,
212 is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
213 has_no_channel_multiplier, no_prime_right_pad),
214 nullptr,
__anon360745110902() 215 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
216 auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
217 return new DepthwisePlanar<float>(strat, args);
218 },
219 },
220 {
221 DepthwiseMethod::PLANAR,
222 "sme2_fp32_planar_5x5_s2_4rows_mla_za",
223 constraint(cpu_has_sme, cpu_has_sme2,
224 is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
225 has_no_channel_multiplier, no_prime_right_pad),
226 nullptr,
__anon360745110a02() 227 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
228 auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
229 return new DepthwisePlanar<float>(strat, args);
230 },
231 },
232
233 {
234 DepthwiseMethod::DEPTHFIRST,
235 "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
236 constraint(cpu_has_sme, cpu_has_sme2,
237 is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
238 has_no_channel_multiplier),
239 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon360745110b02() 240 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
241 auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
242 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
243 },
244 },
245 {
246 DepthwiseMethod::DEPTHFIRST,
247 "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
248 constraint(cpu_has_sme, cpu_has_sme2,
249 is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
250 has_no_channel_multiplier),
251 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon360745110c02() 252 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
253 auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
254 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
255 },
256 },
257 {
258 DepthwiseMethod::DEPTHFIRST,
259 "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
260 constraint(cpu_has_sme, cpu_has_sme2,
261 is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
262 has_no_channel_multiplier),
263 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon360745110d02() 264 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
265 auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
266 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
267 },
268 },
269 {
270 DepthwiseMethod::DEPTHFIRST,
271 "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
272 constraint(cpu_has_sme, cpu_has_sme2,
273 is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
274 has_no_channel_multiplier),
275 cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon360745110e02() 276 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
277 auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
278 return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
279 },
280 },
281 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
282 {
283 DepthwiseMethod::DEPTHFIRST,
284 "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
285 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
286 has_no_channel_multiplier,
287 cpu_has_sve),
288 cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon360745110f02() 289 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
290 auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
291 return new DepthwiseDepthfirst<float>(strat, args);
292 },
293 },
294 {
295 DepthwiseMethod::DEPTHFIRST,
296 "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
297 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
298 has_no_channel_multiplier,
299 cpu_has_sve),
300 cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon360745111002() 301 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
302 auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
303 return new DepthwiseDepthfirst<float>(strat, args);
304 },
305 },
306 {
307 DepthwiseMethod::DEPTHFIRST,
308 "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
309 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
310 has_no_channel_multiplier,
311 cpu_has_sve),
312 cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon360745111102() 313 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
314 auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
315 return new DepthwiseDepthfirst<float>(strat, args);
316 },
317 },
318 {
319 DepthwiseMethod::DEPTHFIRST,
320 "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
321 constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
322 has_no_channel_multiplier,
323 cpu_has_sve),
324 cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon360745111202() 325 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
326 auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
327 return new DepthwiseDepthfirst<float>(strat, args);
328 },
329 },
330 {
331 DepthwiseMethod::DEPTHFIRST,
332 "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
333 constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
334 has_no_channel_multiplier,
335 cpu_has_sve),
336 cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon360745111302() 337 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
338 auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
339 return new DepthwiseDepthfirst<float>(strat, args);
340 },
341 },
342 {
343 DepthwiseMethod::DEPTHFIRST,
344 "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
345 constraint(has_no_channel_multiplier, cpu_has_sve),
346 not_preferred,
__anon360745111402() 347 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
348 auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
349 auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
350 return new DepthwiseDepthfirstGeneric<float>(strat, args);
351 },
352 },
353 {
354 DepthwiseMethod::DEPTHFIRST,
355 "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
356 constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
357 cpu_has_sve, has_channel_multiplier),
358 nullptr,
__anon360745111502() 359 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
360 auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
361 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
362 },
363 },
364 {
365 DepthwiseMethod::DEPTHFIRST,
366 "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
367 constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
368 cpu_has_sve, has_channel_multiplier),
369 nullptr,
__anon360745111602() 370 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
371 auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
372 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
373 },
374 },
375 {
376 DepthwiseMethod::DEPTHFIRST,
377 "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
378 constraint(cpu_has_sve, has_channel_multiplier),
379 nullptr,
__anon360745111702() 380 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
381 auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
382 auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
383 return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
384 },
385 },
386 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
387 {
388 DepthwiseMethod::DEPTHFIRST,
389 "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
390 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
391 has_no_channel_multiplier),
392 cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon360745111802() 393 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
394 auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
395 return new DepthwiseDepthfirst<float>(strat, args);
396 },
397 },
398 {
399 DepthwiseMethod::DEPTHFIRST,
400 "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
401 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
402 has_no_channel_multiplier),
403 cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon360745111902() 404 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
405 auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
406 return new DepthwiseDepthfirst<float>(strat, args);
407 },
408 },
409 {
410 DepthwiseMethod::DEPTHFIRST,
411 "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
412 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
413 has_no_channel_multiplier),
414 cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon360745111a02() 415 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
416 auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
417 return new DepthwiseDepthfirst<float>(strat, args);
418 },
419 },
420 {
421 DepthwiseMethod::DEPTHFIRST,
422 "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
423 constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
424 has_no_channel_multiplier),
425 cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon360745111b02() 426 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
427 auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
428 return new DepthwiseDepthfirst<float>(strat, args);
429 },
430 },
431 {
432 DepthwiseMethod::DEPTHFIRST,
433 "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
434 constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
435 has_no_channel_multiplier),
436 cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon360745111c02() 437 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
438 auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
439 return new DepthwiseDepthfirst<float>(strat, args);
440 },
441 },
442 {
443 DepthwiseMethod::DEPTHFIRST,
444 "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
445 constraint(has_no_channel_multiplier),
446 not_preferred,
__anon360745111d02() 447 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
448 auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
449 auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
450 return new DepthwiseDepthfirstGeneric<float>(strat, args);
451 },
452 },
453 {
454 DepthwiseMethod::DEPTHFIRST,
455 "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
456 constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
457 has_channel_multiplier),
458 nullptr,
__anon360745111e02() 459 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
460 auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
461 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
462 },
463 },
464 {
465 DepthwiseMethod::DEPTHFIRST,
466 "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
467 constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
468 has_channel_multiplier),
469 nullptr,
__anon360745111f02() 470 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
471 auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
472 return new DepthwiseDepthfirstMultiplier<float>(strat, args);
473 },
474 },
475 {
476 DepthwiseMethod::DEPTHFIRST,
477 "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
478 constraint(has_channel_multiplier),
479 nullptr,
__anon360745112002() 480 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
481 auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
482 auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
483 return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
484 },
485 },
486 #endif // defined(__aarch64__)
487 { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
488 };
489
490 template <>
depthwise_implementation_list()491 const DepthwiseImplementation<float> *depthwise_implementation_list()
492 {
493 return depthwise_fp32_methods;
494 }
495
496 template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
497 template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
498
499 } // namespace depthwise
500 } // namespace arm_conv
501