xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 #include "depthwise_planar.hpp"
32 
33 #include "depthwise_implementation_constraints.hpp"
34 
35 #include "interleaves/list.hpp"
36 
37 #if defined(__aarch64__)
38 #if defined(ARM_COMPUTE_ENABLE_SVE)
39 #if defined(ARM_COMPUTE_ENABLE_SME2)
40 #include "kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp"
41 #include "kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp"
42 #include "kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp"
43 #include "kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp"
44 
45 #include "kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp"
46 #include "kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp"
47 #include "kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp"
48 #include "kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp"
49 
50 #include "kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
51 #include "kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
52 #include "kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
53 #include "kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
54 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
55 
56 #include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
57 #include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
58 #include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
59 #include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
60 #include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
61 #include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
62 #include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
63 #include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
64 #include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
65 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
66 #include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
67 #include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
68 #include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
69 #include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
70 #include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
71 #include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
72 #include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
73 #include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
74 #include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
75 #endif  // defined(__aarch64__)
76 
77 namespace arm_conv {
78 namespace depthwise {
79 
80 namespace
81 {
82   template <class Strategy>
cycle_estimate(const DepthwiseArgs & args,const Nothing &)83   unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
84   {
85     // First-pass: compute the number of output pixels which will be computed.
86     return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
87            arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
88            arm_gemm::iceildiv(
89             (long unsigned) args.input_channels * args.channel_multiplier,
90             arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
91           );
92   }
93 
94   template <class Strategy>
planar_cycle_estimate(const DepthwiseArgs & args,const Nothing &)95   unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
96   {
97     // First-pass: compute the number of output pixels which will be computed.
98     return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
99            args.output_cols *
100            arm_gemm::iceildiv(
101             (long unsigned) args.input_channels * args.channel_multiplier,
102             arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
103           );
104   }
105 
106 #if defined(__aarch64__)
not_preferred(const DepthwiseArgs &,const Nothing &)107   unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
108   {
109     return std::numeric_limits<unsigned int>::max();
110   }
111 
112   bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
fast_mode_enabled(const DepthwiseArgs & args,const void *)113   bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
114   {
115     return args.fast_mode;
116   }
117 #endif // defined(__aarch64__)
118 }
119 
120 static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
121 #if defined(__aarch64__)
122 #if defined(ARM_COMPUTE_ENABLE_SVE)
123 #if defined(ARM_COMPUTE_ENABLE_SME2)
124   {
125     DepthwiseMethod::PLANAR,
126     "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
127     constraint(fast_mode_enabled,
128                cpu_has_sme, cpu_has_sme2,
129                is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
130                has_no_channel_multiplier, no_prime_right_pad),
131     nullptr,
__anon360745110202() 132     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
133       auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
134       return new DepthwisePlanar<float>(strat, args);
135     },
136   },
137   {
138     DepthwiseMethod::PLANAR,
139     "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
140     constraint(fast_mode_enabled,
141                cpu_has_sme, cpu_has_sme2,
142                is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
143                has_no_channel_multiplier, no_prime_right_pad),
144     nullptr,
__anon360745110302() 145     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
146       auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
147       return new DepthwisePlanar<float>(strat, args);
148     },
149   },
150   {
151     DepthwiseMethod::PLANAR,
152     "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
153     constraint(fast_mode_enabled,
154                cpu_has_sme, cpu_has_sme2,
155                is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
156                has_no_channel_multiplier, no_prime_right_pad),
157     nullptr,
__anon360745110402() 158     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
159       auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
160       return new DepthwisePlanar<float>(strat, args);
161     },
162   },
163   {
164     DepthwiseMethod::PLANAR,
165     "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
166     constraint(fast_mode_enabled,
167                cpu_has_sme, cpu_has_sme2,
168                is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
169                has_no_channel_multiplier, no_prime_right_pad),
170     nullptr,
__anon360745110502() 171     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
172       auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
173       return new DepthwisePlanar<float>(strat, args);
174     },
175   },
176 
177   {
178     DepthwiseMethod::PLANAR,
179     "sme2_fp32_planar_3x3_s1_4rows_mla_za",
180     constraint(cpu_has_sme, cpu_has_sme2,
181                is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
182                has_no_channel_multiplier, no_prime_right_pad),
__anon360745110602() 183     [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
184       // Heuristic, don't prefer this kernel unless the input plane is greater
185       // than the number of channels.
186       if (args.input_rows * args.input_cols < args.input_channels)
187         return UINT32_MAX;
188 
189       return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
190     },
__anon360745110702() 191     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
192       auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
193       return new DepthwisePlanar<float>(strat, args);
194     },
195   },
196   {
197     DepthwiseMethod::PLANAR,
198     "sme2_fp32_planar_3x3_s2_4rows_mla_za",
199     constraint(cpu_has_sme, cpu_has_sme2,
200                is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
201                has_no_channel_multiplier, no_prime_right_pad),
202     planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
__anon360745110802() 203     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
204       auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
205       return new DepthwisePlanar<float>(strat, args);
206     },
207   },
208   {
209     DepthwiseMethod::PLANAR,
210     "sme2_fp32_planar_5x5_s1_4rows_mla_za",
211     constraint(cpu_has_sme, cpu_has_sme2,
212                is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
213                has_no_channel_multiplier, no_prime_right_pad),
214     nullptr,
__anon360745110902() 215     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
216       auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
217       return new DepthwisePlanar<float>(strat, args);
218     },
219   },
220   {
221     DepthwiseMethod::PLANAR,
222     "sme2_fp32_planar_5x5_s2_4rows_mla_za",
223     constraint(cpu_has_sme, cpu_has_sme2,
224                is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
225                has_no_channel_multiplier, no_prime_right_pad),
226     nullptr,
__anon360745110a02() 227     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
228       auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
229       return new DepthwisePlanar<float>(strat, args);
230     },
231   },
232 
233   {
234     DepthwiseMethod::DEPTHFIRST,
235     "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
236     constraint(cpu_has_sme,  cpu_has_sme2,
237                is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
238                has_no_channel_multiplier),
239     cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon360745110b02() 240     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
241       auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
242       return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
243     },
244   },
245   {
246     DepthwiseMethod::DEPTHFIRST,
247     "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
248     constraint(cpu_has_sme, cpu_has_sme2,
249                is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
250                has_no_channel_multiplier),
251     cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon360745110c02() 252     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
253       auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
254       return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
255     },
256   },
257   {
258     DepthwiseMethod::DEPTHFIRST,
259     "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
260     constraint(cpu_has_sme, cpu_has_sme2,
261                is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
262                has_no_channel_multiplier),
263     cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon360745110d02() 264     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
265       auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
266       return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
267     },
268   },
269   {
270     DepthwiseMethod::DEPTHFIRST,
271     "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
272     constraint(cpu_has_sme, cpu_has_sme2,
273                is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
274                has_no_channel_multiplier),
275     cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon360745110e02() 276     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
277       auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
278       return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
279     },
280   },
281 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
282   {
283     DepthwiseMethod::DEPTHFIRST,
284     "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
285     constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
286                has_no_channel_multiplier,
287                cpu_has_sve),
288     cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon360745110f02() 289     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
290       auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
291       return new DepthwiseDepthfirst<float>(strat, args);
292     },
293   },
294   {
295     DepthwiseMethod::DEPTHFIRST,
296     "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
297     constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
298                has_no_channel_multiplier,
299                cpu_has_sve),
300     cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon360745111002() 301     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
302       auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
303       return new DepthwiseDepthfirst<float>(strat, args);
304     },
305   },
306   {
307     DepthwiseMethod::DEPTHFIRST,
308     "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
309     constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
310               has_no_channel_multiplier,
311               cpu_has_sve),
312     cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon360745111102() 313     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
314       auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
315       return new DepthwiseDepthfirst<float>(strat, args);
316     },
317   },
318   {
319     DepthwiseMethod::DEPTHFIRST,
320     "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
321     constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
322                has_no_channel_multiplier,
323                cpu_has_sve),
324     cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon360745111202() 325     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
326       auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
327       return new DepthwiseDepthfirst<float>(strat, args);
328     },
329   },
330   {
331     DepthwiseMethod::DEPTHFIRST,
332     "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
333     constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
334                has_no_channel_multiplier,
335                cpu_has_sve),
336     cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon360745111302() 337     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
338       auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
339       return new DepthwiseDepthfirst<float>(strat, args);
340     },
341   },
342   {
343     DepthwiseMethod::DEPTHFIRST,
344     "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
345     constraint(has_no_channel_multiplier, cpu_has_sve),
346     not_preferred,
__anon360745111402() 347     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
348       auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
349       auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
350       return new DepthwiseDepthfirstGeneric<float>(strat, args);
351     },
352   },
353   {
354     DepthwiseMethod::DEPTHFIRST,
355     "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
356     constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
357                cpu_has_sve, has_channel_multiplier),
358     nullptr,
__anon360745111502() 359     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
360       auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
361       return new DepthwiseDepthfirstMultiplier<float>(strat, args);
362     },
363   },
364   {
365     DepthwiseMethod::DEPTHFIRST,
366     "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
367     constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
368                cpu_has_sve, has_channel_multiplier),
369     nullptr,
__anon360745111602() 370     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
371       auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
372       return new DepthwiseDepthfirstMultiplier<float>(strat, args);
373     },
374   },
375   {
376     DepthwiseMethod::DEPTHFIRST,
377     "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
378     constraint(cpu_has_sve, has_channel_multiplier),
379     nullptr,
__anon360745111702() 380     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
381       auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
382       auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
383       return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
384     },
385   },
386 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
387   {
388     DepthwiseMethod::DEPTHFIRST,
389     "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
390     constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
391                has_no_channel_multiplier),
392     cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon360745111802() 393     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
394       auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
395       return new DepthwiseDepthfirst<float>(strat, args);
396     },
397   },
398   {
399     DepthwiseMethod::DEPTHFIRST,
400     "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
401     constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
402                has_no_channel_multiplier),
403     cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon360745111902() 404     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
405       auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
406       return new DepthwiseDepthfirst<float>(strat, args);
407     },
408   },
409   {
410     DepthwiseMethod::DEPTHFIRST,
411     "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
412     constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
413                             has_no_channel_multiplier),
414     cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon360745111a02() 415     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
416       auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
417       return new DepthwiseDepthfirst<float>(strat, args);
418     },
419   },
420   {
421     DepthwiseMethod::DEPTHFIRST,
422     "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
423     constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
424                has_no_channel_multiplier),
425     cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon360745111b02() 426     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
427       auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
428       return new DepthwiseDepthfirst<float>(strat, args);
429     },
430   },
431   {
432     DepthwiseMethod::DEPTHFIRST,
433     "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
434     constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
435                has_no_channel_multiplier),
436     cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon360745111c02() 437     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
438       auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
439       return new DepthwiseDepthfirst<float>(strat, args);
440     },
441   },
442   {
443     DepthwiseMethod::DEPTHFIRST,
444     "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
445     constraint(has_no_channel_multiplier),
446     not_preferred,
__anon360745111d02() 447     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
448       auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
449       auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
450       return new DepthwiseDepthfirstGeneric<float>(strat, args);
451     },
452   },
453   {
454     DepthwiseMethod::DEPTHFIRST,
455     "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
456     constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
457                has_channel_multiplier),
458     nullptr,
__anon360745111e02() 459     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
460       auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
461       return new DepthwiseDepthfirstMultiplier<float>(strat, args);
462     },
463   },
464   {
465     DepthwiseMethod::DEPTHFIRST,
466     "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
467     constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
468                has_channel_multiplier),
469     nullptr,
__anon360745111f02() 470     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
471       auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
472       return new DepthwiseDepthfirstMultiplier<float>(strat, args);
473     },
474   },
475   {
476     DepthwiseMethod::DEPTHFIRST,
477     "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
478     constraint(has_channel_multiplier),
479     nullptr,
__anon360745112002() 480     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
481       auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
482       auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
483       return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
484     },
485   },
486 #endif  // defined(__aarch64__)
487   { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
488 };
489 
490 template <>
depthwise_implementation_list()491 const DepthwiseImplementation<float> *depthwise_implementation_list()
492 {
493   return depthwise_fp32_methods;
494 }
495 
496 template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
497 template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
498 
499 }  // namespace depthwise
500 }  // namespace arm_conv
501