xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 
32 #include "depthwise_implementation_constraints.hpp"
33 
34 // This can only be built if the target/compiler supports FP16 arguments.
35 #if defined(__ARM_FP16_ARGS)
36 
37 #if defined(__aarch64__)
38 #if defined(ARM_COMPUTE_ENABLE_SVE)
39 #include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
40 #include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
41 #include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
42 #include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
43 #include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
44 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
45 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
46 #include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
47 #include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
48 #include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
49 #include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
50 #include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
51 #include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
52 #include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
53 #endif  // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
54 #endif  // defined(__aarch64__)
55 
56 namespace arm_conv {
57 namespace depthwise {
58 
59 namespace
60 {
61   template <class Strategy>
cycle_estimate(const DepthwiseArgs & args,const Nothing &)62   unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
63   {
64     // First-pass: compute the number of output pixels which will be computed.
65     return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
66            arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
67            arm_gemm::iceildiv(
68             (long unsigned) args.input_channels * args.channel_multiplier,
69             arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
70           );
71   }
72 
73 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
74   unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
not_preferred(const DepthwiseArgs &,const Nothing &)75   unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
76   {
77     return std::numeric_limits<unsigned int>::max();
78   }
79 #endif  // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
80 }
81 
82 static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
83 #if defined(__aarch64__)
84 #if defined(ARM_COMPUTE_ENABLE_SVE)
85   {
86     DepthwiseMethod::DEPTHFIRST,
87     "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
88     constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
89                has_no_channel_multiplier,
90                cpu_has_sve),
91     cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon31a555d30202() 92     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
93       auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
94       return new DepthwiseDepthfirst<__fp16>(strat, args);
95     },
96   },
97   {
98     DepthwiseMethod::DEPTHFIRST,
99     "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
100     constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
101                has_no_channel_multiplier,
102                cpu_has_sve),
103     cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon31a555d30302() 104     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
105       auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
106       return new DepthwiseDepthfirst<__fp16>(strat, args);
107     },
108   },
109   {
110     DepthwiseMethod::DEPTHFIRST,
111     "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
112     constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
113               has_no_channel_multiplier,
114               cpu_has_sve),
115     cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon31a555d30402() 116     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
117       auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
118       return new DepthwiseDepthfirst<__fp16>(strat, args);
119     },
120   },
121   {
122     DepthwiseMethod::DEPTHFIRST,
123     "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
124     constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
125                has_no_channel_multiplier,
126                cpu_has_sve),
127     cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon31a555d30502() 128     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
129       auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
130       return new DepthwiseDepthfirst<__fp16>(strat, args);
131     },
132   },
133   {
134     DepthwiseMethod::DEPTHFIRST,
135     "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
136     constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
137                has_no_channel_multiplier,
138                cpu_has_sve),
139     cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon31a555d30602() 140     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
141       auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
142       return new DepthwiseDepthfirst<__fp16>(strat, args);
143     },
144   },
145 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
146 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
147   {
148     DepthwiseMethod::DEPTHFIRST,
149     "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
150     constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
151                has_no_channel_multiplier,
152                cpu_has_fp16),
153     cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon31a555d30702() 154     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
155       auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
156       return new DepthwiseDepthfirst<__fp16>(strat, args);
157     },
158   },
159   {
160     DepthwiseMethod::DEPTHFIRST,
161     "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
162     constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
163                has_no_channel_multiplier,
164                cpu_has_fp16),
165     cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon31a555d30802() 166     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
167       auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
168       return new DepthwiseDepthfirst<__fp16>(strat, args);
169     },
170   },
171   {
172     DepthwiseMethod::DEPTHFIRST,
173     "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
174     constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
175                has_no_channel_multiplier,
176                cpu_has_fp16),
177     cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon31a555d30902() 178     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
179       auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
180       return new DepthwiseDepthfirst<__fp16>(strat, args);
181     },
182   },
183   {
184     DepthwiseMethod::DEPTHFIRST,
185     "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
186     constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
187                has_no_channel_multiplier,
188                cpu_has_fp16),
189     cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon31a555d30a02() 190     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
191       auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
192       return new DepthwiseDepthfirst<__fp16>(strat, args);
193     },
194   },
195   {
196     DepthwiseMethod::DEPTHFIRST,
197     "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
198     constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
199                has_no_channel_multiplier,
200                cpu_has_fp16),
201     cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon31a555d30b02() 202     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
203       auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
204       return new DepthwiseDepthfirst<__fp16>(strat, args);
205     },
206   },
207   {
208     DepthwiseMethod::DEPTHFIRST,
209     "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
210     constraint(has_no_channel_multiplier, cpu_has_fp16),
211     not_preferred,
__anon31a555d30c02() 212     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
213       auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
214       auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args);
215       return new DepthwiseDepthfirstGeneric<__fp16>(strat, args);
216     },
217   },
218   {
219     DepthwiseMethod::DEPTHFIRST,
220     "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
221     constraint(cpu_has_fp16, has_channel_multiplier),
222     nullptr,
__anon31a555d30d02() 223     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
224       auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
225       auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
226       return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args);
227     },
228   },
229 #endif  // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
230 #endif  // defined(__aarch64__)
231   { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
232 };
233 
234 template <>
depthwise_implementation_list()235 const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
236 {
237   return depthwise_fp16_methods;
238 }
239 
// Explicit instantiations of the generic entry points for __fp16 with a
// Nothing output stage: `depthwise` and `get_compatible_kernels`.
// NOTE(review): the template definitions are not visible in this file;
// presumably they come from depthwise_implementation.hpp - confirm.
240 template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
241 template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
242 
243 }  // namespace depthwise
244 }  // namespace arm_conv
245 
246 #endif  // defined(__ARM_FP16_ARGS)
247