1 /*
2 * Copyright (c) 2021-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "arm_gemm_local.hpp"
26
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31
32 #include "depthwise_implementation_constraints.hpp"
33
34 // This can only be built if the target/compiler supports FP16 arguments.
35 #if defined(__ARM_FP16_ARGS)
36
37 #if defined(__aarch64__)
38 #if defined(ARM_COMPUTE_ENABLE_SVE)
39 #include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
40 #include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
41 #include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
42 #include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
43 #include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
44 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
45 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
46 #include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
47 #include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
48 #include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
49 #include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
50 #include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
51 #include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
52 #include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
53 #endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
54 #endif // defined(__aarch64__)
55
56 namespace arm_conv {
57 namespace depthwise {
58
59 namespace
60 {
61 template <class Strategy>
cycle_estimate(const DepthwiseArgs & args,const Nothing &)62 unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
63 {
64 // First-pass: compute the number of output pixels which will be computed.
65 return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
66 arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
67 arm_gemm::iceildiv(
68 (long unsigned) args.input_channels * args.channel_multiplier,
69 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
70 );
71 }
72
73 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
// Forward declaration exists only to attach the "unused" attribute: the
// function is referenced solely from table entries that may be compiled out.
__attribute__ ((unused)) unsigned int not_preferred(const DepthwiseArgs &, const Nothing &);

/**
 * Cycle estimate for implementations that should only be chosen as a last
 * resort: always reports the largest value an unsigned int can hold.
 */
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
{
  constexpr unsigned int worst_estimate = std::numeric_limits<unsigned int>::max();
  return worst_estimate;
}
79 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
80 }
81
82 static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
83 #if defined(__aarch64__)
84 #if defined(ARM_COMPUTE_ENABLE_SVE)
85 {
86 DepthwiseMethod::DEPTHFIRST,
87 "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
88 constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
89 has_no_channel_multiplier,
90 cpu_has_sve),
91 cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon31a555d30202() 92 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
93 auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
94 return new DepthwiseDepthfirst<__fp16>(strat, args);
95 },
96 },
97 {
98 DepthwiseMethod::DEPTHFIRST,
99 "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
100 constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
101 has_no_channel_multiplier,
102 cpu_has_sve),
103 cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon31a555d30302() 104 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
105 auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
106 return new DepthwiseDepthfirst<__fp16>(strat, args);
107 },
108 },
109 {
110 DepthwiseMethod::DEPTHFIRST,
111 "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
112 constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
113 has_no_channel_multiplier,
114 cpu_has_sve),
115 cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon31a555d30402() 116 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
117 auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
118 return new DepthwiseDepthfirst<__fp16>(strat, args);
119 },
120 },
121 {
122 DepthwiseMethod::DEPTHFIRST,
123 "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
124 constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
125 has_no_channel_multiplier,
126 cpu_has_sve),
127 cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon31a555d30502() 128 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
129 auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
130 return new DepthwiseDepthfirst<__fp16>(strat, args);
131 },
132 },
133 {
134 DepthwiseMethod::DEPTHFIRST,
135 "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
136 constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
137 has_no_channel_multiplier,
138 cpu_has_sve),
139 cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon31a555d30602() 140 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
141 auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
142 return new DepthwiseDepthfirst<__fp16>(strat, args);
143 },
144 },
145 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
146 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
147 {
148 DepthwiseMethod::DEPTHFIRST,
149 "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
150 constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
151 has_no_channel_multiplier,
152 cpu_has_fp16),
153 cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
__anon31a555d30702() 154 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
155 auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
156 return new DepthwiseDepthfirst<__fp16>(strat, args);
157 },
158 },
159 {
160 DepthwiseMethod::DEPTHFIRST,
161 "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
162 constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
163 has_no_channel_multiplier,
164 cpu_has_fp16),
165 cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
__anon31a555d30802() 166 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
167 auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
168 return new DepthwiseDepthfirst<__fp16>(strat, args);
169 },
170 },
171 {
172 DepthwiseMethod::DEPTHFIRST,
173 "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
174 constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
175 has_no_channel_multiplier,
176 cpu_has_fp16),
177 cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
__anon31a555d30902() 178 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
179 auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
180 return new DepthwiseDepthfirst<__fp16>(strat, args);
181 },
182 },
183 {
184 DepthwiseMethod::DEPTHFIRST,
185 "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
186 constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
187 has_no_channel_multiplier,
188 cpu_has_fp16),
189 cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
__anon31a555d30a02() 190 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
191 auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
192 return new DepthwiseDepthfirst<__fp16>(strat, args);
193 },
194 },
195 {
196 DepthwiseMethod::DEPTHFIRST,
197 "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
198 constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
199 has_no_channel_multiplier,
200 cpu_has_fp16),
201 cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
__anon31a555d30b02() 202 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
203 auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
204 return new DepthwiseDepthfirst<__fp16>(strat, args);
205 },
206 },
207 {
208 DepthwiseMethod::DEPTHFIRST,
209 "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
210 constraint(has_no_channel_multiplier, cpu_has_fp16),
211 not_preferred,
__anon31a555d30c02() 212 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
213 auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
214 auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args);
215 return new DepthwiseDepthfirstGeneric<__fp16>(strat, args);
216 },
217 },
218 {
219 DepthwiseMethod::DEPTHFIRST,
220 "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
221 constraint(cpu_has_fp16, has_channel_multiplier),
222 nullptr,
__anon31a555d30d02() 223 [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
224 auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
225 auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
226 return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args);
227 },
228 },
229 #endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
230 #endif // defined(__aarch64__)
231 { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
232 };
233
234 template <>
depthwise_implementation_list()235 const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
236 {
237 return depthwise_fp16_methods;
238 }
239
240 template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
241 template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
242
243 } // namespace depthwise
244 } // namespace arm_conv
245
246 #endif // defined(__ARM_FP16_ARGS)
247