1 /*
2 * Copyright (c) 2021-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "arm_gemm_local.hpp"
26
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 #include "depthwise_planar.hpp"
32
33 #include "depthwise_implementation_constraints.hpp"
34
35 #if defined(__aarch64__)
36 #if defined(ARM_COMPUTE_ENABLE_SVE)
37 #if defined(ARM_COMPUTE_ENABLE_SME2)
38 #include "kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp"
39 #include "kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp"
40 #include "kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp"
41 #include "kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp"
42 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
43
44 #include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
45 #include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
46 #include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
47 #include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
48 #include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
49 #include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
50 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
51 #include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
52
53 #include "kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
54 #include "kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
55 #include "kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
56
57 #include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
58 #include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
59 #include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
60 #include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp"
61 #include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
62 #include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
63 #include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
64
65 #endif // defined(__aarch64__)
66
67 #include <cstdint>
68
69 using arm_gemm::Requantize32;
70
71 namespace arm_conv {
72 namespace depthwise {
73
74 static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
75 #if defined(__aarch64__)
76 #if defined(ARM_COMPUTE_ENABLE_SVE)
77 #if defined(ARM_COMPUTE_ENABLE_SME2)
78 {
79 DepthwiseMethod::PLANAR,
80 "sme2_u8q_planar_3x3_s1_4rows_dot_za",
81 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
82 is_supported<sme2_u8q_planar_3x3_s1_4rows_dot_za>,
83 has_no_channel_multiplier,
84 qp_has_no_left_shift, no_prime_right_pad),
85 nullptr,
__anon3c5a01740102() 86 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
87 auto strat = new sme2_u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
88 return new DepthwisePlanar<uint8_t>(strat, args, qp);
89 },
90 },
91 {
92 DepthwiseMethod::PLANAR,
93 "sme2_u8q_planar_3x3_s2_4rows_dot_za",
94 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
95 is_supported<sme2_u8q_planar_3x3_s2_4rows_dot_za>,
96 has_no_channel_multiplier,
97 qp_has_no_left_shift, no_prime_right_pad),
98 nullptr,
__anon3c5a01740202() 99 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
100 auto strat = new sme2_u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
101 return new DepthwisePlanar<uint8_t>(strat, args, qp);
102 },
103 },
104 {
105 DepthwiseMethod::PLANAR,
106 "sme2_u8q_planar_5x5_s1_4rows_dot_za",
107 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
108 is_supported<sme2_u8q_planar_5x5_s1_4rows_dot_za>,
109 has_no_channel_multiplier,
110 qp_has_no_left_shift, no_prime_right_pad),
111 nullptr,
__anon3c5a01740302() 112 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
113 auto strat = new sme2_u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
114 return new DepthwisePlanar<uint8_t>(strat, args, qp);
115 },
116 },
117 {
118 DepthwiseMethod::PLANAR,
119 "sme2_u8q_planar_5x5_s2_4rows_dot_za",
120 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
121 is_supported<sme2_u8q_planar_5x5_s2_4rows_dot_za>,
122 has_no_channel_multiplier,
123 qp_has_no_left_shift, no_prime_right_pad),
124 nullptr,
__anon3c5a01740402() 125 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
126 auto strat = new sme2_u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
127 return new DepthwisePlanar<uint8_t>(strat, args, qp);
128 },
129 },
130 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
131 {
132 DepthwiseMethod::DEPTHFIRST,
133 "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
134 constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
135 has_no_channel_multiplier,
136 qp_has_no_left_shift,
137 cpu_has_sve2),
138 nullptr,
__anon3c5a01740502() 139 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
140 auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
141 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
142 },
143 },
144 {
145 DepthwiseMethod::DEPTHFIRST,
146 "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
147 constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
148 has_no_channel_multiplier,
149 qp_has_no_left_shift,
150 cpu_has_sve2),
151 nullptr,
__anon3c5a01740602() 152 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
153 auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
154 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
155 },
156 },
157 {
158 DepthwiseMethod::DEPTHFIRST,
159 "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
160 constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
161 has_no_channel_multiplier,
162 qp_has_no_left_shift,
163 cpu_has_sve2),
164 nullptr,
__anon3c5a01740702() 165 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
166 auto strat = new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
167 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
168 },
169 },
170 {
171 DepthwiseMethod::DEPTHFIRST,
172 "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
173 constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
174 has_no_channel_multiplier,
175 qp_has_no_left_shift,
176 cpu_has_sve2),
177 nullptr,
__anon3c5a01740802() 178 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
179 auto strat = new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
180 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
181 },
182 },
183 {
184 DepthwiseMethod::DEPTHFIRST,
185 "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
186 constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
187 qp_has_no_left_shift,
188 has_channel_multiplier,
189 cpu_has_sve2),
190 nullptr,
__anon3c5a01740902() 191 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
192 auto strat = new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
193 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
194 },
195 },
196 {
197 DepthwiseMethod::DEPTHFIRST,
198 "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
199 constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
200 qp_has_no_left_shift,
201 has_channel_multiplier,
202 cpu_has_sve2),
203 nullptr,
__anon3c5a01740a02() 204 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
205 auto strat = new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
206 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
207 },
208 },
209 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
210 {
211 DepthwiseMethod::DEPTHFIRST,
212 "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
213 constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
214 cpu_has_dot_product,
215 has_no_channel_multiplier,
216 qp_has_no_left_shift),
217 nullptr,
__anon3c5a01740b02() 218 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
219 auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
220 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
221 },
222 },
223
224 {
225 DepthwiseMethod::DEPTHFIRST,
226 "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst",
227 constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst>,
228 has_no_channel_multiplier,
229 qp_zero_a_offset,
230 qp_has_no_left_shift),
231 nullptr,
__anon3c5a01740c02() 232 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
233 auto strat = new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
234 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
235 },
236 },
237 {
238 DepthwiseMethod::DEPTHFIRST,
239 "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst",
240 constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst>,
241 has_no_channel_multiplier,
242 qp_zero_a_offset,
243 qp_has_no_left_shift),
244 nullptr,
__anon3c5a01740d02() 245 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
246 auto strat = new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
247 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
248 },
249 },
250 {
251 DepthwiseMethod::DEPTHFIRST,
252 "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst",
253 constraint<Requantize32>(is_supported<a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst>,
254 has_no_channel_multiplier,
255 qp_zero_a_offset,
256 qp_has_no_left_shift),
257 nullptr,
__anon3c5a01740e02() 258 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
259 auto strat = new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
260 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
261 },
262 },
263
264 {
265 DepthwiseMethod::DEPTHFIRST,
266 "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
267 constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
268 has_no_channel_multiplier,
269 qp_has_no_left_shift),
270 nullptr,
__anon3c5a01740f02() 271 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
272 auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
273 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
274 },
275 },
276 {
277 DepthwiseMethod::DEPTHFIRST,
278 "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
279 constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
280 has_no_channel_multiplier,
281 qp_has_no_left_shift),
282 nullptr,
__anon3c5a01741002() 283 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
284 auto strat = new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
285 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
286 },
287 },
288 {
289 DepthwiseMethod::DEPTHFIRST,
290 "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
291 constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
292 has_no_channel_multiplier,
293 qp_has_no_left_shift),
294 nullptr,
__anon3c5a01741102() 295 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
296 auto strat = new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
297 return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
298 },
299 },
300 {
301 DepthwiseMethod::DEPTHFIRST,
302 "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
303 constraint<Requantize32>(has_no_channel_multiplier),
304 nullptr,
__anon3c5a01741202() 305 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
306 auto kernel = new a64_u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
307 auto strat = new GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3, args);
308 return new DepthwiseDepthfirstGeneric<uint8_t>(strat, args, qp);
309 },
310 },
311 {
312 DepthwiseMethod::DEPTHFIRST,
313 "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
314 constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
315 cpu_has_dot_product,
316 has_channel_multiplier,
317 qp_has_no_left_shift),
318 nullptr,
__anon3c5a01741302() 319 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
320 auto strat = new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
321 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
322 },
323 },
324 {
325 DepthwiseMethod::DEPTHFIRST,
326 "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
327 constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
328 cpu_has_dot_product,
329 has_channel_multiplier,
330 qp_has_no_left_shift),
331 nullptr,
__anon3c5a01741402() 332 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
333 auto strat = new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
334 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
335 },
336 },
337 {
338 DepthwiseMethod::DEPTHFIRST,
339 "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
340 constraint<Requantize32>(has_channel_multiplier),
341 nullptr,
__anon3c5a01741502() 342 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
343 auto kern = new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
344 auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t>(kern, args);
345 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, true>(strat, args, qp);
346 },
347 },
348
349 #endif // defined(__aarch64__)
350 { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
351 };
352
353 template <>
depthwise_implementation_list()354 const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *depthwise_implementation_list()
355 {
356 return depthwise_u8q_methods;
357 }
358
359 template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
360 template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
361
362 } // namespace depthwise
363 } // namespace arm_conv
364