xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 #include "depthwise_planar.hpp"
32 
33 #include "depthwise_implementation_constraints.hpp"
34 
35 #if defined(__aarch64__)
36 #if defined(ARM_COMPUTE_ENABLE_SVE)
37 #if defined(ARM_COMPUTE_ENABLE_SME2)
38 #include "kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp"
39 #include "kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp"
40 #include "kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp"
41 #include "kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp"
42 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
43 
44 #include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
45 #include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
46 #include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
47 #include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
48 #include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
49 #include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
50 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
51 #include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
52 
53 #include "kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
54 #include "kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
55 #include "kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
56 
57 #include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
58 #include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
59 #include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
60 #include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp"
61 #include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
62 #include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
63 #include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
64 
65 #endif  // defined(__aarch64__)
66 
67 #include <cstdint>
68 
69 using arm_gemm::Requantize32;
70 
71 namespace arm_conv {
72 namespace depthwise {
73 
74 static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
75 #if defined(__aarch64__)
76 #if defined(ARM_COMPUTE_ENABLE_SVE)
77 #if defined(ARM_COMPUTE_ENABLE_SME2)
78   {
79     DepthwiseMethod::PLANAR,
80     "sme2_u8q_planar_3x3_s1_4rows_dot_za",
81     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
82                              is_supported<sme2_u8q_planar_3x3_s1_4rows_dot_za>,
83                              has_no_channel_multiplier,
84                              qp_has_no_left_shift, no_prime_right_pad),
85     nullptr,
__anon3c5a01740102() 86     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
87       auto strat = new sme2_u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
88       return new DepthwisePlanar<uint8_t>(strat, args, qp);
89     },
90   },
91   {
92     DepthwiseMethod::PLANAR,
93     "sme2_u8q_planar_3x3_s2_4rows_dot_za",
94     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
95                              is_supported<sme2_u8q_planar_3x3_s2_4rows_dot_za>,
96                              has_no_channel_multiplier,
97                              qp_has_no_left_shift, no_prime_right_pad),
98     nullptr,
__anon3c5a01740202() 99     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
100       auto strat = new sme2_u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
101       return new DepthwisePlanar<uint8_t>(strat, args, qp);
102     },
103   },
104   {
105     DepthwiseMethod::PLANAR,
106     "sme2_u8q_planar_5x5_s1_4rows_dot_za",
107     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
108                              is_supported<sme2_u8q_planar_5x5_s1_4rows_dot_za>,
109                              has_no_channel_multiplier,
110                              qp_has_no_left_shift, no_prime_right_pad),
111     nullptr,
__anon3c5a01740302() 112     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
113       auto strat = new sme2_u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
114       return new DepthwisePlanar<uint8_t>(strat, args, qp);
115     },
116   },
117   {
118     DepthwiseMethod::PLANAR,
119     "sme2_u8q_planar_5x5_s2_4rows_dot_za",
120     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
121                              is_supported<sme2_u8q_planar_5x5_s2_4rows_dot_za>,
122                              has_no_channel_multiplier,
123                              qp_has_no_left_shift, no_prime_right_pad),
124     nullptr,
__anon3c5a01740402() 125     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
126       auto strat = new sme2_u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
127       return new DepthwisePlanar<uint8_t>(strat, args, qp);
128     },
129   },
130 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
131   {
132     DepthwiseMethod::DEPTHFIRST,
133     "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
134     constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
135                              has_no_channel_multiplier,
136                              qp_has_no_left_shift,
137                              cpu_has_sve2),
138     nullptr,
__anon3c5a01740502() 139     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
140       auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
141       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
142     },
143   },
144   {
145     DepthwiseMethod::DEPTHFIRST,
146     "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
147     constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
148                              has_no_channel_multiplier,
149                              qp_has_no_left_shift,
150                              cpu_has_sve2),
151     nullptr,
__anon3c5a01740602() 152     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
153       auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
154       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
155     },
156   },
157   {
158     DepthwiseMethod::DEPTHFIRST,
159     "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
160     constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
161                              has_no_channel_multiplier,
162                              qp_has_no_left_shift,
163                              cpu_has_sve2),
164     nullptr,
__anon3c5a01740702() 165     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
166       auto strat = new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
167       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
168     },
169   },
170   {
171     DepthwiseMethod::DEPTHFIRST,
172     "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
173     constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
174                              has_no_channel_multiplier,
175                              qp_has_no_left_shift,
176                              cpu_has_sve2),
177     nullptr,
__anon3c5a01740802() 178     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
179       auto strat = new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
180       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
181     },
182   },
183   {
184     DepthwiseMethod::DEPTHFIRST,
185     "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
186     constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
187                              qp_has_no_left_shift,
188                              has_channel_multiplier,
189                              cpu_has_sve2),
190     nullptr,
__anon3c5a01740902() 191     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
192       auto strat = new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
193       return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
194     },
195   },
196   {
197     DepthwiseMethod::DEPTHFIRST,
198     "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
199     constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
200                              qp_has_no_left_shift,
201                              has_channel_multiplier,
202                              cpu_has_sve2),
203     nullptr,
__anon3c5a01740a02() 204     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
205       auto strat = new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
206       return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
207     },
208   },
209 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
210   {
211     DepthwiseMethod::DEPTHFIRST,
212     "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
213     constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
214                              cpu_has_dot_product,
215                              has_no_channel_multiplier,
216                              qp_has_no_left_shift),
217     nullptr,
__anon3c5a01740b02() 218     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
219       auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
220       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
221     },
222   },
223 
224   {
225     DepthwiseMethod::DEPTHFIRST,
226     "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst",
227     constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst>,
228                              has_no_channel_multiplier,
229                              qp_zero_a_offset,
230                              qp_has_no_left_shift),
231     nullptr,
__anon3c5a01740c02() 232     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
233       auto strat = new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
234       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
235     },
236   },
237   {
238     DepthwiseMethod::DEPTHFIRST,
239     "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst",
240     constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst>,
241                              has_no_channel_multiplier,
242                              qp_zero_a_offset,
243                              qp_has_no_left_shift),
244     nullptr,
__anon3c5a01740d02() 245     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
246       auto strat = new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
247       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
248     },
249   },
250   {
251     DepthwiseMethod::DEPTHFIRST,
252     "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst",
253     constraint<Requantize32>(is_supported<a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst>,
254                              has_no_channel_multiplier,
255                              qp_zero_a_offset,
256                              qp_has_no_left_shift),
257     nullptr,
__anon3c5a01740e02() 258     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
259       auto strat = new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
260       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
261     },
262   },
263 
264   {
265     DepthwiseMethod::DEPTHFIRST,
266     "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
267     constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
268                              has_no_channel_multiplier,
269                              qp_has_no_left_shift),
270     nullptr,
__anon3c5a01740f02() 271     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
272       auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
273       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
274     },
275   },
276   {
277     DepthwiseMethod::DEPTHFIRST,
278     "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
279     constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
280                              has_no_channel_multiplier,
281                              qp_has_no_left_shift),
282     nullptr,
__anon3c5a01741002() 283     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
284       auto strat = new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
285       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
286     },
287   },
288   {
289     DepthwiseMethod::DEPTHFIRST,
290     "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
291     constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
292                              has_no_channel_multiplier,
293                              qp_has_no_left_shift),
294     nullptr,
__anon3c5a01741102() 295     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
296       auto strat = new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
297       return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
298     },
299   },
300   {
301     DepthwiseMethod::DEPTHFIRST,
302     "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
303     constraint<Requantize32>(has_no_channel_multiplier),
304     nullptr,
__anon3c5a01741202() 305     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
306       auto kernel = new a64_u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
307       auto strat = new GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3, args);
308       return new DepthwiseDepthfirstGeneric<uint8_t>(strat, args, qp);
309     },
310   },
311   {
312     DepthwiseMethod::DEPTHFIRST,
313     "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
314     constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
315                              cpu_has_dot_product,
316                              has_channel_multiplier,
317                              qp_has_no_left_shift),
318     nullptr,
__anon3c5a01741302() 319     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
320       auto strat = new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
321       return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
322     },
323   },
324   {
325     DepthwiseMethod::DEPTHFIRST,
326     "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
327     constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
328                              cpu_has_dot_product,
329                              has_channel_multiplier,
330                              qp_has_no_left_shift),
331     nullptr,
__anon3c5a01741402() 332     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
333       auto strat = new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
334       return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
335     },
336   },
337   {
338     DepthwiseMethod::DEPTHFIRST,
339     "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
340     constraint<Requantize32>(has_channel_multiplier),
341     nullptr,
__anon3c5a01741502() 342     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
343       auto kern = new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
344       auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t>(kern, args);
345       return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, true>(strat, args, qp);
346     },
347   },
348 
349 #endif  // defined(__aarch64__)
350   { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
351 };
352 
353 template <>
depthwise_implementation_list()354 const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *depthwise_implementation_list()
355 {
356   return depthwise_u8q_methods;
357 }
358 
359 template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
360 template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
361 
362 }  // namespace depthwise
363 }  // namespace arm_conv
364