xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 #include "depthwise_planar.hpp"
32 
33 #include "depthwise_implementation_constraints.hpp"
34 
35 #if defined(__aarch64__)
36 #if defined(ARM_COMPUTE_ENABLE_SVE)
37 #if defined(ARM_COMPUTE_ENABLE_SME2)
38 #include "kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp"
39 #include "kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp"
40 #include "kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp"
41 #include "kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp"
42 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
43 
44 #include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
45 #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
46 #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
47 #include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
48 #include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
49 #include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
50 #include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
51 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
52 #include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
53 #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
54 #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
55 #include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
56 #include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
57 #include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"
58 #include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
59 #include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
60 #include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
61 #endif  // defined(__aarch64__)
62 
63 #include <cstdint>
64 
65 using arm_gemm::Requantize32;
66 
67 namespace arm_conv {
68 namespace depthwise {
69 
namespace
{
#if defined(__aarch64__)
/**
 * Constraint predicate: accepts a problem only when the quantisation
 * parameters carry a zero `b_offset`.
 *
 * It is listed in the constraints of the "s8qs" kernels in the method table
 * of this file.
 *
 * The first parameter (the depthwise arguments) is unused. It is kept unnamed
 * so the function has the same shape as the other predicates passed to
 * `constraint<Requantize32>(...)`.
 *
 * @param _qp Type-erased pointer to an `arm_gemm::Requantize32`. It is
 *            dereferenced unconditionally, so it must not be null.
 * @return true when `b_offset == 0`.
 *
 * NOTE(review): `b_offset` is presumably the weight zero-point, as the
 * function name suggests - confirm against the Requantize32 definition.
 */
bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
{
  const auto *qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
  return qp->b_offset == 0;
}
#endif // defined(__aarch64__)
}
80 
81 static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
82 #if defined(__aarch64__)
83 #if defined(ARM_COMPUTE_ENABLE_SVE)
84 #if defined(ARM_COMPUTE_ENABLE_SME2)
85   {
86     DepthwiseMethod::PLANAR,
87     "sme2_s8q_planar_3x3_s1_4rows_dot_za",
88     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
89                              is_supported<sme2_s8q_planar_3x3_s1_4rows_dot_za>,
90                              has_no_channel_multiplier,
91                              qp_has_no_left_shift, no_prime_right_pad),
92     nullptr,
__anona26587f20202() 93     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
94       auto strat = new sme2_s8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
95       return new DepthwisePlanar<int8_t>(strat, args, qp);
96     },
97   },
98   {
99     DepthwiseMethod::PLANAR,
100     "sme2_s8q_planar_3x3_s2_4rows_dot_za",
101     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
102                              is_supported<sme2_s8q_planar_3x3_s2_4rows_dot_za>,
103                              has_no_channel_multiplier,
104                              qp_has_no_left_shift, no_prime_right_pad),
105     nullptr,
__anona26587f20302() 106     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
107       auto strat = new sme2_s8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
108       return new DepthwisePlanar<int8_t>(strat, args, qp);
109     },
110   },
111   {
112     DepthwiseMethod::PLANAR,
113     "sme2_s8q_planar_5x5_s1_4rows_dot_za",
114     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
115                              is_supported<sme2_s8q_planar_5x5_s1_4rows_dot_za>,
116                              has_no_channel_multiplier,
117                              qp_has_no_left_shift, no_prime_right_pad),
118     nullptr,
__anona26587f20402() 119     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
120       auto strat = new sme2_s8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
121       return new DepthwisePlanar<int8_t>(strat, args, qp);
122     },
123   },
124   {
125     DepthwiseMethod::PLANAR,
126     "sme2_s8q_planar_5x5_s2_4rows_dot_za",
127     constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
128                              is_supported<sme2_s8q_planar_5x5_s2_4rows_dot_za>,
129                              has_no_channel_multiplier,
130                              qp_has_no_left_shift, no_prime_right_pad),
131     nullptr,
__anona26587f20502() 132     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
133       auto strat = new sme2_s8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
134       return new DepthwisePlanar<int8_t>(strat, args, qp);
135     },
136   },
137 #endif  // defined(ARM_COMPUTE_ENABLE_SME2)
138   {
139     DepthwiseMethod::DEPTHFIRST,
140     "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
141     constraint<Requantize32>(is_supported<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
142                              has_no_channel_multiplier,
143                              qp_has_no_left_shift,
144                              qp_weights_are_symmetric,
145                              cpu_has_sve2),
146     nullptr,
__anona26587f20602() 147     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
148       auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
149       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
150     },
151   },
152   {
153     DepthwiseMethod::DEPTHFIRST,
154     "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
155     constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
156                              has_no_channel_multiplier,
157                              qp_has_no_left_shift,
158                              cpu_has_sve2),
159     nullptr,
__anona26587f20702() 160     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
161       auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
162       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
163     },
164   },
165   {
166     DepthwiseMethod::DEPTHFIRST,
167     "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
168     constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
169                              has_no_channel_multiplier,
170                              qp_has_no_left_shift,
171                              cpu_has_sve2),
172     nullptr,
__anona26587f20802() 173     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
174       auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
175       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
176     },
177   },
178   {
179     DepthwiseMethod::DEPTHFIRST,
180     "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
181     constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
182                              has_no_channel_multiplier,
183                              qp_has_no_left_shift,
184                              cpu_has_sve2),
185     nullptr,
__anona26587f20902() 186     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
187       auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
188       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
189     },
190   },
191   {
192     DepthwiseMethod::DEPTHFIRST,
193     "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
194     constraint<Requantize32>(is_supported<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
195                              has_no_channel_multiplier,
196                              qp_has_no_left_shift,
197                              cpu_has_sve2),
198     nullptr,
__anona26587f20a02() 199     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
200       auto strat = new sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
201       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
202     },
203   },
204   {
205     DepthwiseMethod::DEPTHFIRST,
206     "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
207     constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
208                              qp_has_no_left_shift,
209                              has_channel_multiplier,
210                              cpu_has_sve2),
211     nullptr,
__anona26587f20b02() 212     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
213       auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
214       return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
215     },
216   },
217   {
218     DepthwiseMethod::DEPTHFIRST,
219     "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
220     constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
221                              qp_has_no_left_shift,
222                              has_channel_multiplier,
223                              cpu_has_sve2),
224     nullptr,
__anona26587f20c02() 225     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
226       auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
227       return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
228     },
229   },
230 #endif  // defined(ARM_COMPUTE_ENABLE_SVE)
231   {
232     DepthwiseMethod::DEPTHFIRST,
233     "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
234     constraint<Requantize32>(is_supported<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
235                              has_no_channel_multiplier,
236                              qp_weights_are_symmetric,
237                              qp_has_no_left_shift,
238                              cpu_has_dot_product),
239     nullptr,
__anona26587f20d02() 240     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
241       auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
242       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
243     },
244   },
245   {
246     DepthwiseMethod::DEPTHFIRST,
247     "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
248     constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
249                              has_no_channel_multiplier,
250                              qp_has_no_left_shift,
251                              cpu_has_dot_product),
252     nullptr,
__anona26587f20e02() 253     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
254       auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
255       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
256     },
257   },
258   {
259     DepthwiseMethod::DEPTHFIRST,
260     "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
261     constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
262                              has_no_channel_multiplier,
263                              qp_has_no_left_shift),
264     nullptr,
__anona26587f20f02() 265     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
266       auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
267       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
268     },
269   },
270   {
271     DepthwiseMethod::DEPTHFIRST,
272     "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
273     constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
274                              has_no_channel_multiplier,
275                              qp_has_no_left_shift),
276     nullptr,
__anona26587f21002() 277     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
278       auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
279       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
280     },
281   },
282   {
283     DepthwiseMethod::DEPTHFIRST,
284     "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
285     constraint<Requantize32>(is_supported<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
286                              has_no_channel_multiplier,
287                              qp_has_no_left_shift),
288     nullptr,
__anona26587f21102() 289     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
290       auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
291       return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
292     },
293   },
294   {
295     DepthwiseMethod::DEPTHFIRST,
296     "a64_s8q_nhwc_generic_output3x3_mla_depthfirst",
297     constraint<Requantize32>(has_no_channel_multiplier),
298     nullptr,
__anona26587f21202() 299     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
300       auto kernel = new a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
301       auto strat = new GenericDepthfirstStrategy<int8_t>(kernel, 3, 3, args);
302       return new DepthwiseDepthfirstGeneric<int8_t>(strat, args, qp);
303     },
304   },
305   {
306     DepthwiseMethod::DEPTHFIRST,
307     "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
308     constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
309                              qp_has_no_left_shift,
310                              has_channel_multiplier,
311                              cpu_has_dot_product),
312     nullptr,
__anona26587f21302() 313     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
314       auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
315       return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
316     },
317   },
318   {
319     DepthwiseMethod::DEPTHFIRST,
320     "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
321     constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
322                              qp_has_no_left_shift,
323                              has_channel_multiplier,
324                              cpu_has_dot_product),
325     nullptr,
__anona26587f21402() 326     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
327       auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
328       return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
329     },
330   },
331   {
332     DepthwiseMethod::DEPTHFIRST,
333     "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
334     constraint<Requantize32>(has_channel_multiplier),
335     nullptr,
__anona26587f21502() 336     [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
337       auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
338       auto strat = new GenericDepthfirstMultiplierStrategy<int8_t>(kern, args);
339       return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, true>(strat, args, qp);
340     },
341   },
342 #endif  // defined(__aarch64__)
343   { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr },  // End of list
344 };
345 
346 template <>
depthwise_implementation_list()347 const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> *depthwise_implementation_list()
348 {
349   return depthwise_s8q_methods;
350 }
351 
352 template UniqueDepthwiseCommon<int8_t, int8_t, int8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
353 template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
354 
355 }  // namespace depthwise
356 }  // namespace arm_conv
357