1 /*
2 * Copyright (c) 2021-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "arm_gemm_local.hpp"
26
27 #include "depthwise_implementation.hpp"
28 #include "depthwise_depthfirst.hpp"
29 #include "depthwise_depthfirst_generic.hpp"
30 #include "depthwise_depthfirst_multiplier.hpp"
31 #include "depthwise_planar.hpp"
32
33 #include "depthwise_implementation_constraints.hpp"
34
35 #if defined(__aarch64__)
36 #if defined(ARM_COMPUTE_ENABLE_SVE)
37 #if defined(ARM_COMPUTE_ENABLE_SME2)
38 #include "kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp"
39 #include "kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp"
40 #include "kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp"
41 #include "kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp"
42 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
43
44 #include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
45 #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
46 #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
47 #include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
48 #include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
49 #include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
50 #include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
51 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
52 #include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
53 #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
54 #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
55 #include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
56 #include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
57 #include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"
58 #include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
59 #include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
60 #include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
61 #endif // defined(__aarch64__)
62
63 #include <cstdint>
64
65 using arm_gemm::Requantize32;
66
67 namespace arm_conv {
68 namespace depthwise {
69
70 namespace
71 {
72 #if defined(__aarch64__)
qp_weights_are_symmetric(const DepthwiseArgs &,const void * _qp)73 bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
74 {
75 const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
76 return qp->b_offset == 0;
77 }
78 #endif // defined(__aarch64__)
79 }
80
81 static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
82 #if defined(__aarch64__)
83 #if defined(ARM_COMPUTE_ENABLE_SVE)
84 #if defined(ARM_COMPUTE_ENABLE_SME2)
85 {
86 DepthwiseMethod::PLANAR,
87 "sme2_s8q_planar_3x3_s1_4rows_dot_za",
88 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
89 is_supported<sme2_s8q_planar_3x3_s1_4rows_dot_za>,
90 has_no_channel_multiplier,
91 qp_has_no_left_shift, no_prime_right_pad),
92 nullptr,
__anona26587f20202() 93 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
94 auto strat = new sme2_s8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
95 return new DepthwisePlanar<int8_t>(strat, args, qp);
96 },
97 },
98 {
99 DepthwiseMethod::PLANAR,
100 "sme2_s8q_planar_3x3_s2_4rows_dot_za",
101 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
102 is_supported<sme2_s8q_planar_3x3_s2_4rows_dot_za>,
103 has_no_channel_multiplier,
104 qp_has_no_left_shift, no_prime_right_pad),
105 nullptr,
__anona26587f20302() 106 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
107 auto strat = new sme2_s8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
108 return new DepthwisePlanar<int8_t>(strat, args, qp);
109 },
110 },
111 {
112 DepthwiseMethod::PLANAR,
113 "sme2_s8q_planar_5x5_s1_4rows_dot_za",
114 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
115 is_supported<sme2_s8q_planar_5x5_s1_4rows_dot_za>,
116 has_no_channel_multiplier,
117 qp_has_no_left_shift, no_prime_right_pad),
118 nullptr,
__anona26587f20402() 119 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
120 auto strat = new sme2_s8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
121 return new DepthwisePlanar<int8_t>(strat, args, qp);
122 },
123 },
124 {
125 DepthwiseMethod::PLANAR,
126 "sme2_s8q_planar_5x5_s2_4rows_dot_za",
127 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
128 is_supported<sme2_s8q_planar_5x5_s2_4rows_dot_za>,
129 has_no_channel_multiplier,
130 qp_has_no_left_shift, no_prime_right_pad),
131 nullptr,
__anona26587f20502() 132 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
133 auto strat = new sme2_s8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
134 return new DepthwisePlanar<int8_t>(strat, args, qp);
135 },
136 },
137 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
138 {
139 DepthwiseMethod::DEPTHFIRST,
140 "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
141 constraint<Requantize32>(is_supported<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
142 has_no_channel_multiplier,
143 qp_has_no_left_shift,
144 qp_weights_are_symmetric,
145 cpu_has_sve2),
146 nullptr,
__anona26587f20602() 147 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
148 auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
149 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
150 },
151 },
152 {
153 DepthwiseMethod::DEPTHFIRST,
154 "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
155 constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
156 has_no_channel_multiplier,
157 qp_has_no_left_shift,
158 cpu_has_sve2),
159 nullptr,
__anona26587f20702() 160 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
161 auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
162 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
163 },
164 },
165 {
166 DepthwiseMethod::DEPTHFIRST,
167 "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
168 constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
169 has_no_channel_multiplier,
170 qp_has_no_left_shift,
171 cpu_has_sve2),
172 nullptr,
__anona26587f20802() 173 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
174 auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
175 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
176 },
177 },
178 {
179 DepthwiseMethod::DEPTHFIRST,
180 "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
181 constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
182 has_no_channel_multiplier,
183 qp_has_no_left_shift,
184 cpu_has_sve2),
185 nullptr,
__anona26587f20902() 186 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
187 auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
188 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
189 },
190 },
191 {
192 DepthwiseMethod::DEPTHFIRST,
193 "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
194 constraint<Requantize32>(is_supported<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
195 has_no_channel_multiplier,
196 qp_has_no_left_shift,
197 cpu_has_sve2),
198 nullptr,
__anona26587f20a02() 199 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
200 auto strat = new sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
201 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
202 },
203 },
204 {
205 DepthwiseMethod::DEPTHFIRST,
206 "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
207 constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
208 qp_has_no_left_shift,
209 has_channel_multiplier,
210 cpu_has_sve2),
211 nullptr,
__anona26587f20b02() 212 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
213 auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
214 return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
215 },
216 },
217 {
218 DepthwiseMethod::DEPTHFIRST,
219 "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
220 constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
221 qp_has_no_left_shift,
222 has_channel_multiplier,
223 cpu_has_sve2),
224 nullptr,
__anona26587f20c02() 225 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
226 auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
227 return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
228 },
229 },
230 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
231 {
232 DepthwiseMethod::DEPTHFIRST,
233 "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
234 constraint<Requantize32>(is_supported<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
235 has_no_channel_multiplier,
236 qp_weights_are_symmetric,
237 qp_has_no_left_shift,
238 cpu_has_dot_product),
239 nullptr,
__anona26587f20d02() 240 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
241 auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
242 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
243 },
244 },
245 {
246 DepthwiseMethod::DEPTHFIRST,
247 "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
248 constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
249 has_no_channel_multiplier,
250 qp_has_no_left_shift,
251 cpu_has_dot_product),
252 nullptr,
__anona26587f20e02() 253 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
254 auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
255 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
256 },
257 },
258 {
259 DepthwiseMethod::DEPTHFIRST,
260 "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
261 constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
262 has_no_channel_multiplier,
263 qp_has_no_left_shift),
264 nullptr,
__anona26587f20f02() 265 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
266 auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
267 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
268 },
269 },
270 {
271 DepthwiseMethod::DEPTHFIRST,
272 "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
273 constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
274 has_no_channel_multiplier,
275 qp_has_no_left_shift),
276 nullptr,
__anona26587f21002() 277 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
278 auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
279 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
280 },
281 },
282 {
283 DepthwiseMethod::DEPTHFIRST,
284 "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
285 constraint<Requantize32>(is_supported<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
286 has_no_channel_multiplier,
287 qp_has_no_left_shift),
288 nullptr,
__anona26587f21102() 289 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
290 auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
291 return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
292 },
293 },
294 {
295 DepthwiseMethod::DEPTHFIRST,
296 "a64_s8q_nhwc_generic_output3x3_mla_depthfirst",
297 constraint<Requantize32>(has_no_channel_multiplier),
298 nullptr,
__anona26587f21202() 299 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
300 auto kernel = new a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
301 auto strat = new GenericDepthfirstStrategy<int8_t>(kernel, 3, 3, args);
302 return new DepthwiseDepthfirstGeneric<int8_t>(strat, args, qp);
303 },
304 },
305 {
306 DepthwiseMethod::DEPTHFIRST,
307 "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
308 constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
309 qp_has_no_left_shift,
310 has_channel_multiplier,
311 cpu_has_dot_product),
312 nullptr,
__anona26587f21302() 313 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
314 auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
315 return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
316 },
317 },
318 {
319 DepthwiseMethod::DEPTHFIRST,
320 "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
321 constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
322 qp_has_no_left_shift,
323 has_channel_multiplier,
324 cpu_has_dot_product),
325 nullptr,
__anona26587f21402() 326 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
327 auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
328 return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
329 },
330 },
331 {
332 DepthwiseMethod::DEPTHFIRST,
333 "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
334 constraint<Requantize32>(has_channel_multiplier),
335 nullptr,
__anona26587f21502() 336 [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
337 auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
338 auto strat = new GenericDepthfirstMultiplierStrategy<int8_t>(kern, args);
339 return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, true>(strat, args, qp);
340 },
341 },
342 #endif // defined(__aarch64__)
343 { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
344 };
345
346 template <>
depthwise_implementation_list()347 const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> *depthwise_implementation_list()
348 {
349 return depthwise_s8q_methods;
350 }
351
352 template UniqueDepthwiseCommon<int8_t, int8_t, int8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
353 template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
354
355 } // namespace depthwise
356 } // namespace arm_conv
357