1 /*
2 * Copyright (c) 2020-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "asmlib.hpp"
26 #include "convolution_parameters.hpp"
27 #include "convolver.hpp"
28 #include "interleave_indirect.hpp"
29 #include "bfloat.hpp"
30
31 #if !defined(_WIN64) && !defined(__OpenBSD__)
32 #include <alloca.h>
33 #endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
34
35 #include <algorithm>
36 #include <cstddef>
37 #include <cstdint>
38 #include <cstdio>
39 #include <cstring>
40 #include <tuple>
41 #include <type_traits>
42 #include <vector>
43
44 #include <arm_neon.h>
45
46 #include "utils.hpp"
47
48 namespace arm_gemm {
49
50 /*
51 * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together.
52 *
53 * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
54 * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
55 * with a particular value.
56 *
57 * Note that it is not expected for this templated version to ever be used - all cases that matter should be
58 * explicitly specialized with an optimized implementation.
59 */
60 template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
interleave_block(TOut * & out,const TIn * const * in,size_t width,size_t height,size_t row_offset,bool first)61 void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
62 const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
63
64 std::vector<int32_t> the_sums;
65
66 if (integrate_sums) {
67 the_sums = std::vector<int32_t>(int_by, 0);
68
69 if (!first) {
70 // In 'integrate sums' mode, we dump the sums at the end on each pass.
71
72 // On the last pass this is correct, but on other passes it is not -
73 // so on the subsequent pass we need to take the output written by
74 // the previous pass as starting point for the sums, and then
75 // overwrite them with new interleaved data.
76 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
77
78 // Rewind pointer to where we wrote out the sums last time.
79 out_int32 -= int_by;
80
81 // Restore the running sums.
82 memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
83
84 // Update the "real" pointer so that the next output will clobber the old sums.
85 out = reinterpret_cast<TOut *>(out_int32);
86 }
87 }
88
89 for (unsigned int pos=0; pos<width; pos+=block) {
90 for (unsigned int row=0; row<int_by; row++) {
91 // Row out of range - pad 'block' entries.
92 if (row >= height) {
93 for (unsigned int col=0; col<block; col++) {
94 *out++ = 0;
95 }
96 continue;
97 }
98
99 for (unsigned int col=0; col<block; col++) {
100 // Column out of range - pad a single entry
101 if (pos + col >= width) {
102 *out++ = 0;
103 continue;
104 }
105
106 if (integrate_sums) {
107 the_sums[row] += in[row][row_offset + pos + col];
108 }
109
110 *out++ = in[row][row_offset + pos + col];
111 }
112 }
113 }
114
115 if (integrate_sums) {
116 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
117
118 memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
119
120 out = reinterpret_cast<TOut *>(out_int32 + int_by);
121 }
122 }
123
124 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
FixupRowSums(TOut * & out,const int32_t row_sum_multiplier)125 inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
126 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
127
128 // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
129 if (row_sum_multiplier) {
130 // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
131 // next block (post sums).
132 // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
133 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
134
135 out_int32 -= height;
136 for (unsigned int i=0; i<height; i++) {
137 out_int32[i] *= row_sum_multiplier;
138 }
139 } else {
140 // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
141 // sum block. We need to insert the (zero) sums, and advance 'out'.
142 int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
143
144 for (unsigned int i=0; i<height; i++) {
145 out_int32[i] = 0;
146 }
147
148 out_int32 += height;
149
150 out = reinterpret_cast<TOut *>(out_int32);
151 }
152 }
153
154 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
IndirectInterleave(TOut * out,const TIn * const * const * ptr,unsigned int stringlen,unsigned int rounded_stringlen,const unsigned int y0,const unsigned int ymax,const unsigned int k0,const unsigned int kmax,bool integrate_sums,const int32_t row_sum_multiplier)155 void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
156 unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
157 const unsigned int k0, const unsigned int kmax, bool integrate_sums,
158 const int32_t row_sum_multiplier) {
159 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
160
161 // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
162 // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
163 // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
164 // pointers and conditionally overriding the out of range ones.
165
166 // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
167 // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
168 // expensive in highly threaded scenarios.
169 const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
170
171 // Figure out the starting position based on k0 (with rounded length)
172 unsigned int start_string = k0 / rounded_stringlen;
173 unsigned int start_stringpos = k0 % rounded_stringlen;
174
175 // Process blocks of 'height' height...
176 for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
177 // Height to process
178 unsigned int active_height = std::min(ymax - ybase, height);
179
180 // Track our progress through the various strings
181 unsigned int k_left = (kmax - k0);
182 unsigned int string = start_string;
183 unsigned int stringpos = start_stringpos;
184
185 bool first = true;
186
187 // Prepare to call 'interleave_block' above for each string encompassed by K range
188 while (k_left > 0) {
189 // Width to process - and the width we will generate (with padding)
190 unsigned int in_width = std::min(k_left, stringlen - stringpos);
191 unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
192
193 const TIn * const *row_base = ptr[string] + ybase;
194
195 // If not all rows are valid, copy the ones that are into local array (see above comment).
196 if (active_height < height) {
197 for (unsigned int i=0; i<active_height; i++) {
198 row_ptrs[i] = ptr[string][ybase + i];
199 }
200
201 row_base = row_ptrs;
202 }
203
204 // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
205 // much code. However, integrated sums make no sense for non-integral types and won't ever be
206 // requested. So put a type trait check here to avoid generating pointless code.
207 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
208 interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
209 } else {
210 interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
211 }
212
213 k_left -= out_width;
214 string++;
215 stringpos=0;
216 first=false;
217 }
218
219 if (std::is_integral<TOut>::value && integrate_sums) {
220 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
221 }
222 }
223 }
224
225 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
ConvolutionInterleave(TOut * out,const TIn * in,size_t in_stride,const convolver<TIn> & conv,const unsigned int rounded_stringlen,const unsigned int y0,const unsigned int ymax,const unsigned int k0,const unsigned int kmax,bool integrate_sums,const int32_t row_sum_multiplier)226 void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
227 const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
228 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
229
230 auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
231
232 // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
233 const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
234
235 for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
236 // How many of the rows are active - the rest will get padded in interleave_block.
237 unsigned int active_height = std::min(ymax - ybase, height);
238 bool first = true;
239
240 auto conv_rows = conv_cols.process_rows(ybase, active_height);
241
242 while (!conv_rows.finished()) {
243 unsigned int width, offset;
244
245 // Get next set of parameters
246 std::tie(width, offset) = conv_rows.next_block(row_ptrs);
247
248 // Perform the interleave
249 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
250 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
251 } else {
252 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
253 }
254
255 first=false;
256 }
257
258 if (std::is_integral<TOut>::value && integrate_sums) {
259 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
260 }
261 }
262 }
263
264 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
Interleave(TOut * out,const TIn * in,size_t in_stride,const unsigned int y0,const unsigned int ymax,const unsigned int k0,const unsigned int kmax,bool integrate_sums,const int32_t row_sum_multiplier)265 void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
266 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
267
268 // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
269 const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
270
271 const unsigned int width=kmax-k0;
272
273 for (unsigned int y=y0; y<ymax; y+=height) {
274 for (unsigned int r=0; r<height; r++) {
275 row_ptrs[r] = in + ((y + r) * in_stride);
276 }
277
278 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
279 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
280 } else {
281 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
282 }
283
284 if (std::is_integral<TOut>::value && integrate_sums) {
285 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
286 }
287 }
288 }
289
#include "indirect-interleaves/list.hpp"

/**** Instantiate needed implementations ****/

/* AArch32 */
#ifdef __arm__
/* FP32 */
/* Arm® Neon™ implementation (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* FP16 */
#if __ARM_FP16_ARGS
/* Arm® Neon™ implementation using FP32 kernel (height 6) */
template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif /* __ARM_FP16_ARGS */

/* BF16 */
/* Arm® Neon™ implementation using FP32 kernel */
template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // __arm__

/* AArch64 */
#ifdef __aarch64__
/* FP32 */
/* Arm® Neon™/SVE implementation (height 8) */
template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVEF32MM)
/* FMMLA */
template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM

/* FP16 */
#if defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // FP16_KERNELS or __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* BF16 */
/* Arm® Neon™/SVE BFDOT */
#ifdef ARM_COMPUTE_ENABLE_BF16
template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(bfloat16 *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // ARM_COMPUTE_ENABLE_BF16

/* Arm® Neon™/SVE using FP32 kernel */
template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* INT16 */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver<int16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver<uint16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* INT8 */
/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* Arm® Neon™ SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* Arm® Neon™ SDOT (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* Arm® Neon™ SDOT (height 8, block 4) */
template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* MMLA SMMLA (height 8, block 8) */
template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);

/* Arm® Neon™ 16-bit (height 8, block 1) */
template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
#endif // __aarch64__
414
415 } // namespace arm_gemm
416