/*
 * Copyright (c) 2017 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include <arm_neon.h>

#include "../../asmlib.hpp"

namespace arm_gemm {

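// As a plain-C++ sketch of what this kernel computes (hypothetical
// reference_s8_4x4, illustrative only and never called): assuming the
// usual arm_gemm interleaved packing, where each panel stores the K
// dimension in blocks of 16 int8 values with 4 rows (A) or 4 columns (B)
// interleaved per block, and each 4x4 int32 output tile is written
// row-major:
//
//     static void reference_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel,
//                                  int32_t *Cpanel, int ablocks, int bblocks, int K) {
//         const int blocks = K / 16;
//         const int8_t *a = Apanel;
//         int32_t *c = Cpanel;
//
//         for (int yb=0; yb<ablocks; yb++) {
//             const int8_t *b = Bpanel;
//
//             for (int xb=0; xb<bblocks; xb++) {
//                 for (int row=0; row<4; row++) {
//                     for (int col=0; col<4; col++) {
//                         int32_t acc = 0;
//
//                         for (int blk=0; blk<blocks; blk++) {
//                             for (int i=0; i<16; i++) {
//                                 acc += (int32_t)a[blk*64 + row*16 + i]
//                                      * (int32_t)b[blk*64 + col*16 + i];
//                             }
//                         }
//
//                         *c++ = acc;
//                     }
//                 }
//
//                 b += blocks * 64;
//             }
//
//             a += blocks * 64;
//         }
//     }
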
void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
    const int8_t *a_ptr = Apanel;
    int32_t *c_ptr = Cpanel;

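    // The packed panels hold the K dimension in 16-byte blocks, so convert
    // K from int8 elements to blocks up front.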
    K /= 16;
    int oddk = (K & 1);

    for (int yb=0; yb<ablocks; yb++) {
        const int8_t *a_ptr0 = a_ptr;
        const int8_t *b_ptr = Bpanel;

        for (int xb=0; xb<bblocks; xb++) {
            a_ptr = a_ptr0;

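            // The loop below is unrolled twice, so each trip consumes two
            // 16-byte K blocks, and the last trip is peeled off into one of
            // the detached tails (two blocks for even K, one for odd K, as
            // selected by oddk).  That leaves ceil(K/2)-1 trips: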
            int k = ((K+1)/2)-1;

            register int8x16_t b0 asm("v4");
            register int8x16_t b1 asm("v5");
            register int8x16_t b2 asm("v6");
            register int8x16_t b3 asm("v7");
            register int8x16_t b0a asm("v8");
            register int8x16_t b1a asm("v9");
            register int8x16_t b2a asm("v10");
            register int8x16_t b3a asm("v11");
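            // b0-b3 and b0a-b3a are pinned to v4-v11 so the assembly below
            // can keep two alternating copies of the four B columns live,
            // double-buffering the next unroll's B loads around the
            // multiplies.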

            __asm __volatile (
                "movi v16.4s, #0x0\n"
                "ldr q0, [%[a_ptr]]\n"
                "movi v17.4s, #0x0\n"
                "ldr %q[b0], [%[b_ptr]]\n"
                "movi v18.4s, #0x0\n"
                "ldr %q[b1], [%[b_ptr], #16]\n"
                "movi v19.4s, #0x0\n"
                "ldr %q[b2], [%[b_ptr], #32]\n"
                "movi v20.4s, #0x0\n"
                "ldr %q[b3], [%[b_ptr], #48]\n"
                "movi v21.4s, #0x0\n"
                "ldr q1, [%[a_ptr], #16]\n"
                "movi v22.4s, #0x0\n"
                "ldr q2, [%[a_ptr], #32]\n"
                "movi v23.4s, #0x0\n"
                "ldr q3, [%[a_ptr], #48]\n"
                "movi v24.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #64]")
                "movi v25.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #64]")
                "movi v26.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #128]")
                "movi v27.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #128]")
                "movi v28.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #192]")
                "movi v29.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #192]")
                "movi v30.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #256]")
                "movi v31.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #256]")

                // Loop structure optimized for A57 (after r0).

                // Unavoidably, the multiply will "dribble" if
                // dual issued with an add.

                // Minimize the effect of this by making sure
                // there are 2 adds to run under the dribbled
                // multiply.

                // Pipeline in blocks of 8 multiplies - combine
                // this iteration's multiplies with adds from
                // the previous iteration.

                // So the first block doesn't have any adds to
                // do - but because all the adds are at the
                // start of the block it's only the first couple
                // of multiplies that need to be pulled out.

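                // Each row/column product below uses a widening chain:
                // smull multiplies the low 8 int8 lanes into 8 int16
                // products, smlal2 adds the products of the high 8 lanes
                // into the same v12-v15 registers, and sadalp then
                // pairwise-adds those int16 values into the int32
                // accumulators v16-v31.
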
                // Start of unroll 0 (first iteration)
                "smull v12.8h, v0.8b, %[b0].8b\n"
                "smull v13.8h, v0.8b, %[b1].8b\n"

                // Skip loop if we are doing zero iterations of it.
                "cbz %w[k], 4f\n"

                // Unroll 0 continuation (branch target)
                "1:\n"
                "smull v14.8h, v0.8b, %[b2].8b\n"
                "subs %w[k], %w[k], #1\n"
                "smull v15.8h, v0.8b, %[b3].8b\n"
                "ldr %q[b0a], [%[b_ptr], #64]\n"
                "smlal2 v12.8h, v0.16b, %[b0].16b\n"
                "smlal2 v13.8h, v0.16b, %[b1].16b\n"
                "ldr %q[b1a], [%[b_ptr], #80]\n"
                "smlal2 v14.8h, v0.16b, %[b2].16b\n"
                "smlal2 v15.8h, v0.16b, %[b3].16b\n"
                "ldr q0, [%[a_ptr], #64]\n"

                "sadalp v16.4s, v12.8h\n"
                "smull v12.8h, v1.8b, %[b0].8b\n"
                "sadalp v17.4s, v13.8h\n"
                "sadalp v18.4s, v14.8h\n"
                "smull v13.8h, v1.8b, %[b1].8b\n"
                "sadalp v19.4s, v15.8h\n"
                "smull v14.8h, v1.8b, %[b2].8b\n"
                "ldr %q[b2a], [%[b_ptr], #96]\n"
                "smull v15.8h, v1.8b, %[b3].8b\n"
                "smlal2 v12.8h, v1.16b, %[b0].16b\n"
                "ldr %q[b3a], [%[b_ptr], #112]\n"
                "smlal2 v13.8h, v1.16b, %[b1].16b\n"
                "add %[b_ptr], %[b_ptr], #128\n"
                "smlal2 v14.8h, v1.16b, %[b2].16b\n"
                "smlal2 v15.8h, v1.16b, %[b3].16b\n"
                "ldr q1, [%[a_ptr], #80]\n"

                "sadalp v20.4s, v12.8h\n"
                "smull v12.8h, v2.8b, %[b0].8b\n"
                "sadalp v21.4s, v13.8h\n"
                "sadalp v22.4s, v14.8h\n"
                "smull v13.8h, v2.8b, %[b1].8b\n"
                "sadalp v23.4s, v15.8h\n"
                "smull v14.8h, v2.8b, %[b2].8b\n"
                "smull v15.8h, v2.8b, %[b3].8b\n"
                "smlal2 v12.8h, v2.16b, %[b0].16b\n"
                ASM_PREFETCH("[%[b_ptr], #192]")
                "smlal2 v13.8h, v2.16b, %[b1].16b\n"
                "smlal2 v14.8h, v2.16b, %[b2].16b\n"
                ASM_PREFETCH("[%[a_ptr], #320]")
                "smlal2 v15.8h, v2.16b, %[b3].16b\n"
                "ldr q2, [%[a_ptr], #96]\n"

                "sadalp v24.4s, v12.8h\n"
                "smull v12.8h, v3.8b, %[b0].8b\n"
                "sadalp v25.4s, v13.8h\n"
                "sadalp v26.4s, v14.8h\n"
                "smull v13.8h, v3.8b, %[b1].8b\n"
                "sadalp v27.4s, v15.8h\n"
                "smull v14.8h, v3.8b, %[b2].8b\n"
                "smull v15.8h, v3.8b, %[b3].8b\n"
                "smlal2 v12.8h, v3.16b, %[b0].16b\n"
                "ldr %q[b0], [%[b_ptr], #0]\n"
                "smlal2 v13.8h, v3.16b, %[b1].16b\n"
                "smlal2 v14.8h, v3.16b, %[b2].16b\n"
                "smlal2 v15.8h, v3.16b, %[b3].16b\n"
                "ldr q3, [%[a_ptr], #112]\n"

                // Unroll 1
                "sadalp v28.4s, v12.8h\n"
                "smull v12.8h, v0.8b, %[b0a].8b\n"
                "sadalp v29.4s, v13.8h\n"
                "sadalp v30.4s, v14.8h\n"
                "smull v13.8h, v0.8b, %[b1a].8b\n"
                "sadalp v31.4s, v15.8h\n"
                "smull v14.8h, v0.8b, %[b2a].8b\n"
                "smull v15.8h, v0.8b, %[b3a].8b\n"
                "ldr %q[b1], [%[b_ptr], #16]\n"
                "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
                "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
                "ldr %q[b2], [%[b_ptr], #32]\n"
                "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
                "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
                "ldr q0, [%[a_ptr], #128]\n"

                "sadalp v16.4s, v12.8h\n"
                "smull v12.8h, v1.8b, %[b0a].8b\n"
                "sadalp v17.4s, v13.8h\n"
                "sadalp v18.4s, v14.8h\n"
                "smull v13.8h, v1.8b, %[b1a].8b\n"
                "sadalp v19.4s, v15.8h\n"
                "add %[a_ptr], %[a_ptr], #128\n"
                "smull v14.8h, v1.8b, %[b2a].8b\n"
                "smull v15.8h, v1.8b, %[b3a].8b\n"
                "ldr %q[b3], [%[b_ptr], #48]\n"
                "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
                "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
                "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
                "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
                "ldr q1, [%[a_ptr], #16]\n"

                "sadalp v20.4s, v12.8h\n"
                "smull v12.8h, v2.8b, %[b0a].8b\n"
                "sadalp v21.4s, v13.8h\n"
                "sadalp v22.4s, v14.8h\n"
                "smull v13.8h, v2.8b, %[b1a].8b\n"
                "sadalp v23.4s, v15.8h\n"
                "smull v14.8h, v2.8b, %[b2a].8b\n"
                "smull v15.8h, v2.8b, %[b3a].8b\n"
                "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
                ASM_PREFETCH("[%[b_ptr], #256]")
                "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
                "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
                ASM_PREFETCH("[%[a_ptr], #256]")
                "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
                "ldr q2, [%[a_ptr], #32]\n"

                "sadalp v24.4s, v12.8h\n"
                "smull v12.8h, v3.8b, %[b0a].8b\n"
                "sadalp v25.4s, v13.8h\n"
                "sadalp v26.4s, v14.8h\n"
                "smull v13.8h, v3.8b, %[b1a].8b\n"
                "sadalp v27.4s, v15.8h\n"
                "smull v14.8h, v3.8b, %[b2a].8b\n"
                "smull v15.8h, v3.8b, %[b3a].8b\n"
                "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
                "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
                "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
                "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
                "ldr q3, [%[a_ptr], #48]\n"

                // Start of unroll 0 for next iteration.
                "sadalp v28.4s, v12.8h\n"
                "smull v12.8h, v0.8b, %[b0].8b\n"
                "sadalp v29.4s, v13.8h\n"
                "sadalp v30.4s, v14.8h\n"
                "smull v13.8h, v0.8b, %[b1].8b\n"
                "sadalp v31.4s, v15.8h\n"
                "bne 1b\n"

                // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
                "4:\n"

                // Branch to alternative tail for odd K
                "cbnz %w[oddk], 2f\n"

                // Detached final iteration (even K)
                "smull v14.8h, v0.8b, %[b2].8b\n"
                "smull v15.8h, v0.8b, %[b3].8b\n"
                "ldr %q[b0a], [%[b_ptr], #64]\n"
                "smlal2 v12.8h, v0.16b, %[b0].16b\n"
                "smlal2 v13.8h, v0.16b, %[b1].16b\n"
                "ldr %q[b1a], [%[b_ptr], #80]\n"
                "smlal2 v14.8h, v0.16b, %[b2].16b\n"
                "smlal2 v15.8h, v0.16b, %[b3].16b\n"
                "ldr q0, [%[a_ptr], #64]\n"

                "sadalp v16.4s, v12.8h\n"
                "smull v12.8h, v1.8b, %[b0].8b\n"
                "sadalp v17.4s, v13.8h\n"
                "sadalp v18.4s, v14.8h\n"
                "smull v13.8h, v1.8b, %[b1].8b\n"
                "sadalp v19.4s, v15.8h\n"
                "smull v14.8h, v1.8b, %[b2].8b\n"
                "ldr %q[b2a], [%[b_ptr], #96]\n"
                "smull v15.8h, v1.8b, %[b3].8b\n"
                "smlal2 v12.8h, v1.16b, %[b0].16b\n"
                "ldr %q[b3a], [%[b_ptr], #112]\n"
                "smlal2 v13.8h, v1.16b, %[b1].16b\n"
                "add %[b_ptr], %[b_ptr], #128\n"
                "smlal2 v14.8h, v1.16b, %[b2].16b\n"
                "smlal2 v15.8h, v1.16b, %[b3].16b\n"
                "ldr q1, [%[a_ptr], #80]\n"

                "sadalp v20.4s, v12.8h\n"
                "smull v12.8h, v2.8b, %[b0].8b\n"
                "sadalp v21.4s, v13.8h\n"
                "sadalp v22.4s, v14.8h\n"
                "smull v13.8h, v2.8b, %[b1].8b\n"
                "sadalp v23.4s, v15.8h\n"
                "smull v14.8h, v2.8b, %[b2].8b\n"
                "smull v15.8h, v2.8b, %[b3].8b\n"
                "smlal2 v12.8h, v2.16b, %[b0].16b\n"
                "smlal2 v13.8h, v2.16b, %[b1].16b\n"
                "smlal2 v14.8h, v2.16b, %[b2].16b\n"
                "smlal2 v15.8h, v2.16b, %[b3].16b\n"
                "ldr q2, [%[a_ptr], #96]\n"

                "sadalp v24.4s, v12.8h\n"
                "smull v12.8h, v3.8b, %[b0].8b\n"
                "sadalp v25.4s, v13.8h\n"
                "sadalp v26.4s, v14.8h\n"
                "smull v13.8h, v3.8b, %[b1].8b\n"
                "sadalp v27.4s, v15.8h\n"
                "smull v14.8h, v3.8b, %[b2].8b\n"
                "smull v15.8h, v3.8b, %[b3].8b\n"
                "smlal2 v12.8h, v3.16b, %[b0].16b\n"
                "smlal2 v13.8h, v3.16b, %[b1].16b\n"
                "smlal2 v14.8h, v3.16b, %[b2].16b\n"
                "smlal2 v15.8h, v3.16b, %[b3].16b\n"
                "ldr q3, [%[a_ptr], #112]\n"

                // Unroll 1
                "sadalp v28.4s, v12.8h\n"
                "smull v12.8h, v0.8b, %[b0a].8b\n"
                "sadalp v29.4s, v13.8h\n"
                "sadalp v30.4s, v14.8h\n"
                "smull v13.8h, v0.8b, %[b1a].8b\n"
                "sadalp v31.4s, v15.8h\n"
                "smull v14.8h, v0.8b, %[b2a].8b\n"
                "add %[a_ptr], %[a_ptr], #128\n"
                "smull v15.8h, v0.8b, %[b3a].8b\n"
                "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
                "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
                "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
                "smlal2 v15.8h, v0.16b, %[b3a].16b\n"

                "sadalp v16.4s, v12.8h\n"
                "smull v12.8h, v1.8b, %[b0a].8b\n"
                "sadalp v17.4s, v13.8h\n"
                "sadalp v18.4s, v14.8h\n"
                "smull v13.8h, v1.8b, %[b1a].8b\n"
                "sadalp v19.4s, v15.8h\n"
                "smull v14.8h, v1.8b, %[b2a].8b\n"
                "smull v15.8h, v1.8b, %[b3a].8b\n"
                "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
                "addp v16.4s, v16.4s, v17.4s\n"
                "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
                "addp v17.4s, v18.4s, v19.4s\n"
                "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
                "smlal2 v15.8h, v1.16b, %[b3a].16b\n"

                "sadalp v20.4s, v12.8h\n"
                "smull v12.8h, v2.8b, %[b0a].8b\n"
                "sadalp v21.4s, v13.8h\n"
                "sadalp v22.4s, v14.8h\n"
                "smull v13.8h, v2.8b, %[b1a].8b\n"
                "sadalp v23.4s, v15.8h\n"
                "addp v16.4s, v16.4s, v17.4s\n"
                "smull v14.8h, v2.8b, %[b2a].8b\n"
                "addp v18.4s, v20.4s, v21.4s\n"
                "addp v19.4s, v22.4s, v23.4s\n"
                "smull v15.8h, v2.8b, %[b3a].8b\n"
                "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
                "str q16, [%[c_ptr]]\n"
                "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
                "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
                "smlal2 v15.8h, v2.16b, %[b3a].16b\n"

                "sadalp v24.4s, v12.8h\n"
                "smull v12.8h, v3.8b, %[b0a].8b\n"
                "sadalp v25.4s, v13.8h\n"
                "sadalp v26.4s, v14.8h\n"
                "smull v13.8h, v3.8b, %[b1a].8b\n"
                "sadalp v27.4s, v15.8h\n"
                "addp v17.4s, v18.4s, v19.4s\n"
                "smull v14.8h, v3.8b, %[b2a].8b\n"
                "addp v20.4s, v24.4s, v25.4s\n"
                "addp v21.4s, v26.4s, v27.4s\n"
                "smull v15.8h, v3.8b, %[b3a].8b\n"
                "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
                "str q17, [%[c_ptr], #16]\n"
                "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
                "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
                "addp v18.4s, v20.4s, v21.4s\n"
                "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
                "b 3f\n"

                // Detached final iteration (odd K)
                "2:\n"
                "smull v14.8h, v0.8b, %[b2].8b\n"
                "add %[a_ptr], %[a_ptr], #64\n"
                "smull v15.8h, v0.8b, %[b3].8b\n"
                "add %[b_ptr], %[b_ptr], #64\n"
                "smlal2 v12.8h, v0.16b, %[b0].16b\n"
                "smlal2 v13.8h, v0.16b, %[b1].16b\n"
                "smlal2 v14.8h, v0.16b, %[b2].16b\n"
                "smlal2 v15.8h, v0.16b, %[b3].16b\n"

                "sadalp v16.4s, v12.8h\n"
                "smull v12.8h, v1.8b, %[b0].8b\n"
                "sadalp v17.4s, v13.8h\n"
                "sadalp v18.4s, v14.8h\n"
                "smull v13.8h, v1.8b, %[b1].8b\n"
                "sadalp v19.4s, v15.8h\n"
                "smull v14.8h, v1.8b, %[b2].8b\n"
                "smull v15.8h, v1.8b, %[b3].8b\n"
                "smlal2 v12.8h, v1.16b, %[b0].16b\n"
                "addp v16.4s, v16.4s, v17.4s\n"
                "smlal2 v13.8h, v1.16b, %[b1].16b\n"
                "addp v17.4s, v18.4s, v19.4s\n"
                "smlal2 v14.8h, v1.16b, %[b2].16b\n"
                "smlal2 v15.8h, v1.16b, %[b3].16b\n"

                "sadalp v20.4s, v12.8h\n"
                "smull v12.8h, v2.8b, %[b0].8b\n"
                "sadalp v21.4s, v13.8h\n"
                "sadalp v22.4s, v14.8h\n"
                "smull v13.8h, v2.8b, %[b1].8b\n"
                "sadalp v23.4s, v15.8h\n"
                "addp v16.4s, v16.4s, v17.4s\n"
                "smull v14.8h, v2.8b, %[b2].8b\n"
                "addp v18.4s, v20.4s, v21.4s\n"
                "addp v19.4s, v22.4s, v23.4s\n"
                "smull v15.8h, v2.8b, %[b3].8b\n"
                "smlal2 v12.8h, v2.16b, %[b0].16b\n"
                "str q16, [%[c_ptr]]\n"
                "smlal2 v13.8h, v2.16b, %[b1].16b\n"
                "smlal2 v14.8h, v2.16b, %[b2].16b\n"
                "smlal2 v15.8h, v2.16b, %[b3].16b\n"

                "sadalp v24.4s, v12.8h\n"
                "smull v12.8h, v3.8b, %[b0].8b\n"
                "sadalp v25.4s, v13.8h\n"
                "sadalp v26.4s, v14.8h\n"
                "smull v13.8h, v3.8b, %[b1].8b\n"
                "sadalp v27.4s, v15.8h\n"
                "addp v17.4s, v18.4s, v19.4s\n"
                "smull v14.8h, v3.8b, %[b2].8b\n"
                "addp v20.4s, v24.4s, v25.4s\n"
                "addp v21.4s, v26.4s, v27.4s\n"
                "smull v15.8h, v3.8b, %[b3].8b\n"
                "smlal2 v12.8h, v3.16b, %[b0].16b\n"
                "str q17, [%[c_ptr], #16]\n"
                "smlal2 v13.8h, v3.16b, %[b1].16b\n"
                "smlal2 v14.8h, v3.16b, %[b2].16b\n"
                "addp v18.4s, v20.4s, v21.4s\n"
                "smlal2 v15.8h, v3.16b, %[b3].16b\n"

                "3:\n"

                // Final additions
                "sadalp v28.4s, v12.8h\n"
                "str q18, [%[c_ptr], #32]\n"
                "sadalp v29.4s, v13.8h\n"
                "sadalp v30.4s, v14.8h\n"
                "sadalp v31.4s, v15.8h\n"

                // Horizontal reduction, phase 1
                "addp v22.4s, v28.4s, v29.4s\n"
                "addp v23.4s, v30.4s, v31.4s\n"

                // Horizontal reduction, phase 2
                "addp v19.4s, v22.4s, v23.4s\n"
                "str q19, [%[c_ptr], #48]\n"
                "add %[c_ptr], %[c_ptr], #64\n"

            :
              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
              [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a),
              [k] "+r" (k)
            : [oddk] "r" (oddk)
            : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
              "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
        }
    }
}

} // namespace arm_gemm

#endif // __aarch64__