// xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
/*
 * Copyright (c) 2017 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 #ifdef __aarch64__
25 
26 #include <arm_neon.h>
27 
28 #include "../../asmlib.hpp"
29 
30 namespace arm_gemm {
31 
// 4x4 signed 8-bit GEMM inner kernel for AArch64 (loop structure tuned for
// Cortex-A57, per the comment inside the assembly).
//
// Multiplies packed panels of int8 A data by packed panels of int8 B data,
// writing one 4x4 block of int32 results (64 bytes) per B block.  Products
// are formed into 16-bit lanes with SMULL (low halves) / SMLAL2 (high
// halves), pairwise-accumulated into 32-bit lanes with SADALP, and finally
// reduced horizontally with ADDP into the four output vectors stored as
// q16..q19.
//
// Apanel:  packed A data; a_ptr is rewound to the block start for every B
//          block, so each A block is re-read bblocks times.
// Bpanel:  packed B data; b_ptr is rewound to Bpanel for every A block.
// Cpanel:  output, written sequentially in 64-byte (4x4 int32) blocks.
// ablocks: number of A (row) blocks.
// bblocks: number of B (column) blocks.
// K:       depth in int8 elements.  Each pass of the kernel consumes 16
//          elements per operand register (64 bytes of A and of B), so K is
//          assumed to be a multiple of 16 — TODO confirm the padding
//          contract with the panel-packing code.
void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
    const int8_t *a_ptr = Apanel;
    int32_t *c_ptr = Cpanel;

    // Convert depth in int8 elements to the number of 16-deep passes.
    K /= 16;
    // Passes are consumed two at a time; an odd count needs the alternate
    // single-pass tail (label 2 below).
    int oddk = (K & 1);

    for (int yb=0; yb<ablocks; yb++) {
        const int8_t *a_ptr0 = a_ptr;
        const int8_t *b_ptr = Bpanel;

        for (int xb=0; xb<bblocks; xb++) {
            a_ptr = a_ptr0;

            // Main-loop trip count: each iteration performs two unrolled
            // passes, and the final pair (or odd single pass) is peeled off
            // as a detached tail after the loop.
            int k = ((K+1)/2)-1;

            // B-panel values are pinned to v4..v11 so the hand-written
            // assembly can name them via %[b0]..%[b3a] while v0-v3 (A data),
            // v12-v15 (16-bit partial products) and v16-v31 (accumulators)
            // are referenced explicitly.
            register int8x16_t b0  asm("v4");
            register int8x16_t b1  asm("v5");
            register int8x16_t b2  asm("v6");
            register int8x16_t b3  asm("v7");
            register int8x16_t b0a asm("v8");
            register int8x16_t b1a asm("v9");
            register int8x16_t b2a asm("v10");
            register int8x16_t b3a asm("v11");

            __asm __volatile (
                // Zero the 16 int32x4 accumulators (v16-v31) while issuing
                // the first loads and prefetches of A and B.
                "movi	v16.4s, #0x0\n"
                "ldr	q0, [%[a_ptr]]\n"
                "movi	v17.4s, #0x0\n"
                "ldr	%q[b0], [%[b_ptr]]\n"
                "movi	v18.4s, #0x0\n"
                "ldr	%q[b1], [%[b_ptr], #16]\n"
                "movi	v19.4s, #0x0\n"
                "ldr	%q[b2], [%[b_ptr], #32]\n"
                "movi	v20.4s, #0x0\n"
                "ldr	%q[b3], [%[b_ptr], #48]\n"
                "movi	v21.4s, #0x0\n"
                "ldr	q1, [%[a_ptr], #16]\n"
                "movi	v22.4s, #0x0\n"
                "ldr	q2, [%[a_ptr], #32]\n"
                "movi	v23.4s, #0x0\n"
                "ldr	q3, [%[a_ptr], #48]\n"
                "movi	v24.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #64]")
                "movi	v25.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #64]")
                "movi	v26.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #128]")
                "movi	v27.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #128]")
                "movi	v28.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #192]")
                "movi	v29.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #192]")
                "movi	v30.4s, #0x0\n"
                ASM_PREFETCH("[%[b_ptr], #256]")
                "movi	v31.4s, #0x0\n"
                ASM_PREFETCH("[%[a_ptr], #256]")

                // Loop structure optimized for A57 (after r0).

                // Unavoidably, the multiply will "dribble" if
                // dual issued with an add.

                // Minimize the effect of this by making sure
                // there are 2 adds to run under the dribbled
                // multiply.

                // Pipeline in blocks of 8 multiplies - combine
                // this iteration's multiplies with adds from
                // the previous iteration.

                // So the first block doesn't have any adds to
                // do - but because all the adds are at the
                // start of the block it's only the first couple
                // of multiplies that need to be pulled out.

                // Start of unroll 0 (first iteration)
                "smull	v12.8h, v0.8b, %[b0].8b\n"
                "smull	v13.8h, v0.8b, %[b1].8b\n"

                // Skip loop if we are doing zero iterations of it.
                "cbz	%w[k], 4f\n"

                // Unroll 0 continuation (branch target)
                "1:\n"
                "smull	v14.8h, v0.8b, %[b2].8b\n"
                "subs	%w[k], %w[k], #1\n"
                "smull	v15.8h, v0.8b, %[b3].8b\n"
                "ldr	%q[b0a], [%[b_ptr], #64]\n"
                "smlal2	v12.8h, v0.16b, %[b0].16b\n"
                "smlal2	v13.8h, v0.16b, %[b1].16b\n"
                "ldr	%q[b1a], [%[b_ptr], #80]\n"
                "smlal2	v14.8h, v0.16b, %[b2].16b\n"
                "smlal2	v15.8h, v0.16b, %[b3].16b\n"
                "ldr 	q0, [%[a_ptr], #64]\n"

                "sadalp	v16.4s, v12.8h\n"
                "smull	v12.8h, v1.8b, %[b0].8b\n"
                "sadalp	v17.4s, v13.8h\n"
                "sadalp	v18.4s, v14.8h\n"
                "smull	v13.8h, v1.8b, %[b1].8b\n"
                "sadalp	v19.4s, v15.8h\n"
                "smull	v14.8h, v1.8b, %[b2].8b\n"
                "ldr	%q[b2a], [%[b_ptr], #96]\n"
                "smull	v15.8h, v1.8b, %[b3].8b\n"
                "smlal2	v12.8h, v1.16b, %[b0].16b\n"
                "ldr	%q[b3a], [%[b_ptr], #112]\n"
                "smlal2	v13.8h, v1.16b, %[b1].16b\n"
                "add	%[b_ptr], %[b_ptr], #128\n"
                "smlal2	v14.8h, v1.16b, %[b2].16b\n"
                "smlal2	v15.8h, v1.16b, %[b3].16b\n"
                "ldr 	q1, [%[a_ptr], #80]\n"

                "sadalp	v20.4s, v12.8h\n"
                "smull	v12.8h, v2.8b, %[b0].8b\n"
                "sadalp	v21.4s, v13.8h\n"
                "sadalp	v22.4s, v14.8h\n"
                "smull	v13.8h, v2.8b, %[b1].8b\n"
                "sadalp	v23.4s, v15.8h\n"
                "smull	v14.8h, v2.8b, %[b2].8b\n"
                "smull	v15.8h, v2.8b, %[b3].8b\n"
                "smlal2	v12.8h, v2.16b, %[b0].16b\n"
                ASM_PREFETCH("[%[b_ptr], #192]")
                "smlal2	v13.8h, v2.16b, %[b1].16b\n"
                "smlal2	v14.8h, v2.16b, %[b2].16b\n"
                ASM_PREFETCH("[%[a_ptr], #320]")
                "smlal2	v15.8h, v2.16b, %[b3].16b\n"
                "ldr 	q2, [%[a_ptr], #96]\n"

                "sadalp	v24.4s, v12.8h\n"
                "smull	v12.8h, v3.8b, %[b0].8b\n"
                "sadalp	v25.4s, v13.8h\n"
                "sadalp	v26.4s, v14.8h\n"
                "smull	v13.8h, v3.8b, %[b1].8b\n"
                "sadalp	v27.4s, v15.8h\n"
                "smull	v14.8h, v3.8b, %[b2].8b\n"
                "smull	v15.8h, v3.8b, %[b3].8b\n"
                "smlal2	v12.8h, v3.16b, %[b0].16b\n"
                "ldr 	%q[b0], [%[b_ptr], #0]\n"
                "smlal2	v13.8h, v3.16b, %[b1].16b\n"
                "smlal2	v14.8h, v3.16b, %[b2].16b\n"
                "smlal2	v15.8h, v3.16b, %[b3].16b\n"
                "ldr 	q3, [%[a_ptr], #112]\n"

                // Unroll 1
                "sadalp	v28.4s, v12.8h\n"
                "smull	v12.8h, v0.8b, %[b0a].8b\n"
                "sadalp	v29.4s, v13.8h\n"
                "sadalp	v30.4s, v14.8h\n"
                "smull	v13.8h, v0.8b, %[b1a].8b\n"
                "sadalp	v31.4s, v15.8h\n"
                "smull	v14.8h, v0.8b, %[b2a].8b\n"
                "smull	v15.8h, v0.8b, %[b3a].8b\n"
                "ldr 	%q[b1], [%[b_ptr], #16]\n"
                "smlal2	v12.8h, v0.16b, %[b0a].16b\n"
                "smlal2	v13.8h, v0.16b, %[b1a].16b\n"
                "ldr 	%q[b2], [%[b_ptr], #32]\n"
                "smlal2	v14.8h, v0.16b, %[b2a].16b\n"
                "smlal2	v15.8h, v0.16b, %[b3a].16b\n"
                "ldr 	q0, [%[a_ptr], #128]\n"

                "sadalp	v16.4s, v12.8h\n"
                "smull	v12.8h, v1.8b, %[b0a].8b\n"
                "sadalp	v17.4s, v13.8h\n"
                "sadalp	v18.4s, v14.8h\n"
                "smull	v13.8h, v1.8b, %[b1a].8b\n"
                "sadalp	v19.4s, v15.8h\n"
                "add	%[a_ptr], %[a_ptr], #128\n"
                "smull	v14.8h, v1.8b, %[b2a].8b\n"
                "smull	v15.8h, v1.8b, %[b3a].8b\n"
                "ldr 	%q[b3], [%[b_ptr], #48]\n"
                "smlal2	v12.8h, v1.16b, %[b0a].16b\n"
                "smlal2	v13.8h, v1.16b, %[b1a].16b\n"
                "smlal2	v14.8h, v1.16b, %[b2a].16b\n"
                "smlal2	v15.8h, v1.16b, %[b3a].16b\n"
                "ldr 	q1, [%[a_ptr], #16]\n"

                "sadalp	v20.4s, v12.8h\n"
                "smull	v12.8h, v2.8b, %[b0a].8b\n"
                "sadalp	v21.4s, v13.8h\n"
                "sadalp	v22.4s, v14.8h\n"
                "smull	v13.8h, v2.8b, %[b1a].8b\n"
                "sadalp	v23.4s, v15.8h\n"
                "smull	v14.8h, v2.8b, %[b2a].8b\n"
                "smull	v15.8h, v2.8b, %[b3a].8b\n"
                "smlal2	v12.8h, v2.16b, %[b0a].16b\n"
                ASM_PREFETCH("[%[b_ptr], #256]")
                "smlal2	v13.8h, v2.16b, %[b1a].16b\n"
                "smlal2	v14.8h, v2.16b, %[b2a].16b\n"
                ASM_PREFETCH("[%[a_ptr], #256]")
                "smlal2	v15.8h, v2.16b, %[b3a].16b\n"
                "ldr 	q2, [%[a_ptr], #32]\n"

                "sadalp	v24.4s, v12.8h\n"
                "smull	v12.8h, v3.8b, %[b0a].8b\n"
                "sadalp	v25.4s, v13.8h\n"
                "sadalp	v26.4s, v14.8h\n"
                "smull	v13.8h, v3.8b, %[b1a].8b\n"
                "sadalp	v27.4s, v15.8h\n"
                "smull	v14.8h, v3.8b, %[b2a].8b\n"
                "smull	v15.8h, v3.8b, %[b3a].8b\n"
                "smlal2	v12.8h, v3.16b, %[b0a].16b\n"
                "smlal2	v13.8h, v3.16b, %[b1a].16b\n"
                "smlal2	v14.8h, v3.16b, %[b2a].16b\n"
                "smlal2	v15.8h, v3.16b, %[b3a].16b\n"
                "ldr 	q3, [%[a_ptr], #48]\n"

                // Start of unroll 0 for next iteration.
                "sadalp	v28.4s, v12.8h\n"
                "smull	v12.8h, v0.8b, %[b0].8b\n"
                "sadalp	v29.4s, v13.8h\n"
                "sadalp	v30.4s, v14.8h\n"
                "smull	v13.8h, v0.8b, %[b1].8b\n"
                "sadalp	v31.4s, v15.8h\n"
                "bne	1b\n"

                // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
                "4:\n"

                // Branch to alternative tail for odd K
                "cbnz	%w[oddk], 2f\n"

                // Detached final iteration (even K)
                "smull	v14.8h, v0.8b, %[b2].8b\n"
                "smull	v15.8h, v0.8b, %[b3].8b\n"
                "ldr	%q[b0a], [%[b_ptr], #64]\n"
                "smlal2	v12.8h, v0.16b, %[b0].16b\n"
                "smlal2	v13.8h, v0.16b, %[b1].16b\n"
                "ldr	%q[b1a], [%[b_ptr], #80]\n"
                "smlal2	v14.8h, v0.16b, %[b2].16b\n"
                "smlal2	v15.8h, v0.16b, %[b3].16b\n"
                "ldr 	q0, [%[a_ptr], #64]\n"

                "sadalp	v16.4s, v12.8h\n"
                "smull	v12.8h, v1.8b, %[b0].8b\n"
                "sadalp	v17.4s, v13.8h\n"
                "sadalp	v18.4s, v14.8h\n"
                "smull	v13.8h, v1.8b, %[b1].8b\n"
                "sadalp	v19.4s, v15.8h\n"
                "smull	v14.8h, v1.8b, %[b2].8b\n"
                "ldr	%q[b2a], [%[b_ptr], #96]\n"
                "smull	v15.8h, v1.8b, %[b3].8b\n"
                "smlal2	v12.8h, v1.16b, %[b0].16b\n"
                "ldr	%q[b3a], [%[b_ptr], #112]\n"
                "smlal2	v13.8h, v1.16b, %[b1].16b\n"
                "add	%[b_ptr], %[b_ptr], #128\n"
                "smlal2	v14.8h, v1.16b, %[b2].16b\n"
                "smlal2	v15.8h, v1.16b, %[b3].16b\n"
                "ldr 	q1, [%[a_ptr], #80]\n"

                "sadalp	v20.4s, v12.8h\n"
                "smull	v12.8h, v2.8b, %[b0].8b\n"
                "sadalp	v21.4s, v13.8h\n"
                "sadalp	v22.4s, v14.8h\n"
                "smull	v13.8h, v2.8b, %[b1].8b\n"
                "sadalp	v23.4s, v15.8h\n"
                "smull	v14.8h, v2.8b, %[b2].8b\n"
                "smull	v15.8h, v2.8b, %[b3].8b\n"
                "smlal2	v12.8h, v2.16b, %[b0].16b\n"
                "smlal2	v13.8h, v2.16b, %[b1].16b\n"
                "smlal2	v14.8h, v2.16b, %[b2].16b\n"
                "smlal2	v15.8h, v2.16b, %[b3].16b\n"
                "ldr 	q2, [%[a_ptr], #96]\n"

                "sadalp	v24.4s, v12.8h\n"
                "smull	v12.8h, v3.8b, %[b0].8b\n"
                "sadalp	v25.4s, v13.8h\n"
                "sadalp	v26.4s, v14.8h\n"
                "smull	v13.8h, v3.8b, %[b1].8b\n"
                "sadalp	v27.4s, v15.8h\n"
                "smull	v14.8h, v3.8b, %[b2].8b\n"
                "smull	v15.8h, v3.8b, %[b3].8b\n"
                "smlal2	v12.8h, v3.16b, %[b0].16b\n"
                "smlal2	v13.8h, v3.16b, %[b1].16b\n"
                "smlal2	v14.8h, v3.16b, %[b2].16b\n"
                "smlal2	v15.8h, v3.16b, %[b3].16b\n"
                "ldr 	q3, [%[a_ptr], #112]\n"

                // Unroll 1
                "sadalp	v28.4s, v12.8h\n"
                "smull	v12.8h, v0.8b, %[b0a].8b\n"
                "sadalp	v29.4s, v13.8h\n"
                "sadalp	v30.4s, v14.8h\n"
                "smull	v13.8h, v0.8b, %[b1a].8b\n"
                "sadalp	v31.4s, v15.8h\n"
                "smull	v14.8h, v0.8b, %[b2a].8b\n"
                "add	%[a_ptr], %[a_ptr], #128\n"
                "smull	v15.8h, v0.8b, %[b3a].8b\n"
                "smlal2	v12.8h, v0.16b, %[b0a].16b\n"
                "smlal2	v13.8h, v0.16b, %[b1a].16b\n"
                "smlal2	v14.8h, v0.16b, %[b2a].16b\n"
                "smlal2	v15.8h, v0.16b, %[b3a].16b\n"

                "sadalp	v16.4s, v12.8h\n"
                "smull	v12.8h, v1.8b, %[b0a].8b\n"
                "sadalp	v17.4s, v13.8h\n"
                "sadalp	v18.4s, v14.8h\n"
                "smull	v13.8h, v1.8b, %[b1a].8b\n"
                "sadalp	v19.4s, v15.8h\n"
                "smull	v14.8h, v1.8b, %[b2a].8b\n"
                "smull	v15.8h, v1.8b, %[b3a].8b\n"
                "smlal2	v12.8h, v1.16b, %[b0a].16b\n"
                "addp	v16.4s, v16.4s, v17.4s\n"
                "smlal2	v13.8h, v1.16b, %[b1a].16b\n"
                "addp	v17.4s, v18.4s, v19.4s\n"
                "smlal2	v14.8h, v1.16b, %[b2a].16b\n"
                "smlal2	v15.8h, v1.16b, %[b3a].16b\n"

                "sadalp	v20.4s, v12.8h\n"
                "smull	v12.8h, v2.8b, %[b0a].8b\n"
                "sadalp	v21.4s, v13.8h\n"
                "sadalp	v22.4s, v14.8h\n"
                "smull	v13.8h, v2.8b, %[b1a].8b\n"
                "sadalp	v23.4s, v15.8h\n"
                "addp	v16.4s, v16.4s, v17.4s\n"
                "smull	v14.8h, v2.8b, %[b2a].8b\n"
                "addp	v18.4s, v20.4s, v21.4s\n"
                "addp	v19.4s, v22.4s, v23.4s\n"
                "smull	v15.8h, v2.8b, %[b3a].8b\n"
                "smlal2	v12.8h, v2.16b, %[b0a].16b\n"
                "str	q16, [%[c_ptr]]\n"
                "smlal2	v13.8h, v2.16b, %[b1a].16b\n"
                "smlal2	v14.8h, v2.16b, %[b2a].16b\n"
                "smlal2	v15.8h, v2.16b, %[b3a].16b\n"

                "sadalp	v24.4s, v12.8h\n"
                "smull	v12.8h, v3.8b, %[b0a].8b\n"
                "sadalp	v25.4s, v13.8h\n"
                "sadalp	v26.4s, v14.8h\n"
                "smull	v13.8h, v3.8b, %[b1a].8b\n"
                "sadalp	v27.4s, v15.8h\n"
                "addp	v17.4s, v18.4s, v19.4s\n"
                "smull	v14.8h, v3.8b, %[b2a].8b\n"
                "addp	v20.4s, v24.4s, v25.4s\n"
                "addp	v21.4s, v26.4s, v27.4s\n"
                "smull	v15.8h, v3.8b, %[b3a].8b\n"
                "smlal2	v12.8h, v3.16b, %[b0a].16b\n"
                "str	q17, [%[c_ptr], #16]\n"
                "smlal2	v13.8h, v3.16b, %[b1a].16b\n"
                "smlal2	v14.8h, v3.16b, %[b2a].16b\n"
                "addp	v18.4s, v20.4s, v21.4s\n"
                "smlal2	v15.8h, v3.16b, %[b3a].16b\n"
                "b	3f\n"

                // Detached final iteration (odd K)
                "2:\n"
                "smull	v14.8h, v0.8b, %[b2].8b\n"
                "add	%[a_ptr], %[a_ptr], #64\n"
                "smull	v15.8h, v0.8b, %[b3].8b\n"
                "add	%[b_ptr], %[b_ptr], #64\n"
                "smlal2	v12.8h, v0.16b, %[b0].16b\n"
                "smlal2	v13.8h, v0.16b, %[b1].16b\n"
                "smlal2	v14.8h, v0.16b, %[b2].16b\n"
                "smlal2	v15.8h, v0.16b, %[b3].16b\n"

                "sadalp	v16.4s, v12.8h\n"
                "smull	v12.8h, v1.8b, %[b0].8b\n"
                "sadalp	v17.4s, v13.8h\n"
                "sadalp	v18.4s, v14.8h\n"
                "smull	v13.8h, v1.8b, %[b1].8b\n"
                "sadalp	v19.4s, v15.8h\n"
                "smull	v14.8h, v1.8b, %[b2].8b\n"
                "smull	v15.8h, v1.8b, %[b3].8b\n"
                "smlal2	v12.8h, v1.16b, %[b0].16b\n"
                "addp	v16.4s, v16.4s, v17.4s\n"
                "smlal2	v13.8h, v1.16b, %[b1].16b\n"
                "addp	v17.4s, v18.4s, v19.4s\n"
                "smlal2	v14.8h, v1.16b, %[b2].16b\n"
                "smlal2	v15.8h, v1.16b, %[b3].16b\n"

                "sadalp	v20.4s, v12.8h\n"
                "smull	v12.8h, v2.8b, %[b0].8b\n"
                "sadalp	v21.4s, v13.8h\n"
                "sadalp	v22.4s, v14.8h\n"
                "smull	v13.8h, v2.8b, %[b1].8b\n"
                "sadalp	v23.4s, v15.8h\n"
                "addp	v16.4s, v16.4s, v17.4s\n"
                "smull	v14.8h, v2.8b, %[b2].8b\n"
                "addp	v18.4s, v20.4s, v21.4s\n"
                "addp	v19.4s, v22.4s, v23.4s\n"
                "smull	v15.8h, v2.8b, %[b3].8b\n"
                "smlal2	v12.8h, v2.16b, %[b0].16b\n"
                "str	q16, [%[c_ptr]]\n"
                "smlal2	v13.8h, v2.16b, %[b1].16b\n"
                "smlal2	v14.8h, v2.16b, %[b2].16b\n"
                "smlal2	v15.8h, v2.16b, %[b3].16b\n"

                "sadalp	v24.4s, v12.8h\n"
                "smull	v12.8h, v3.8b, %[b0].8b\n"
                "sadalp	v25.4s, v13.8h\n"
                "sadalp	v26.4s, v14.8h\n"
                "smull	v13.8h, v3.8b, %[b1].8b\n"
                "sadalp	v27.4s, v15.8h\n"
                "addp	v17.4s, v18.4s, v19.4s\n"
                "smull	v14.8h, v3.8b, %[b2].8b\n"
                "addp	v20.4s, v24.4s, v25.4s\n"
                "addp	v21.4s, v26.4s, v27.4s\n"
                "smull	v15.8h, v3.8b, %[b3].8b\n"
                "smlal2	v12.8h, v3.16b, %[b0].16b\n"
                "str	q17, [%[c_ptr], #16]\n"
                "smlal2	v13.8h, v3.16b, %[b1].16b\n"
                "smlal2	v14.8h, v3.16b, %[b2].16b\n"
                "addp	v18.4s, v20.4s, v21.4s\n"
                "smlal2	v15.8h, v3.16b, %[b3].16b\n"

                "3:\n"

                // Final additions
                "sadalp	v28.4s, v12.8h\n"
                "str	q18, [%[c_ptr], #32]\n"
                "sadalp	v29.4s, v13.8h\n"
                "sadalp	v30.4s, v14.8h\n"
                "sadalp	v31.4s, v15.8h\n"

                // Horizontal reduction, phase 1
                "addp	v22.4s, v28.4s, v29.4s\n"
                "addp	v23.4s, v30.4s, v31.4s\n"

                // Horizontal reduction, phase 2
                "addp	v19.4s, v22.4s, v23.4s\n"
                "str	q19, [%[c_ptr], #48]\n"
                "add	%[c_ptr], %[c_ptr], #64\n"

            :
              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
              [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a),
              [k] "+r" (k)
            : [oddk] "r" (oddk)
            // NOTE(review): x20/x21 are declared clobbered but never appear
            // in the visible asm text — presumably used by the ASM_PREFETCH
            // macro; confirm against asmlib.hpp.
            : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
              "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
        }
    }
}
467 
468 } // namespace arm_gemm
469 
470 #endif // __aarch64__
471