1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifdef __aarch64__
25 
26 #include <algorithm>
27 
28 #include "arm_gemm.hpp"
29 
30 #include <cstdint>
31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
33 
34 namespace arm_gemm {
35 
a64_smallK_hybrid_s8s32_dot_8x4(const int8_t * A,int lda,const int8_t * B,int32_t * C,int ldc,int M,int N,int K,const int32_t *,Activation,bool)36 void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
37     const long loops_count = iceildiv(N, (int)4) - 1;
38     const long ldab = lda * sizeof(int8_t);
39     const long ldcb = ldc * sizeof(int32_t);
40     const long odds_count = K % 4;
41     K = (K + 3) / 4;
42 
43     for (int y0=0; y0<M; y0+=8) {
44         long loops = loops_count;
45         long oob_rows = std::max(8 - (M-y0), 0);
46         long odds = odds_count;
47         const int8_t *b_ptr0 = B;
48         const int8_t *a_ptr0 = A + (y0 * lda);
49 
50         int32_t *c_ptr0 = C + (y0 * ldc);
51 
52         switch(K) {
53             case 1:
54                 __asm __volatile (
55                     "a_ptr1 .req X0\n"
56                     "a_ptr2 .req X1\n"
57                     "a_ptr3 .req X2\n"
58                     "a_ptr4 .req X3\n"
59                     "a_ptr5 .req X4\n"
60                     "a_ptr6 .req X5\n"
61                     "a_ptr7 .req X6\n"
62                     "c_ptr1 .req X7\n"
63                     "c_ptr2 .req X8\n"
64                     "c_ptr3 .req X9\n"
65                     "c_ptr4 .req X10\n"
66                     "c_ptr5 .req X11\n"
67                     "c_ptr6 .req X12\n"
68                     "c_ptr7 .req X13\n"
69                     "add a_ptr1, %[a_ptr0], %[lda]\n"
70                     "add c_ptr1, %[c_ptr0], %[ldc]\n"
71                     "add a_ptr2, a_ptr1, %[lda]\n"
72                     "add c_ptr2, c_ptr1, %[ldc]\n"
73                     "add a_ptr3, a_ptr2, %[lda]\n"
74                     "add c_ptr3, c_ptr2, %[ldc]\n"
75                     "add a_ptr4, a_ptr3, %[lda]\n"
76                     "add c_ptr4, c_ptr3, %[ldc]\n"
77                     "add a_ptr5, a_ptr4, %[lda]\n"
78                     "add c_ptr5, c_ptr4, %[ldc]\n"
79                     "add a_ptr6, a_ptr5, %[lda]\n"
80                     "add c_ptr6, c_ptr5, %[ldc]\n"
81                     "add a_ptr7, a_ptr6, %[lda]\n"
82                     "add c_ptr7, c_ptr6, %[ldc]\n"
83                     "cbz %[oob_rows], 1f\n"
84                     "subs %[oob_rows], %[oob_rows], #0x1\n"
85                     "add c_ptr7, %[c_ptr0], #0x0\n"
86                     "add a_ptr7, %[a_ptr0], #0x0\n"
87                     "b.eq 1f\n"
88                     "subs %[oob_rows], %[oob_rows], #0x1\n"
89                     "add c_ptr6, %[c_ptr0], #0x0\n"
90                     "add a_ptr6, %[a_ptr0], #0x0\n"
91                     "b.eq 1f\n"
92                     "subs %[oob_rows], %[oob_rows], #0x1\n"
93                     "add c_ptr5, %[c_ptr0], #0x0\n"
94                     "add a_ptr5, %[a_ptr0], #0x0\n"
95                     "b.eq 1f\n"
96                     "subs %[oob_rows], %[oob_rows], #0x1\n"
97                     "add c_ptr4, %[c_ptr0], #0x0\n"
98                     "add a_ptr4, %[a_ptr0], #0x0\n"
99                     "b.eq 1f\n"
100                     "subs %[oob_rows], %[oob_rows], #0x1\n"
101                     "add c_ptr3, %[c_ptr0], #0x0\n"
102                     "add a_ptr3, %[a_ptr0], #0x0\n"
103                     "b.eq 1f\n"
104                     "subs %[oob_rows], %[oob_rows], #0x1\n"
105                     "add c_ptr2, %[c_ptr0], #0x0\n"
106                     "add a_ptr2, %[a_ptr0], #0x0\n"
107                     "b.eq 1f\n"
108                     "subs %[oob_rows], %[oob_rows], #0x1\n"
109                     "add c_ptr1, %[c_ptr0], #0x0\n"
110                     "add a_ptr1, %[a_ptr0], #0x0\n"
111                     "1:\n"
112                     "cbnz %[odds], 2f\n"
113                     "ldr s0, [%[a_ptr0]]\n"
114                     "ldr s1, [a_ptr1]\n"
115                     "ldr s2, [a_ptr2]\n"
116                     "ldr s3, [a_ptr3]\n"
117                     "ldr s4, [a_ptr4]\n"
118                     "ldr s5, [a_ptr5]\n"
119                     "ldr s6, [a_ptr6]\n"
120                     "ldr s7, [a_ptr7]\n"
121                     "b 3f\n"
122                     "2:\n"
123                     "subs %[odds], %[odds], #0x1\n"
124                     "b.ne 4f\n"
125                     "ldr b0, [%[a_ptr0]]\n"
126                     "ldr b1, [a_ptr1]\n"
127                     "ldr b2, [a_ptr2]\n"
128                     "ldr b3, [a_ptr3]\n"
129                     "ldr b4, [a_ptr4]\n"
130                     "ldr b5, [a_ptr5]\n"
131                     "ldr b6, [a_ptr6]\n"
132                     "ldr b7, [a_ptr7]\n"
133                     "b 3f\n"
134                     "4:\n"
135                     "ldr h0, [%[a_ptr0]], #0x2\n"
136                     "ldr h1, [a_ptr1], #0x2\n"
137                     "ldr h2, [a_ptr2], #0x2\n"
138                     "ldr h3, [a_ptr3], #0x2\n"
139                     "ldr h4, [a_ptr4], #0x2\n"
140                     "ldr h5, [a_ptr5], #0x2\n"
141                     "ldr h6, [a_ptr6], #0x2\n"
142                     "ldr h7, [a_ptr7], #0x2\n"
143                     "subs %[odds], %[odds], #0x1\n"
144                     "b.ne 5f\n"
145                     "b 3f\n"
146                     "5:\n"
147                     "ld1 {v0.b}[2], [%[a_ptr0]]\n"
148                     "ld1 {v1.b}[2], [a_ptr1]\n"
149                     "ld1 {v2.b}[2], [a_ptr2]\n"
150                     "ld1 {v3.b}[2], [a_ptr3]\n"
151                     "ld1 {v4.b}[2], [a_ptr4]\n"
152                     "ld1 {v5.b}[2], [a_ptr5]\n"
153                     "ld1 {v6.b}[2], [a_ptr6]\n"
154                     "ld1 {v7.b}[2], [a_ptr7]\n"
155                     "3:\n"
156                     "ldr q16, [%[b_ptr0]]\n"
157                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
158                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
159                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
160                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
161                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
162                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
163                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
164                     "cbz %[loops], 6f\n"
165                     "movi v24.4s, #0\n"
166                     "subs %[loops], %[loops], #0x1\n"
167                     "movi v25.4s, #0\n"
168                     "movi v26.4s, #0\n"
169                     "movi v27.4s, #0\n"
170                     "movi v28.4s, #0\n"
171                     "movi v29.4s, #0\n"
172                     "movi v30.4s, #0\n"
173                     "movi v31.4s, #0\n"
174                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
175                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
176                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
177                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
178                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
179                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
180                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
181                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
182                     "b.eq 7f\n"
183                     "8:\n"
184                     "str q24, [%[c_ptr0]]\n"
185                     "subs %[loops], %[loops], #0x1\n"
186                     "movi v24.4s, #0\n"
187                     "ldr q16, [%[b_ptr0]]\n"
188                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
189                     "str q25, [c_ptr1]\n"
190                     "add c_ptr1, c_ptr1, #0x10\n"
191                     "movi v25.4s, #0\n"
192                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
193                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
194                     "str q26, [c_ptr2]\n"
195                     "movi v26.4s, #0\n"
196                     "add c_ptr2, c_ptr2, #0x10\n"
197                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
198                     "str q27, [c_ptr3]\n"
199                     "movi v27.4s, #0\n"
200                     "add c_ptr3, c_ptr3, #0x10\n"
201                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
202                     "str q28, [c_ptr4]\n"
203                     "movi v28.4s, #0\n"
204                     "add c_ptr4, c_ptr4, #0x10\n"
205                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
206                     "str q29, [c_ptr5]\n"
207                     "movi v29.4s, #0\n"
208                     "add c_ptr5, c_ptr5, #0x10\n"
209                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
210                     "str q30, [c_ptr6]\n"
211                     "movi v30.4s, #0\n"
212                     "add c_ptr6, c_ptr6, #0x10\n"
213                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
214                     "str q31, [c_ptr7]\n"
215                     "movi v31.4s, #0\n"
216                     "add c_ptr7, c_ptr7, #0x10\n"
217                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
218                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
219                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
220                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
221                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
222                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
223                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
224                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
225                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
226                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
227                     "b.ne 8b\n"
228                     "7:\n"
229                     "str q24, [%[c_ptr0]]\n"
230                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
231                     "movi v24.4s, #0\n"
232                     "ldr q16, [%[b_ptr0]]\n"
233                     "add %[b_ptr0], %[b_ptr0], #0x10\n"
234                     "str q25, [c_ptr1]\n"
235                     "add c_ptr1, c_ptr1, #0x10\n"
236                     "movi v25.4s, #0\n"
237                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
238                     "str q26, [c_ptr2]\n"
239                     "movi v26.4s, #0\n"
240                     "add c_ptr2, c_ptr2, #0x10\n"
241                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
242                     "str q27, [c_ptr3]\n"
243                     "movi v27.4s, #0\n"
244                     "add c_ptr3, c_ptr3, #0x10\n"
245                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
246                     "str q28, [c_ptr4]\n"
247                     "movi v28.4s, #0\n"
248                     "add c_ptr4, c_ptr4, #0x10\n"
249                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
250                     "str q29, [c_ptr5]\n"
251                     "movi v29.4s, #0\n"
252                     "add c_ptr5, c_ptr5, #0x10\n"
253                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
254                     "str q30, [c_ptr6]\n"
255                     "movi v30.4s, #0\n"
256                     "add c_ptr6, c_ptr6, #0x10\n"
257                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
258                     "str q31, [c_ptr7]\n"
259                     "movi v31.4s, #0\n"
260                     "add c_ptr7, c_ptr7, #0x10\n"
261                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
262                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
263                     "b 9f\n"
264                     "6:\n"
265                     "movi v24.4s, #0\n"
266                     "movi v25.4s, #0\n"
267                     "movi v26.4s, #0\n"
268                     "movi v27.4s, #0\n"
269                     "movi v28.4s, #0\n"
270                     "movi v29.4s, #0\n"
271                     "movi v30.4s, #0\n"
272                     "movi v31.4s, #0\n"
273                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
274                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
275                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
276                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
277                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
278                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
279                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
280                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
281                     "9:\n"
282                     "str q24, [%[c_ptr0]]\n"
283                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
284                     "str q25, [c_ptr1]\n"
285                     "str q26, [c_ptr2]\n"
286                     "str q27, [c_ptr3]\n"
287                     "str q28, [c_ptr4]\n"
288                     "str q29, [c_ptr5]\n"
289                     "str q30, [c_ptr6]\n"
290                     "str q31, [c_ptr7]\n"
291                     ".unreq a_ptr1\n"
292                     ".unreq a_ptr2\n"
293                     ".unreq a_ptr3\n"
294                     ".unreq a_ptr4\n"
295                     ".unreq a_ptr5\n"
296                     ".unreq a_ptr6\n"
297                     ".unreq a_ptr7\n"
298                     ".unreq c_ptr1\n"
299                     ".unreq c_ptr2\n"
300                     ".unreq c_ptr3\n"
301                     ".unreq c_ptr4\n"
302                     ".unreq c_ptr5\n"
303                     ".unreq c_ptr6\n"
304                     ".unreq c_ptr7\n"
305                     : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
306                     : [lda] "r" (ldab), [ldc] "r" (ldcb)
307                     : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
308                 );
309                 break;
310             case 2:
311                 __asm __volatile (
312                     "a_ptr1 .req X0\n"
313                     "a_ptr2 .req X1\n"
314                     "a_ptr3 .req X2\n"
315                     "a_ptr4 .req X3\n"
316                     "a_ptr5 .req X4\n"
317                     "a_ptr6 .req X5\n"
318                     "a_ptr7 .req X6\n"
319                     "c_ptr1 .req X7\n"
320                     "c_ptr2 .req X8\n"
321                     "c_ptr3 .req X9\n"
322                     "c_ptr4 .req X10\n"
323                     "c_ptr5 .req X11\n"
324                     "c_ptr6 .req X12\n"
325                     "c_ptr7 .req X13\n"
326                     "add a_ptr1, %[a_ptr0], %[lda]\n"
327                     "add c_ptr1, %[c_ptr0], %[ldc]\n"
328                     "add a_ptr2, a_ptr1, %[lda]\n"
329                     "add c_ptr2, c_ptr1, %[ldc]\n"
330                     "add a_ptr3, a_ptr2, %[lda]\n"
331                     "add c_ptr3, c_ptr2, %[ldc]\n"
332                     "add a_ptr4, a_ptr3, %[lda]\n"
333                     "add c_ptr4, c_ptr3, %[ldc]\n"
334                     "add a_ptr5, a_ptr4, %[lda]\n"
335                     "add c_ptr5, c_ptr4, %[ldc]\n"
336                     "add a_ptr6, a_ptr5, %[lda]\n"
337                     "add c_ptr6, c_ptr5, %[ldc]\n"
338                     "add a_ptr7, a_ptr6, %[lda]\n"
339                     "add c_ptr7, c_ptr6, %[ldc]\n"
340                     "cbz %[oob_rows], 1f\n"
341                     "subs %[oob_rows], %[oob_rows], #0x1\n"
342                     "add c_ptr7, %[c_ptr0], #0x0\n"
343                     "add a_ptr7, %[a_ptr0], #0x0\n"
344                     "b.eq 1f\n"
345                     "subs %[oob_rows], %[oob_rows], #0x1\n"
346                     "add c_ptr6, %[c_ptr0], #0x0\n"
347                     "add a_ptr6, %[a_ptr0], #0x0\n"
348                     "b.eq 1f\n"
349                     "subs %[oob_rows], %[oob_rows], #0x1\n"
350                     "add c_ptr5, %[c_ptr0], #0x0\n"
351                     "add a_ptr5, %[a_ptr0], #0x0\n"
352                     "b.eq 1f\n"
353                     "subs %[oob_rows], %[oob_rows], #0x1\n"
354                     "add c_ptr4, %[c_ptr0], #0x0\n"
355                     "add a_ptr4, %[a_ptr0], #0x0\n"
356                     "b.eq 1f\n"
357                     "subs %[oob_rows], %[oob_rows], #0x1\n"
358                     "add c_ptr3, %[c_ptr0], #0x0\n"
359                     "add a_ptr3, %[a_ptr0], #0x0\n"
360                     "b.eq 1f\n"
361                     "subs %[oob_rows], %[oob_rows], #0x1\n"
362                     "add c_ptr2, %[c_ptr0], #0x0\n"
363                     "add a_ptr2, %[a_ptr0], #0x0\n"
364                     "b.eq 1f\n"
365                     "subs %[oob_rows], %[oob_rows], #0x1\n"
366                     "add c_ptr1, %[c_ptr0], #0x0\n"
367                     "add a_ptr1, %[a_ptr0], #0x0\n"
368                     "1:\n"
369                     "cbnz %[odds], 2f\n"
370                     "ldr d0, [%[a_ptr0]]\n"
371                     "ldr d1, [a_ptr1]\n"
372                     "ldr d2, [a_ptr2]\n"
373                     "ldr d3, [a_ptr3]\n"
374                     "ldr d4, [a_ptr4]\n"
375                     "ldr d5, [a_ptr5]\n"
376                     "ldr d6, [a_ptr6]\n"
377                     "ldr d7, [a_ptr7]\n"
378                     "b 3f\n"
379                     "2:\n"
380                     "ldr s0, [%[a_ptr0]], #0x4\n"
381                     "ldr s1, [a_ptr1], #0x4\n"
382                     "ldr s2, [a_ptr2], #0x4\n"
383                     "ldr s3, [a_ptr3], #0x4\n"
384                     "ldr s4, [a_ptr4], #0x4\n"
385                     "ldr s5, [a_ptr5], #0x4\n"
386                     "ldr s6, [a_ptr6], #0x4\n"
387                     "ldr s7, [a_ptr7], #0x4\n"
388                     "subs %[odds], %[odds], #0x1\n"
389                     "b.ne 4f\n"
390                     "ld1 {v0.b}[4], [%[a_ptr0]]\n"
391                     "ld1 {v1.b}[4], [a_ptr1]\n"
392                     "ld1 {v2.b}[4], [a_ptr2]\n"
393                     "ld1 {v3.b}[4], [a_ptr3]\n"
394                     "ld1 {v4.b}[4], [a_ptr4]\n"
395                     "ld1 {v5.b}[4], [a_ptr5]\n"
396                     "ld1 {v6.b}[4], [a_ptr6]\n"
397                     "ld1 {v7.b}[4], [a_ptr7]\n"
398                     "b 3f\n"
399                     "4:\n"
400                     "ld1 {v0.h}[2], [%[a_ptr0]], #2\n"
401                     "ld1 {v1.h}[2], [a_ptr1], #2\n"
402                     "ld1 {v2.h}[2], [a_ptr2], #2\n"
403                     "ld1 {v3.h}[2], [a_ptr3], #2\n"
404                     "ld1 {v4.h}[2], [a_ptr4], #2\n"
405                     "ld1 {v5.h}[2], [a_ptr5], #2\n"
406                     "ld1 {v6.h}[2], [a_ptr6], #2\n"
407                     "ld1 {v7.h}[2], [a_ptr7], #2\n"
408                     "subs %[odds], %[odds], #0x1\n"
409                     "b.ne 5f\n"
410                     "b 3f\n"
411                     "5:\n"
412                     "ld1 {v0.b}[6], [%[a_ptr0]]\n"
413                     "ld1 {v1.b}[6], [a_ptr1]\n"
414                     "ld1 {v2.b}[6], [a_ptr2]\n"
415                     "ld1 {v3.b}[6], [a_ptr3]\n"
416                     "ld1 {v4.b}[6], [a_ptr4]\n"
417                     "ld1 {v5.b}[6], [a_ptr5]\n"
418                     "ld1 {v6.b}[6], [a_ptr6]\n"
419                     "ld1 {v7.b}[6], [a_ptr7]\n"
420                     "3:\n"
421                     "ldr q16, [%[b_ptr0]]\n"
422                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
423                     "ldr q17, [%[b_ptr0], #0x10]\n"
424                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
425                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
426                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
427                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
428                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
429                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
430                     "cbz %[loops], 6f\n"
431                     "movi v24.4s, #0\n"
432                     "subs %[loops], %[loops], #0x1\n"
433                     "movi v25.4s, #0\n"
434                     "movi v26.4s, #0\n"
435                     "movi v27.4s, #0\n"
436                     "movi v28.4s, #0\n"
437                     "movi v29.4s, #0\n"
438                     "movi v30.4s, #0\n"
439                     "movi v31.4s, #0\n"
440                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
441                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
442                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
443                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
444                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
445                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
446                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
447                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
448                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
449                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
450                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
451                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
452                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
453                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
454                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
455                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
456                     "b.eq 7f\n"
457                     "8:\n"
458                     "str q24, [%[c_ptr0]]\n"
459                     "subs %[loops], %[loops], #0x1\n"
460                     "movi v24.4s, #0\n"
461                     "ldr q16, [%[b_ptr0]]\n"
462                     "ldr q17, [%[b_ptr0], #0x10]\n"
463                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
464                     "str q25, [c_ptr1]\n"
465                     "add c_ptr1, c_ptr1, #0x10\n"
466                     "movi v25.4s, #0\n"
467                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
468                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
469                     "str q26, [c_ptr2]\n"
470                     "movi v26.4s, #0\n"
471                     "add c_ptr2, c_ptr2, #0x10\n"
472                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
473                     "str q27, [c_ptr3]\n"
474                     "movi v27.4s, #0\n"
475                     "add c_ptr3, c_ptr3, #0x10\n"
476                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
477                     "str q28, [c_ptr4]\n"
478                     "movi v28.4s, #0\n"
479                     "add c_ptr4, c_ptr4, #0x10\n"
480                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
481                     "str q29, [c_ptr5]\n"
482                     "movi v29.4s, #0\n"
483                     "add c_ptr5, c_ptr5, #0x10\n"
484                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
485                     "str q30, [c_ptr6]\n"
486                     "movi v30.4s, #0\n"
487                     "add c_ptr6, c_ptr6, #0x10\n"
488                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
489                     "str q31, [c_ptr7]\n"
490                     "movi v31.4s, #0\n"
491                     "add c_ptr7, c_ptr7, #0x10\n"
492                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
493                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
494                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
495                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
496                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
497                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
498                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
499                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
500                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
501                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
502                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
503                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
504                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
505                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
506                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
507                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
508                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
509                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
510                     "b.ne 8b\n"
511                     "7:\n"
512                     "str q24, [%[c_ptr0]]\n"
513                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
514                     "movi v24.4s, #0\n"
515                     "ldr q16, [%[b_ptr0]]\n"
516                     "ldr q17, [%[b_ptr0], #0x10]\n"
517                     "add %[b_ptr0], %[b_ptr0], #0x20\n"
518                     "str q25, [c_ptr1]\n"
519                     "add c_ptr1, c_ptr1, #0x10\n"
520                     "movi v25.4s, #0\n"
521                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
522                     "str q26, [c_ptr2]\n"
523                     "movi v26.4s, #0\n"
524                     "add c_ptr2, c_ptr2, #0x10\n"
525                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
526                     "str q27, [c_ptr3]\n"
527                     "movi v27.4s, #0\n"
528                     "add c_ptr3, c_ptr3, #0x10\n"
529                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
530                     "str q28, [c_ptr4]\n"
531                     "movi v28.4s, #0\n"
532                     "add c_ptr4, c_ptr4, #0x10\n"
533                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
534                     "str q29, [c_ptr5]\n"
535                     "movi v29.4s, #0\n"
536                     "add c_ptr5, c_ptr5, #0x10\n"
537                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
538                     "str q30, [c_ptr6]\n"
539                     "movi v30.4s, #0\n"
540                     "add c_ptr6, c_ptr6, #0x10\n"
541                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
542                     "str q31, [c_ptr7]\n"
543                     "movi v31.4s, #0\n"
544                     "add c_ptr7, c_ptr7, #0x10\n"
545                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
546                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
547                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
548                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
549                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
550                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
551                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
552                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
553                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
554                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
555                     "b 9f\n"
556                     "6:\n"
557                     "movi v24.4s, #0\n"
558                     "movi v25.4s, #0\n"
559                     "movi v26.4s, #0\n"
560                     "movi v27.4s, #0\n"
561                     "movi v28.4s, #0\n"
562                     "movi v29.4s, #0\n"
563                     "movi v30.4s, #0\n"
564                     "movi v31.4s, #0\n"
565                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
566                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
567                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
568                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
569                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
570                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
571                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
572                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
573                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
574                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
575                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
576                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
577                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
578                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
579                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
580                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
581                     "9:\n"
582                     "str q24, [%[c_ptr0]]\n"
583                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
584                     "str q25, [c_ptr1]\n"
585                     "str q26, [c_ptr2]\n"
586                     "str q27, [c_ptr3]\n"
587                     "str q28, [c_ptr4]\n"
588                     "str q29, [c_ptr5]\n"
589                     "str q30, [c_ptr6]\n"
590                     "str q31, [c_ptr7]\n"
591                     ".unreq a_ptr1\n"
592                     ".unreq a_ptr2\n"
593                     ".unreq a_ptr3\n"
594                     ".unreq a_ptr4\n"
595                     ".unreq a_ptr5\n"
596                     ".unreq a_ptr6\n"
597                     ".unreq a_ptr7\n"
598                     ".unreq c_ptr1\n"
599                     ".unreq c_ptr2\n"
600                     ".unreq c_ptr3\n"
601                     ".unreq c_ptr4\n"
602                     ".unreq c_ptr5\n"
603                     ".unreq c_ptr6\n"
604                     ".unreq c_ptr7\n"
605                     : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
606                     : [lda] "r" (ldab), [ldc] "r" (ldcb)
607                     : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
608                 );
609                 break;
610             case 3:
                    // This arm reads 8 bytes plus a tail of 1-4 bytes from each of eight A rows
                    // and, for every 16-byte-wide block of C, combines them with three 16-byte
                    // B vectors using sdot lanes 0, 1 and 2.
                    // NOTE(review): the switch expression is outside this view; presumably it is
                    // K after the round-up to groups of four done at function entry - confirm.
611                 __asm __volatile (
                        // Assembler aliases: x0-x6 hold the A pointers for rows 1-7 and x7-x13
                        // hold the C pointers for rows 1-7. Row 0 uses the a_ptr0 / c_ptr0 operands.
612                     "a_ptr1 .req X0\n"
613                     "a_ptr2 .req X1\n"
614                     "a_ptr3 .req X2\n"
615                     "a_ptr4 .req X3\n"
616                     "a_ptr5 .req X4\n"
617                     "a_ptr6 .req X5\n"
618                     "a_ptr7 .req X6\n"
619                     "c_ptr1 .req X7\n"
620                     "c_ptr2 .req X8\n"
621                     "c_ptr3 .req X9\n"
622                     "c_ptr4 .req X10\n"
623                     "c_ptr5 .req X11\n"
624                     "c_ptr6 .req X12\n"
625                     "c_ptr7 .req X13\n"
                        // Build the row pointers by repeatedly adding the lda / ldc operands
                        // (bound to ldab / ldcb in the operand list below).
626                     "add a_ptr1, %[a_ptr0], %[lda]\n"
627                     "add c_ptr1, %[c_ptr0], %[ldc]\n"
628                     "add a_ptr2, a_ptr1, %[lda]\n"
629                     "add c_ptr2, c_ptr1, %[ldc]\n"
630                     "add a_ptr3, a_ptr2, %[lda]\n"
631                     "add c_ptr3, c_ptr2, %[ldc]\n"
632                     "add a_ptr4, a_ptr3, %[lda]\n"
633                     "add c_ptr4, c_ptr3, %[ldc]\n"
634                     "add a_ptr5, a_ptr4, %[lda]\n"
635                     "add c_ptr5, c_ptr4, %[ldc]\n"
636                     "add a_ptr6, a_ptr5, %[lda]\n"
637                     "add c_ptr6, c_ptr5, %[ldc]\n"
638                     "add a_ptr7, a_ptr6, %[lda]\n"
639                     "add c_ptr7, c_ptr6, %[ldc]\n"
                        // For each unit of oob_rows, starting from row 7 and working down, point
                        // that row's A and C pointers at row 0 so it reads and writes row 0's
                        // memory instead of its own. Each subs sets the flags tested by the
                        // following b.eq, which exits once oob_rows reaches zero.
                        // NOTE(review): oob_rows is initialised outside this view; presumably it
                        // is the number of rows in this group of 8 that lie beyond M - confirm.
640                     "cbz %[oob_rows], 1f\n"
641                     "subs %[oob_rows], %[oob_rows], #0x1\n"
642                     "add c_ptr7, %[c_ptr0], #0x0\n"
643                     "add a_ptr7, %[a_ptr0], #0x0\n"
644                     "b.eq 1f\n"
645                     "subs %[oob_rows], %[oob_rows], #0x1\n"
646                     "add c_ptr6, %[c_ptr0], #0x0\n"
647                     "add a_ptr6, %[a_ptr0], #0x0\n"
648                     "b.eq 1f\n"
649                     "subs %[oob_rows], %[oob_rows], #0x1\n"
650                     "add c_ptr5, %[c_ptr0], #0x0\n"
651                     "add a_ptr5, %[a_ptr0], #0x0\n"
652                     "b.eq 1f\n"
653                     "subs %[oob_rows], %[oob_rows], #0x1\n"
654                     "add c_ptr4, %[c_ptr0], #0x0\n"
655                     "add a_ptr4, %[a_ptr0], #0x0\n"
656                     "b.eq 1f\n"
657                     "subs %[oob_rows], %[oob_rows], #0x1\n"
658                     "add c_ptr3, %[c_ptr0], #0x0\n"
659                     "add a_ptr3, %[a_ptr0], #0x0\n"
660                     "b.eq 1f\n"
661                     "subs %[oob_rows], %[oob_rows], #0x1\n"
662                     "add c_ptr2, %[c_ptr0], #0x0\n"
663                     "add a_ptr2, %[a_ptr0], #0x0\n"
664                     "b.eq 1f\n"
665                     "subs %[oob_rows], %[oob_rows], #0x1\n"
666                     "add c_ptr1, %[c_ptr0], #0x0\n"
667                     "add a_ptr1, %[a_ptr0], #0x0\n"
                        // Label 1: load the first 8 bytes of every A row into v0-v7, advancing
                        // each A pointer by 8 (post-index).
668                     "1:\n"
669                     "ldr d0, [%[a_ptr0]], #0x8\n"
670                     "ldr d1, [a_ptr1], #0x8\n"
671                     "ldr d2, [a_ptr2], #0x8\n"
672                     "ldr d3, [a_ptr3], #0x8\n"
673                     "ldr d4, [a_ptr4], #0x8\n"
674                     "ldr d5, [a_ptr5], #0x8\n"
675                     "ldr d6, [a_ptr6], #0x8\n"
676                     "ldr d7, [a_ptr7], #0x8\n"
                        // Tail of each row, selected by odds.
                        // NOTE(review): odds is initialised outside this view; presumably it is
                        // odds_count (K % 4, computed at function entry) - confirm.
                        // odds == 0: the tail is a whole 32-bit word, loaded into lane s[2].
677                     "cbnz %[odds], 2f\n"
678                     "ld1 {v0.s}[2], [%[a_ptr0]]\n"
679                     "ld1 {v1.s}[2], [a_ptr1]\n"
680                     "ld1 {v2.s}[2], [a_ptr2]\n"
681                     "ld1 {v3.s}[2], [a_ptr3]\n"
682                     "ld1 {v4.s}[2], [a_ptr4]\n"
683                     "ld1 {v5.s}[2], [a_ptr5]\n"
684                     "ld1 {v6.s}[2], [a_ptr6]\n"
685                     "ld1 {v7.s}[2], [a_ptr7]\n"
686                     "b 3f\n"
                        // Label 2: odds != 0. If odds == 1, load a single byte into lane b[8].
687                     "2:\n"
688                     "subs %[odds], %[odds], #0x1\n"
689                     "b.ne 4f\n"
690                     "ld1 {v0.b}[8], [%[a_ptr0]]\n"
691                     "ld1 {v1.b}[8], [a_ptr1]\n"
692                     "ld1 {v2.b}[8], [a_ptr2]\n"
693                     "ld1 {v3.b}[8], [a_ptr3]\n"
694                     "ld1 {v4.b}[8], [a_ptr4]\n"
695                     "ld1 {v5.b}[8], [a_ptr5]\n"
696                     "ld1 {v6.b}[8], [a_ptr6]\n"
697                     "ld1 {v7.b}[8], [a_ptr7]\n"
698                     "b 3f\n"
                        // Label 4: odds was at least 2. Load a halfword into lane h[4] and
                        // advance the pointers by 2. If odds was exactly 2 the tail is complete.
699                     "4:\n"
700                     "ld1 {v0.h}[4], [%[a_ptr0]], #2\n"
701                     "ld1 {v1.h}[4], [a_ptr1], #2\n"
702                     "ld1 {v2.h}[4], [a_ptr2], #2\n"
703                     "ld1 {v3.h}[4], [a_ptr3], #2\n"
704                     "ld1 {v4.h}[4], [a_ptr4], #2\n"
705                     "ld1 {v5.h}[4], [a_ptr5], #2\n"
706                     "ld1 {v6.h}[4], [a_ptr6], #2\n"
707                     "ld1 {v7.h}[4], [a_ptr7], #2\n"
708                     "subs %[odds], %[odds], #0x1\n"
709                     "b.ne 5f\n"
710                     "b 3f\n"
                        // Label 5: one more tail byte remains; load it into lane b[10].
711                     "5:\n"
712                     "ld1 {v0.b}[10], [%[a_ptr0]]\n"
713                     "ld1 {v1.b}[10], [a_ptr1]\n"
714                     "ld1 {v2.b}[10], [a_ptr2]\n"
715                     "ld1 {v3.b}[10], [a_ptr3]\n"
716                     "ld1 {v4.b}[10], [a_ptr4]\n"
717                     "ld1 {v5.b}[10], [a_ptr5]\n"
718                     "ld1 {v6.b}[10], [a_ptr6]\n"
719                     "ld1 {v7.b}[10], [a_ptr7]\n"
                        // Label 3: A data is in v0-v7. Load the first three 16-byte B vectors
                        // into v16-v18, issue prefetches at offsets from a_ptr7, and advance
                        // b_ptr0 by 0x30 (three vectors).
720                     "3:\n"
721                     "ldr q16, [%[b_ptr0]]\n"
722                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
723                     "ldr q17, [%[b_ptr0], #0x10]\n"
724                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
725                     "ldr q18, [%[b_ptr0], #0x20]\n"
726                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
727                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
728                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
729                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
730                     "add %[b_ptr0], %[b_ptr0], #0x30\n"
                        // loops == 0: only one output block, handled at label 6.
731                     "cbz %[loops], 6f\n"
                        // First block: clear the accumulators v24-v31 (one per row) and
                        // accumulate lanes 0-2 of each A register against v16-v18. The subs on
                        // loops here produces the flags tested by "b.eq 7f" after the sdot run.
                        // sdot is emitted as raw .inst words; the mnemonic is given in the
                        // assembler comment inside each string (presumably so the file builds
                        // with assemblers that lack the dot-product instructions - confirm).
732                     "movi v24.4s, #0\n"
733                     "subs %[loops], %[loops], #0x1\n"
734                     "movi v25.4s, #0\n"
735                     "movi v26.4s, #0\n"
736                     "movi v27.4s, #0\n"
737                     "movi v28.4s, #0\n"
738                     "movi v29.4s, #0\n"
739                     "movi v30.4s, #0\n"
740                     "movi v31.4s, #0\n"
741                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
742                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
743                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
744                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
745                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
746                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
747                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
748                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
749                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
750                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
751                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
752                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
753                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
754                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
755                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
756                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
757                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
758                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
759                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
760                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
761                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
762                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
763                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
764                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
                        // loops has just reached zero: skip the steady-state loop.
765                     "b.eq 7f\n"
                        // Label 8: steady-state loop. Each iteration stores the previous block's
                        // accumulators, re-zeroes them, loads the next three B vectors and
                        // accumulates the next block. Every C pointer advances by 0x10 and
                        // b_ptr0 by 0x30. The subs near the top sets the flags for the closing
                        // "b.ne 8b". Stores, loads and prefetches are interleaved with the sdots.
766                     "8:\n"
767                     "str q24, [%[c_ptr0]]\n"
768                     "subs %[loops], %[loops], #0x1\n"
769                     "movi v24.4s, #0\n"
770                     "ldr q16, [%[b_ptr0]]\n"
771                     "ldr q17, [%[b_ptr0], #0x10]\n"
772                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
773                     "str q25, [c_ptr1]\n"
774                     "add c_ptr1, c_ptr1, #0x10\n"
775                     "movi v25.4s, #0\n"
776                     "ldr q18, [%[b_ptr0], #0x20]\n"
777                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
778                     "str q26, [c_ptr2]\n"
779                     "movi v26.4s, #0\n"
780                     "add c_ptr2, c_ptr2, #0x10\n"
781                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
782                     "str q27, [c_ptr3]\n"
783                     "movi v27.4s, #0\n"
784                     "add c_ptr3, c_ptr3, #0x10\n"
785                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
786                     "str q28, [c_ptr4]\n"
787                     "movi v28.4s, #0\n"
788                     "add c_ptr4, c_ptr4, #0x10\n"
789                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
790                     "str q29, [c_ptr5]\n"
791                     "movi v29.4s, #0\n"
792                     "add c_ptr5, c_ptr5, #0x10\n"
793                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
794                     "str q30, [c_ptr6]\n"
795                     "movi v30.4s, #0\n"
796                     "add c_ptr6, c_ptr6, #0x10\n"
797                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
798                     "str q31, [c_ptr7]\n"
799                     "movi v31.4s, #0\n"
800                     "add c_ptr7, c_ptr7, #0x10\n"
801                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
802                     "add %[b_ptr0], %[b_ptr0], #0x30\n"
803                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
804                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
805                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
806                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
807                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
808                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
809                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
810                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
811                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
812                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
813                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
814                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
815                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
816                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
817                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
818                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
819                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
820                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
821                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
822                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
823                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
824                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
825                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
826                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
827                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
828                     "b.ne 8b\n"
                        // Label 7: last block of the multi-block path. Store the previous
                        // accumulators, advance the C pointers, then compute the final block
                        // (no loop counter update and no store prefetches here). Its results are
                        // written at label 9.
829                     "7:\n"
830                     "str q24, [%[c_ptr0]]\n"
831                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
832                     "movi v24.4s, #0\n"
833                     "ldr q16, [%[b_ptr0]]\n"
834                     "ldr q17, [%[b_ptr0], #0x10]\n"
835                     "str q25, [c_ptr1]\n"
836                     "add c_ptr1, c_ptr1, #0x10\n"
837                     "movi v25.4s, #0\n"
838                     "ldr q18, [%[b_ptr0], #0x20]\n"
839                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
840                     "str q26, [c_ptr2]\n"
841                     "movi v26.4s, #0\n"
842                     "add c_ptr2, c_ptr2, #0x10\n"
843                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
844                     "str q27, [c_ptr3]\n"
845                     "movi v27.4s, #0\n"
846                     "add c_ptr3, c_ptr3, #0x10\n"
847                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
848                     "str q28, [c_ptr4]\n"
849                     "movi v28.4s, #0\n"
850                     "add c_ptr4, c_ptr4, #0x10\n"
851                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
852                     "str q29, [c_ptr5]\n"
853                     "movi v29.4s, #0\n"
854                     "add c_ptr5, c_ptr5, #0x10\n"
855                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
856                     "str q30, [c_ptr6]\n"
857                     "movi v30.4s, #0\n"
858                     "add c_ptr6, c_ptr6, #0x10\n"
859                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
860                     "str q31, [c_ptr7]\n"
861                     "movi v31.4s, #0\n"
862                     "add c_ptr7, c_ptr7, #0x10\n"
863                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
864                     "add %[b_ptr0], %[b_ptr0], #0x30\n"
865                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
866                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
867                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
868                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
869                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
870                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
871                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
872                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
873                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
874                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
875                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
876                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
877                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
878                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
879                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
880                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
881                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
882                     "b 9f\n"
                        // Label 6: single-block path (loops was zero on entry). Clear the
                        // accumulators and compute the only block from the B vectors loaded at
                        // label 3.
883                     "6:\n"
884                     "movi v24.4s, #0\n"
885                     "movi v25.4s, #0\n"
886                     "movi v26.4s, #0\n"
887                     "movi v27.4s, #0\n"
888                     "movi v28.4s, #0\n"
889                     "movi v29.4s, #0\n"
890                     "movi v30.4s, #0\n"
891                     "movi v31.4s, #0\n"
892                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
893                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
894                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
895                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
896                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
897                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
898                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
899                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
900                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
901                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
902                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
903                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
904                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
905                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
906                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
907                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
908                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
909                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
910                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
911                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
912                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
913                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
914                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
915                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
                        // Label 9: write out the final block. Only c_ptr0 is advanced here; the
                        // aliased row pointers are about to be released.
916                     "9:\n"
917                     "str q24, [%[c_ptr0]]\n"
918                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
919                     "str q25, [c_ptr1]\n"
920                     "str q26, [c_ptr2]\n"
921                     "str q27, [c_ptr3]\n"
922                     "str q28, [c_ptr4]\n"
923                     "str q29, [c_ptr5]\n"
924                     "str q30, [c_ptr6]\n"
925                     "str q31, [c_ptr7]\n"
                        // Drop the register aliases; the next case's asm statement defines the
                        // same names again with .req.
926                     ".unreq a_ptr1\n"
927                     ".unreq a_ptr2\n"
928                     ".unreq a_ptr3\n"
929                     ".unreq a_ptr4\n"
930                     ".unreq a_ptr5\n"
931                     ".unreq a_ptr6\n"
932                     ".unreq a_ptr7\n"
933                     ".unreq c_ptr1\n"
934                     ".unreq c_ptr2\n"
935                     ".unreq c_ptr3\n"
936                     ".unreq c_ptr4\n"
937                     ".unreq c_ptr5\n"
938                     ".unreq c_ptr6\n"
939                     ".unreq c_ptr7\n"
                        // Read-write operands: row-0 pointers, B pointer and the three counters,
                        // all of which the assembly modifies.
940                     : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
                        // Read-only operands: the A and C row strides in bytes.
941                     : [lda] "r" (ldab), [ldc] "r" (ldcb)
                        // Clobbers: x0-x13 (used through the aliases above), every vector
                        // register, the condition flags, and memory.
942                     : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
943                 );
944                 break;
945             case 4:
946                 __asm __volatile (
947                     "a_ptr1 .req X0\n"
948                     "a_ptr2 .req X1\n"
949                     "a_ptr3 .req X2\n"
950                     "a_ptr4 .req X3\n"
951                     "a_ptr5 .req X4\n"
952                     "a_ptr6 .req X5\n"
953                     "a_ptr7 .req X6\n"
954                     "c_ptr1 .req X7\n"
955                     "c_ptr2 .req X8\n"
956                     "c_ptr3 .req X9\n"
957                     "c_ptr4 .req X10\n"
958                     "c_ptr5 .req X11\n"
959                     "c_ptr6 .req X12\n"
960                     "c_ptr7 .req X13\n"
961                     "add a_ptr1, %[a_ptr0], %[lda]\n"
962                     "add c_ptr1, %[c_ptr0], %[ldc]\n"
963                     "add a_ptr2, a_ptr1, %[lda]\n"
964                     "add c_ptr2, c_ptr1, %[ldc]\n"
965                     "add a_ptr3, a_ptr2, %[lda]\n"
966                     "add c_ptr3, c_ptr2, %[ldc]\n"
967                     "add a_ptr4, a_ptr3, %[lda]\n"
968                     "add c_ptr4, c_ptr3, %[ldc]\n"
969                     "add a_ptr5, a_ptr4, %[lda]\n"
970                     "add c_ptr5, c_ptr4, %[ldc]\n"
971                     "add a_ptr6, a_ptr5, %[lda]\n"
972                     "add c_ptr6, c_ptr5, %[ldc]\n"
973                     "add a_ptr7, a_ptr6, %[lda]\n"
974                     "add c_ptr7, c_ptr6, %[ldc]\n"
975                     "cbz %[oob_rows], 1f\n"
976                     "subs %[oob_rows], %[oob_rows], #0x1\n"
977                     "add c_ptr7, %[c_ptr0], #0x0\n"
978                     "add a_ptr7, %[a_ptr0], #0x0\n"
979                     "b.eq 1f\n"
980                     "subs %[oob_rows], %[oob_rows], #0x1\n"
981                     "add c_ptr6, %[c_ptr0], #0x0\n"
982                     "add a_ptr6, %[a_ptr0], #0x0\n"
983                     "b.eq 1f\n"
984                     "subs %[oob_rows], %[oob_rows], #0x1\n"
985                     "add c_ptr5, %[c_ptr0], #0x0\n"
986                     "add a_ptr5, %[a_ptr0], #0x0\n"
987                     "b.eq 1f\n"
988                     "subs %[oob_rows], %[oob_rows], #0x1\n"
989                     "add c_ptr4, %[c_ptr0], #0x0\n"
990                     "add a_ptr4, %[a_ptr0], #0x0\n"
991                     "b.eq 1f\n"
992                     "subs %[oob_rows], %[oob_rows], #0x1\n"
993                     "add c_ptr3, %[c_ptr0], #0x0\n"
994                     "add a_ptr3, %[a_ptr0], #0x0\n"
995                     "b.eq 1f\n"
996                     "subs %[oob_rows], %[oob_rows], #0x1\n"
997                     "add c_ptr2, %[c_ptr0], #0x0\n"
998                     "add a_ptr2, %[a_ptr0], #0x0\n"
999                     "b.eq 1f\n"
1000                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1001                     "add c_ptr1, %[c_ptr0], #0x0\n"
1002                     "add a_ptr1, %[a_ptr0], #0x0\n"
1003                     "1:\n"
1004                     "cbnz %[odds], 2f\n"
1005                     "ldr q0, [%[a_ptr0]]\n"
1006                     "ldr q1, [a_ptr1]\n"
1007                     "ldr q2, [a_ptr2]\n"
1008                     "ldr q3, [a_ptr3]\n"
1009                     "ldr q4, [a_ptr4]\n"
1010                     "ldr q5, [a_ptr5]\n"
1011                     "ldr q6, [a_ptr6]\n"
1012                     "ldr q7, [a_ptr7]\n"
1013                     "b 3f\n"
1014                     "2:\n"
1015                     "ldr d0, [%[a_ptr0]], #0x8\n"
1016                     "ldr d1, [a_ptr1], #0x8\n"
1017                     "ldr d2, [a_ptr2], #0x8\n"
1018                     "ldr d3, [a_ptr3], #0x8\n"
1019                     "ldr d4, [a_ptr4], #0x8\n"
1020                     "ldr d5, [a_ptr5], #0x8\n"
1021                     "ldr d6, [a_ptr6], #0x8\n"
1022                     "ldr d7, [a_ptr7], #0x8\n"
1023                     "ld1 {v0.s}[2], [%[a_ptr0]], #4\n"
1024                     "ld1 {v1.s}[2], [a_ptr1], #4\n"
1025                     "ld1 {v2.s}[2], [a_ptr2], #4\n"
1026                     "ld1 {v3.s}[2], [a_ptr3], #4\n"
1027                     "ld1 {v4.s}[2], [a_ptr4], #4\n"
1028                     "ld1 {v5.s}[2], [a_ptr5], #4\n"
1029                     "ld1 {v6.s}[2], [a_ptr6], #4\n"
1030                     "ld1 {v7.s}[2], [a_ptr7], #4\n"
1031                     "subs %[odds], %[odds], #0x1\n"
1032                     "b.ne 4f\n"
1033                     "ld1 {v0.b}[12], [%[a_ptr0]]\n"
1034                     "ld1 {v1.b}[12], [a_ptr1]\n"
1035                     "ld1 {v2.b}[12], [a_ptr2]\n"
1036                     "ld1 {v3.b}[12], [a_ptr3]\n"
1037                     "ld1 {v4.b}[12], [a_ptr4]\n"
1038                     "ld1 {v5.b}[12], [a_ptr5]\n"
1039                     "ld1 {v6.b}[12], [a_ptr6]\n"
1040                     "ld1 {v7.b}[12], [a_ptr7]\n"
1041                     "b 3f\n"
1042                     "4:\n"
1043                     "ld1 {v0.h}[6], [%[a_ptr0]], #2\n"
1044                     "ld1 {v1.h}[6], [a_ptr1], #2\n"
1045                     "ld1 {v2.h}[6], [a_ptr2], #2\n"
1046                     "ld1 {v3.h}[6], [a_ptr3], #2\n"
1047                     "ld1 {v4.h}[6], [a_ptr4], #2\n"
1048                     "ld1 {v5.h}[6], [a_ptr5], #2\n"
1049                     "ld1 {v6.h}[6], [a_ptr6], #2\n"
1050                     "ld1 {v7.h}[6], [a_ptr7], #2\n"
1051                     "subs %[odds], %[odds], #0x1\n"
1052                     "b.ne 5f\n"
1053                     "b 3f\n"
1054                     "5:\n"
1055                     "ld1 {v0.b}[14], [%[a_ptr0]]\n"
1056                     "ld1 {v1.b}[14], [a_ptr1]\n"
1057                     "ld1 {v2.b}[14], [a_ptr2]\n"
1058                     "ld1 {v3.b}[14], [a_ptr3]\n"
1059                     "ld1 {v4.b}[14], [a_ptr4]\n"
1060                     "ld1 {v5.b}[14], [a_ptr5]\n"
1061                     "ld1 {v6.b}[14], [a_ptr6]\n"
1062                     "ld1 {v7.b}[14], [a_ptr7]\n"
1063                     "3:\n"
1064                     "ldr q16, [%[b_ptr0]]\n"
1065                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1066                     "ldr q17, [%[b_ptr0], #0x10]\n"
1067                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1068                     "ldr q18, [%[b_ptr0], #0x20]\n"
1069                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1070                     "ldr q19, [%[b_ptr0], #0x30]\n"
1071                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1072                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1073                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1074                     "add %[b_ptr0], %[b_ptr0], #0x40\n"
1075                     "cbz %[loops], 6f\n"
1076                     "movi v24.4s, #0\n"
1077                     "subs %[loops], %[loops], #0x1\n"
1078                     "movi v25.4s, #0\n"
1079                     "movi v26.4s, #0\n"
1080                     "movi v27.4s, #0\n"
1081                     "movi v28.4s, #0\n"
1082                     "movi v29.4s, #0\n"
1083                     "movi v30.4s, #0\n"
1084                     "movi v31.4s, #0\n"
1085                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1086                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1087                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1088                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1089                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1090                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1091                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1092                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1093                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1094                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1095                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1096                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1097                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1098                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1099                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1100                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1101                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1102                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1103                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1104                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1105                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1106                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1107                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1108                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1109                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1110                     ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1111                     ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1112                     ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1113                     ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1114                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1115                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1116                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1117                     "b.eq 7f\n"
1118                     "8:\n"
1119                     "str q24, [%[c_ptr0]]\n"
1120                     "subs %[loops], %[loops], #0x1\n"
1121                     "movi v24.4s, #0\n"
1122                     "ldr q16, [%[b_ptr0]]\n"
1123                     "ldr q17, [%[b_ptr0], #0x10]\n"
1124                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
1125                     "str q25, [c_ptr1]\n"
1126                     "add c_ptr1, c_ptr1, #0x10\n"
1127                     "movi v25.4s, #0\n"
1128                     "ldr q18, [%[b_ptr0], #0x20]\n"
1129                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1130                     "str q26, [c_ptr2]\n"
1131                     "movi v26.4s, #0\n"
1132                     "ldr q19, [%[b_ptr0], #0x30]\n"
1133                     "add c_ptr2, c_ptr2, #0x10\n"
1134                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1135                     "str q27, [c_ptr3]\n"
1136                     "movi v27.4s, #0\n"
1137                     "add c_ptr3, c_ptr3, #0x10\n"
1138                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1139                     "str q28, [c_ptr4]\n"
1140                     "movi v28.4s, #0\n"
1141                     "add c_ptr4, c_ptr4, #0x10\n"
1142                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1143                     "str q29, [c_ptr5]\n"
1144                     "movi v29.4s, #0\n"
1145                     "add c_ptr5, c_ptr5, #0x10\n"
1146                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1147                     "str q30, [c_ptr6]\n"
1148                     "movi v30.4s, #0\n"
1149                     "add c_ptr6, c_ptr6, #0x10\n"
1150                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1151                     "str q31, [c_ptr7]\n"
1152                     "movi v31.4s, #0\n"
1153                     "add c_ptr7, c_ptr7, #0x10\n"
1154                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1155                     "add %[b_ptr0], %[b_ptr0], #0x40\n"
1156                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1157                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1158                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1159                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1160                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1161                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1162                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1163                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1164                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1165                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1166                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1167                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1168                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1169                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1170                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1171                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1172                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1173                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1174                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1175                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1176                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1177                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1178                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1179                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1180                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1181                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1182                     ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1183                     ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1184                     ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1185                     ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1186                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1187                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1188                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1189                     "b.ne 8b\n"
1190                     "7:\n"
1191                     "str q24, [%[c_ptr0]]\n"
1192                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
1193                     "movi v24.4s, #0\n"
1194                     "ldr q16, [%[b_ptr0]]\n"
1195                     "ldr q17, [%[b_ptr0], #0x10]\n"
1196                     "str q25, [c_ptr1]\n"
1197                     "add c_ptr1, c_ptr1, #0x10\n"
1198                     "movi v25.4s, #0\n"
1199                     "ldr q18, [%[b_ptr0], #0x20]\n"
1200                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1201                     "str q26, [c_ptr2]\n"
1202                     "movi v26.4s, #0\n"
1203                     "ldr q19, [%[b_ptr0], #0x30]\n"
1204                     "add c_ptr2, c_ptr2, #0x10\n"
1205                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1206                     "str q27, [c_ptr3]\n"
1207                     "movi v27.4s, #0\n"
1208                     "add c_ptr3, c_ptr3, #0x10\n"
1209                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1210                     "str q28, [c_ptr4]\n"
1211                     "movi v28.4s, #0\n"
1212                     "add c_ptr4, c_ptr4, #0x10\n"
1213                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1214                     "str q29, [c_ptr5]\n"
1215                     "movi v29.4s, #0\n"
1216                     "add c_ptr5, c_ptr5, #0x10\n"
1217                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1218                     "str q30, [c_ptr6]\n"
1219                     "movi v30.4s, #0\n"
1220                     "add c_ptr6, c_ptr6, #0x10\n"
1221                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1222                     "str q31, [c_ptr7]\n"
1223                     "movi v31.4s, #0\n"
1224                     "add c_ptr7, c_ptr7, #0x10\n"
1225                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1226                     "add %[b_ptr0], %[b_ptr0], #0x40\n"
1227                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1228                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1229                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1230                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1231                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1232                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1233                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1234                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1235                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1236                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1237                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1238                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1239                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1240                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1241                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1242                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1243                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1244                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1245                     ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1246                     ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1247                     ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1248                     ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1249                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1250                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1251                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1252                     "b 9f\n"
1253                     "6:\n"
1254                     "movi v24.4s, #0\n"
1255                     "movi v25.4s, #0\n"
1256                     "movi v26.4s, #0\n"
1257                     "movi v27.4s, #0\n"
1258                     "movi v28.4s, #0\n"
1259                     "movi v29.4s, #0\n"
1260                     "movi v30.4s, #0\n"
1261                     "movi v31.4s, #0\n"
1262                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1263                     ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
1264                     ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
1265                     ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
1266                     ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
1267                     ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
1268                     ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
1269                     ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
1270                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1271                     ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
1272                     ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
1273                     ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
1274                     ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
1275                     ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
1276                     ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
1277                     ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
1278                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1279                     ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
1280                     ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
1281                     ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
1282                     ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
1283                     ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
1284                     ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
1285                     ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
1286                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1287                     ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
1288                     ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
1289                     ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
1290                     ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
1291                     ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
1292                     ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
1293                     ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
1294                     "9:\n"
1295                     "str q24, [%[c_ptr0]]\n"
1296                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
1297                     "str q25, [c_ptr1]\n"
1298                     "str q26, [c_ptr2]\n"
1299                     "str q27, [c_ptr3]\n"
1300                     "str q28, [c_ptr4]\n"
1301                     "str q29, [c_ptr5]\n"
1302                     "str q30, [c_ptr6]\n"
1303                     "str q31, [c_ptr7]\n"
1304                     ".unreq a_ptr1\n"
1305                     ".unreq a_ptr2\n"
1306                     ".unreq a_ptr3\n"
1307                     ".unreq a_ptr4\n"
1308                     ".unreq a_ptr5\n"
1309                     ".unreq a_ptr6\n"
1310                     ".unreq a_ptr7\n"
1311                     ".unreq c_ptr1\n"
1312                     ".unreq c_ptr2\n"
1313                     ".unreq c_ptr3\n"
1314                     ".unreq c_ptr4\n"
1315                     ".unreq c_ptr5\n"
1316                     ".unreq c_ptr6\n"
1317                     ".unreq c_ptr7\n"
1318                     : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
1319                     : [lda] "r" (ldab), [ldc] "r" (ldcb)
1320                     : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
1321                 );
1322                 break;
1323             case 5:
1324                 __asm __volatile (
1325                     "a_ptr1 .req X0\n"
1326                     "a_ptr2 .req X1\n"
1327                     "a_ptr3 .req X2\n"
1328                     "a_ptr4 .req X3\n"
1329                     "a_ptr5 .req X4\n"
1330                     "a_ptr6 .req X5\n"
1331                     "a_ptr7 .req X6\n"
1332                     "c_ptr1 .req X7\n"
1333                     "c_ptr2 .req X8\n"
1334                     "c_ptr3 .req X9\n"
1335                     "c_ptr4 .req X10\n"
1336                     "c_ptr5 .req X11\n"
1337                     "c_ptr6 .req X12\n"
1338                     "c_ptr7 .req X13\n"
1339                     "add a_ptr1, %[a_ptr0], %[lda]\n"
1340                     "add c_ptr1, %[c_ptr0], %[ldc]\n"
1341                     "add a_ptr2, a_ptr1, %[lda]\n"
1342                     "add c_ptr2, c_ptr1, %[ldc]\n"
1343                     "add a_ptr3, a_ptr2, %[lda]\n"
1344                     "add c_ptr3, c_ptr2, %[ldc]\n"
1345                     "add a_ptr4, a_ptr3, %[lda]\n"
1346                     "add c_ptr4, c_ptr3, %[ldc]\n"
1347                     "add a_ptr5, a_ptr4, %[lda]\n"
1348                     "add c_ptr5, c_ptr4, %[ldc]\n"
1349                     "add a_ptr6, a_ptr5, %[lda]\n"
1350                     "add c_ptr6, c_ptr5, %[ldc]\n"
1351                     "add a_ptr7, a_ptr6, %[lda]\n"
1352                     "add c_ptr7, c_ptr6, %[ldc]\n"
1353                     "cbz %[oob_rows], 1f\n"
1354                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1355                     "add c_ptr7, %[c_ptr0], #0x0\n"
1356                     "add a_ptr7, %[a_ptr0], #0x0\n"
1357                     "b.eq 1f\n"
1358                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1359                     "add c_ptr6, %[c_ptr0], #0x0\n"
1360                     "add a_ptr6, %[a_ptr0], #0x0\n"
1361                     "b.eq 1f\n"
1362                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1363                     "add c_ptr5, %[c_ptr0], #0x0\n"
1364                     "add a_ptr5, %[a_ptr0], #0x0\n"
1365                     "b.eq 1f\n"
1366                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1367                     "add c_ptr4, %[c_ptr0], #0x0\n"
1368                     "add a_ptr4, %[a_ptr0], #0x0\n"
1369                     "b.eq 1f\n"
1370                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1371                     "add c_ptr3, %[c_ptr0], #0x0\n"
1372                     "add a_ptr3, %[a_ptr0], #0x0\n"
1373                     "b.eq 1f\n"
1374                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1375                     "add c_ptr2, %[c_ptr0], #0x0\n"
1376                     "add a_ptr2, %[a_ptr0], #0x0\n"
1377                     "b.eq 1f\n"
1378                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1379                     "add c_ptr1, %[c_ptr0], #0x0\n"
1380                     "add a_ptr1, %[a_ptr0], #0x0\n"
1381                     "1:\n"
1382                     "cbnz %[odds], 2f\n"
1383                     "ldr q0, [%[a_ptr0]], #0x10\n"
1384                     "ldr q2, [a_ptr1], #0x10\n"
1385                     "ldr q4, [a_ptr2], #0x10\n"
1386                     "ldr q6, [a_ptr3], #0x10\n"
1387                     "ldr s1, [%[a_ptr0]]\n"
1388                     "ldr q8, [a_ptr4], #0x10\n"
1389                     "ldr s3, [a_ptr1]\n"
1390                     "ldr q10, [a_ptr5], #0x10\n"
1391                     "ldr s5, [a_ptr2]\n"
1392                     "ldr q12, [a_ptr6], #0x10\n"
1393                     "ldr s7, [a_ptr3]\n"
1394                     "ldr q14, [a_ptr7], #0x10\n"
1395                     "ldr s9, [a_ptr4]\n"
1396                     "ldr s11, [a_ptr5]\n"
1397                     "ldr s13, [a_ptr6]\n"
1398                     "ldr s15, [a_ptr7]\n"
1399                     "b 3f\n"
1400                     "2:\n"
1401                     "ldr q0, [%[a_ptr0]], #0x10\n"
1402                     "subs %[odds], %[odds], #0x1\n"
1403                     "ldr q2, [a_ptr1], #0x10\n"
1404                     "ldr q4, [a_ptr2], #0x10\n"
1405                     "ldr q6, [a_ptr3], #0x10\n"
1406                     "ldr q8, [a_ptr4], #0x10\n"
1407                     "ldr q10, [a_ptr5], #0x10\n"
1408                     "ldr q12, [a_ptr6], #0x10\n"
1409                     "ldr q14, [a_ptr7], #0x10\n"
1410                     "b.ne 4f\n"
1411                     "ldr b1, [%[a_ptr0]]\n"
1412                     "ldr b3, [a_ptr1]\n"
1413                     "ldr b5, [a_ptr2]\n"
1414                     "ldr b7, [a_ptr3]\n"
1415                     "ldr b9, [a_ptr4]\n"
1416                     "ldr b11, [a_ptr5]\n"
1417                     "ldr b13, [a_ptr6]\n"
1418                     "ldr b15, [a_ptr7]\n"
1419                     "b 3f\n"
1420                     "4:\n"
1421                     "ldr h1, [%[a_ptr0]], #0x2\n"
1422                     "ldr h3, [a_ptr1], #0x2\n"
1423                     "ldr h5, [a_ptr2], #0x2\n"
1424                     "ldr h7, [a_ptr3], #0x2\n"
1425                     "ldr h9, [a_ptr4], #0x2\n"
1426                     "ldr h11, [a_ptr5], #0x2\n"
1427                     "ldr h13, [a_ptr6], #0x2\n"
1428                     "ldr h15, [a_ptr7], #0x2\n"
1429                     "subs %[odds], %[odds], #0x1\n"
1430                     "b.ne 5f\n"
1431                     "b 3f\n"
1432                     "5:\n"
1433                     "ld1 {v1.b}[2], [%[a_ptr0]]\n"
1434                     "ld1 {v3.b}[2], [a_ptr1]\n"
1435                     "ld1 {v5.b}[2], [a_ptr2]\n"
1436                     "ld1 {v7.b}[2], [a_ptr3]\n"
1437                     "ld1 {v9.b}[2], [a_ptr4]\n"
1438                     "ld1 {v11.b}[2], [a_ptr5]\n"
1439                     "ld1 {v13.b}[2], [a_ptr6]\n"
1440                     "ld1 {v15.b}[2], [a_ptr7]\n"
1441                     "3:\n"
1442                     "ldr q16, [%[b_ptr0]]\n"
1443                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1444                     "ldr q17, [%[b_ptr0], #0x10]\n"
1445                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1446                     "ldr q18, [%[b_ptr0], #0x20]\n"
1447                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1448                     "ldr q19, [%[b_ptr0], #0x30]\n"
1449                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1450                     "ldr q20, [%[b_ptr0], #0x40]\n"
1451                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1452                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1453                     "add %[b_ptr0], %[b_ptr0], #0x50\n"
1454                     "cbz %[loops], 6f\n"
1455                     "movi v24.4s, #0\n"
1456                     "subs %[loops], %[loops], #0x1\n"
1457                     "movi v25.4s, #0\n"
1458                     "movi v26.4s, #0\n"
1459                     "movi v27.4s, #0\n"
1460                     "movi v28.4s, #0\n"
1461                     "movi v29.4s, #0\n"
1462                     "movi v30.4s, #0\n"
1463                     "movi v31.4s, #0\n"
1464                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1465                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1466                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1467                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1468                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1469                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1470                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1471                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1472                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1473                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1474                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1475                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1476                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1477                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1478                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1479                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1480                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1481                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1482                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1483                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1484                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1485                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1486                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1487                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1488                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1489                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1490                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1491                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1492                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1493                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1494                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1495                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1496                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1497                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1498                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1499                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1500                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1501                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1502                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1503                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1504                     "b.eq 7f\n"
1505                     "8:\n"
1506                     "str q24, [%[c_ptr0]]\n"
1507                     "subs %[loops], %[loops], #0x1\n"
1508                     "movi v24.4s, #0\n"
1509                     "ldr q16, [%[b_ptr0]]\n"
1510                     "ldr q17, [%[b_ptr0], #0x10]\n"
1511                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
1512                     "str q25, [c_ptr1]\n"
1513                     "add c_ptr1, c_ptr1, #0x10\n"
1514                     "movi v25.4s, #0\n"
1515                     "ldr q18, [%[b_ptr0], #0x20]\n"
1516                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1517                     "str q26, [c_ptr2]\n"
1518                     "movi v26.4s, #0\n"
1519                     "ldr q19, [%[b_ptr0], #0x30]\n"
1520                     "ldr q20, [%[b_ptr0], #0x40]\n"
1521                     "add c_ptr2, c_ptr2, #0x10\n"
1522                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1523                     "str q27, [c_ptr3]\n"
1524                     "movi v27.4s, #0\n"
1525                     "add c_ptr3, c_ptr3, #0x10\n"
1526                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1527                     "str q28, [c_ptr4]\n"
1528                     "movi v28.4s, #0\n"
1529                     "add c_ptr4, c_ptr4, #0x10\n"
1530                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1531                     "str q29, [c_ptr5]\n"
1532                     "movi v29.4s, #0\n"
1533                     "add c_ptr5, c_ptr5, #0x10\n"
1534                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1535                     "str q30, [c_ptr6]\n"
1536                     "movi v30.4s, #0\n"
1537                     "add c_ptr6, c_ptr6, #0x10\n"
1538                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1539                     "str q31, [c_ptr7]\n"
1540                     "movi v31.4s, #0\n"
1541                     "add c_ptr7, c_ptr7, #0x10\n"
1542                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1543                     "add %[b_ptr0], %[b_ptr0], #0x50\n"
1544                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1545                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1546                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1547                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1548                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1549                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1550                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1551                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1552                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1553                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1554                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1555                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1556                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1557                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1558                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1559                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1560                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1561                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1562                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1563                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1564                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1565                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1566                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1567                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1568                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1569                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1570                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1571                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1572                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1573                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1574                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1575                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1576                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1577                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1578                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1579                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1580                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1581                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1582                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1583                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1584                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1585                     "b.ne 8b\n"
1586                     "7:\n"
1587                     "str q24, [%[c_ptr0]]\n"
1588                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
1589                     "movi v24.4s, #0\n"
1590                     "ldr q16, [%[b_ptr0]]\n"
1591                     "ldr q17, [%[b_ptr0], #0x10]\n"
1592                     "str q25, [c_ptr1]\n"
1593                     "add c_ptr1, c_ptr1, #0x10\n"
1594                     "movi v25.4s, #0\n"
1595                     "ldr q18, [%[b_ptr0], #0x20]\n"
1596                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1597                     "str q26, [c_ptr2]\n"
1598                     "movi v26.4s, #0\n"
1599                     "ldr q19, [%[b_ptr0], #0x30]\n"
1600                     "ldr q20, [%[b_ptr0], #0x40]\n"
1601                     "add c_ptr2, c_ptr2, #0x10\n"
1602                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1603                     "str q27, [c_ptr3]\n"
1604                     "movi v27.4s, #0\n"
1605                     "add c_ptr3, c_ptr3, #0x10\n"
1606                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1607                     "str q28, [c_ptr4]\n"
1608                     "movi v28.4s, #0\n"
1609                     "add c_ptr4, c_ptr4, #0x10\n"
1610                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1611                     "str q29, [c_ptr5]\n"
1612                     "movi v29.4s, #0\n"
1613                     "add c_ptr5, c_ptr5, #0x10\n"
1614                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1615                     "str q30, [c_ptr6]\n"
1616                     "movi v30.4s, #0\n"
1617                     "add c_ptr6, c_ptr6, #0x10\n"
1618                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1619                     "str q31, [c_ptr7]\n"
1620                     "movi v31.4s, #0\n"
1621                     "add c_ptr7, c_ptr7, #0x10\n"
1622                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1623                     "add %[b_ptr0], %[b_ptr0], #0x50\n"
1624                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1625                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1626                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1627                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1628                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1629                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1630                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1631                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1632                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1633                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1634                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1635                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1636                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1637                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1638                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1639                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1640                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1641                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1642                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1643                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1644                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1645                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1646                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1647                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1648                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1649                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1650                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1651                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1652                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1653                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1654                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1655                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1656                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1657                     "b 9f\n"
1658                     "6:\n"
1659                     "movi v24.4s, #0\n"
1660                     "movi v25.4s, #0\n"
1661                     "movi v26.4s, #0\n"
1662                     "movi v27.4s, #0\n"
1663                     "movi v28.4s, #0\n"
1664                     "movi v29.4s, #0\n"
1665                     "movi v30.4s, #0\n"
1666                     "movi v31.4s, #0\n"
1667                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1668                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1669                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1670                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1671                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1672                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1673                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1674                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1675                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1676                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1677                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1678                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1679                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1680                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1681                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1682                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1683                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1684                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1685                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1686                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1687                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1688                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1689                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1690                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1691                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1692                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1693                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1694                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1695                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1696                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1697                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1698                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1699                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1700                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1701                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1702                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1703                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1704                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1705                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1706                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1707                     "9:\n"
1708                     "str q24, [%[c_ptr0]]\n"
1709                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
1710                     "str q25, [c_ptr1]\n"
1711                     "str q26, [c_ptr2]\n"
1712                     "str q27, [c_ptr3]\n"
1713                     "str q28, [c_ptr4]\n"
1714                     "str q29, [c_ptr5]\n"
1715                     "str q30, [c_ptr6]\n"
1716                     "str q31, [c_ptr7]\n"
1717                     ".unreq a_ptr1\n"
1718                     ".unreq a_ptr2\n"
1719                     ".unreq a_ptr3\n"
1720                     ".unreq a_ptr4\n"
1721                     ".unreq a_ptr5\n"
1722                     ".unreq a_ptr6\n"
1723                     ".unreq a_ptr7\n"
1724                     ".unreq c_ptr1\n"
1725                     ".unreq c_ptr2\n"
1726                     ".unreq c_ptr3\n"
1727                     ".unreq c_ptr4\n"
1728                     ".unreq c_ptr5\n"
1729                     ".unreq c_ptr6\n"
1730                     ".unreq c_ptr7\n"
1731                     : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
1732                     : [lda] "r" (ldab), [ldc] "r" (ldcb)
1733                     : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
1734                 );
1735                 break;
1736             case 6:
1737                 __asm __volatile (
1738                     "a_ptr1 .req X0\n"
1739                     "a_ptr2 .req X1\n"
1740                     "a_ptr3 .req X2\n"
1741                     "a_ptr4 .req X3\n"
1742                     "a_ptr5 .req X4\n"
1743                     "a_ptr6 .req X5\n"
1744                     "a_ptr7 .req X6\n"
1745                     "c_ptr1 .req X7\n"
1746                     "c_ptr2 .req X8\n"
1747                     "c_ptr3 .req X9\n"
1748                     "c_ptr4 .req X10\n"
1749                     "c_ptr5 .req X11\n"
1750                     "c_ptr6 .req X12\n"
1751                     "c_ptr7 .req X13\n"
1752                     "add a_ptr1, %[a_ptr0], %[lda]\n"
1753                     "add c_ptr1, %[c_ptr0], %[ldc]\n"
1754                     "add a_ptr2, a_ptr1, %[lda]\n"
1755                     "add c_ptr2, c_ptr1, %[ldc]\n"
1756                     "add a_ptr3, a_ptr2, %[lda]\n"
1757                     "add c_ptr3, c_ptr2, %[ldc]\n"
1758                     "add a_ptr4, a_ptr3, %[lda]\n"
1759                     "add c_ptr4, c_ptr3, %[ldc]\n"
1760                     "add a_ptr5, a_ptr4, %[lda]\n"
1761                     "add c_ptr5, c_ptr4, %[ldc]\n"
1762                     "add a_ptr6, a_ptr5, %[lda]\n"
1763                     "add c_ptr6, c_ptr5, %[ldc]\n"
1764                     "add a_ptr7, a_ptr6, %[lda]\n"
1765                     "add c_ptr7, c_ptr6, %[ldc]\n"
1766                     "cbz %[oob_rows], 1f\n"
1767                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1768                     "add c_ptr7, %[c_ptr0], #0x0\n"
1769                     "add a_ptr7, %[a_ptr0], #0x0\n"
1770                     "b.eq 1f\n"
1771                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1772                     "add c_ptr6, %[c_ptr0], #0x0\n"
1773                     "add a_ptr6, %[a_ptr0], #0x0\n"
1774                     "b.eq 1f\n"
1775                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1776                     "add c_ptr5, %[c_ptr0], #0x0\n"
1777                     "add a_ptr5, %[a_ptr0], #0x0\n"
1778                     "b.eq 1f\n"
1779                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1780                     "add c_ptr4, %[c_ptr0], #0x0\n"
1781                     "add a_ptr4, %[a_ptr0], #0x0\n"
1782                     "b.eq 1f\n"
1783                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1784                     "add c_ptr3, %[c_ptr0], #0x0\n"
1785                     "add a_ptr3, %[a_ptr0], #0x0\n"
1786                     "b.eq 1f\n"
1787                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1788                     "add c_ptr2, %[c_ptr0], #0x0\n"
1789                     "add a_ptr2, %[a_ptr0], #0x0\n"
1790                     "b.eq 1f\n"
1791                     "subs %[oob_rows], %[oob_rows], #0x1\n"
1792                     "add c_ptr1, %[c_ptr0], #0x0\n"
1793                     "add a_ptr1, %[a_ptr0], #0x0\n"
1794                     "1:\n"
1795                     "cbnz %[odds], 2f\n"
1796                     "ldr q0, [%[a_ptr0]], #0x10\n"
1797                     "ldr q2, [a_ptr1], #0x10\n"
1798                     "ldr q4, [a_ptr2], #0x10\n"
1799                     "ldr q6, [a_ptr3], #0x10\n"
1800                     "ldr d1, [%[a_ptr0]]\n"
1801                     "ldr q8, [a_ptr4], #0x10\n"
1802                     "ldr d3, [a_ptr1]\n"
1803                     "ldr q10, [a_ptr5], #0x10\n"
1804                     "ldr d5, [a_ptr2]\n"
1805                     "ldr q12, [a_ptr6], #0x10\n"
1806                     "ldr d7, [a_ptr3]\n"
1807                     "ldr q14, [a_ptr7], #0x10\n"
1808                     "ldr d9, [a_ptr4]\n"
1809                     "ldr d11, [a_ptr5]\n"
1810                     "ldr d13, [a_ptr6]\n"
1811                     "ldr d15, [a_ptr7]\n"
1812                     "b 3f\n"
1813                     "2:\n"
1814                     "ldr q0, [%[a_ptr0]], #0x10\n"
1815                     "subs %[odds], %[odds], #0x1\n"
1816                     "ldr q2, [a_ptr1], #0x10\n"
1817                     "ldr q4, [a_ptr2], #0x10\n"
1818                     "ldr s1, [%[a_ptr0]], #0x4\n"
1819                     "ldr q6, [a_ptr3], #0x10\n"
1820                     "ldr s3, [a_ptr1], #0x4\n"
1821                     "ldr q8, [a_ptr4], #0x10\n"
1822                     "ldr s5, [a_ptr2], #0x4\n"
1823                     "ldr q10, [a_ptr5], #0x10\n"
1824                     "ldr s7, [a_ptr3], #0x4\n"
1825                     "ldr q12, [a_ptr6], #0x10\n"
1826                     "ldr s9, [a_ptr4], #0x4\n"
1827                     "ldr q14, [a_ptr7], #0x10\n"
1828                     "ldr s11, [a_ptr5], #0x4\n"
1829                     "ldr s13, [a_ptr6], #0x4\n"
1830                     "ldr s15, [a_ptr7], #0x4\n"
1831                     "b.ne 4f\n"
1832                     "ld1 {v1.b}[4], [%[a_ptr0]]\n"
1833                     "ld1 {v3.b}[4], [a_ptr1]\n"
1834                     "ld1 {v5.b}[4], [a_ptr2]\n"
1835                     "ld1 {v7.b}[4], [a_ptr3]\n"
1836                     "ld1 {v9.b}[4], [a_ptr4]\n"
1837                     "ld1 {v11.b}[4], [a_ptr5]\n"
1838                     "ld1 {v13.b}[4], [a_ptr6]\n"
1839                     "ld1 {v15.b}[4], [a_ptr7]\n"
1840                     "b 3f\n"
1841                     "4:\n"
1842                     "ld1 {v1.h}[2], [%[a_ptr0]], #2\n"
1843                     "ld1 {v3.h}[2], [a_ptr1], #2\n"
1844                     "ld1 {v5.h}[2], [a_ptr2], #2\n"
1845                     "ld1 {v7.h}[2], [a_ptr3], #2\n"
1846                     "ld1 {v9.h}[2], [a_ptr4], #2\n"
1847                     "ld1 {v11.h}[2], [a_ptr5], #2\n"
1848                     "ld1 {v13.h}[2], [a_ptr6], #2\n"
1849                     "ld1 {v15.h}[2], [a_ptr7], #2\n"
1850                     "subs %[odds], %[odds], #0x1\n"
1851                     "b.ne 5f\n"
1852                     "b 3f\n"
1853                     "5:\n"
1854                     "ld1 {v1.b}[6], [%[a_ptr0]]\n"
1855                     "ld1 {v3.b}[6], [a_ptr1]\n"
1856                     "ld1 {v5.b}[6], [a_ptr2]\n"
1857                     "ld1 {v7.b}[6], [a_ptr3]\n"
1858                     "ld1 {v9.b}[6], [a_ptr4]\n"
1859                     "ld1 {v11.b}[6], [a_ptr5]\n"
1860                     "ld1 {v13.b}[6], [a_ptr6]\n"
1861                     "ld1 {v15.b}[6], [a_ptr7]\n"
1862                     "3:\n"
1863                     "ldr q16, [%[b_ptr0]]\n"
1864                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1865                     "ldr q17, [%[b_ptr0], #0x10]\n"
1866                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1867                     "ldr q18, [%[b_ptr0], #0x20]\n"
1868                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1869                     "ldr q19, [%[b_ptr0], #0x30]\n"
1870                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1871                     "ldr q20, [%[b_ptr0], #0x40]\n"
1872                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1873                     "ldr q21, [%[b_ptr0], #0x50]\n"
1874                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1875                     "add %[b_ptr0], %[b_ptr0], #0x60\n"
1876                     "cbz %[loops], 6f\n"
1877                     "movi v24.4s, #0\n"
1878                     "subs %[loops], %[loops], #0x1\n"
1879                     "movi v25.4s, #0\n"
1880                     "movi v26.4s, #0\n"
1881                     "movi v27.4s, #0\n"
1882                     "movi v28.4s, #0\n"
1883                     "movi v29.4s, #0\n"
1884                     "movi v30.4s, #0\n"
1885                     "movi v31.4s, #0\n"
1886                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1887                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1888                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1889                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1890                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1891                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1892                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1893                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1894                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1895                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1896                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1897                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1898                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1899                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1900                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1901                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1902                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1903                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1904                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1905                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1906                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1907                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1908                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1909                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
1910                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
1911                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
1912                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
1913                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
1914                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
1915                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
1916                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
1917                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
1918                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
1919                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
1920                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
1921                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
1922                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
1923                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
1924                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
1925                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
1926                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
1927                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
1928                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
1929                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
1930                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
1931                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
1932                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
1933                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
1934                     "b.eq 7f\n"
1935                     "8:\n"
1936                     "str q24, [%[c_ptr0]]\n"
1937                     "subs %[loops], %[loops], #0x1\n"
1938                     "movi v24.4s, #0\n"
1939                     "ldr q16, [%[b_ptr0]]\n"
1940                     "ldr q17, [%[b_ptr0], #0x10]\n"
1941                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
1942                     "str q25, [c_ptr1]\n"
1943                     "add c_ptr1, c_ptr1, #0x10\n"
1944                     "movi v25.4s, #0\n"
1945                     "ldr q18, [%[b_ptr0], #0x20]\n"
1946                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
1947                     "str q26, [c_ptr2]\n"
1948                     "movi v26.4s, #0\n"
1949                     "ldr q19, [%[b_ptr0], #0x30]\n"
1950                     "ldr q20, [%[b_ptr0], #0x40]\n"
1951                     "add c_ptr2, c_ptr2, #0x10\n"
1952                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
1953                     "str q27, [c_ptr3]\n"
1954                     "movi v27.4s, #0\n"
1955                     "ldr q21, [%[b_ptr0], #0x50]\n"
1956                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
1957                     "add c_ptr3, c_ptr3, #0x10\n"
1958                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
1959                     "str q28, [c_ptr4]\n"
1960                     "movi v28.4s, #0\n"
1961                     "add c_ptr4, c_ptr4, #0x10\n"
1962                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
1963                     "str q29, [c_ptr5]\n"
1964                     "movi v29.4s, #0\n"
1965                     "add c_ptr5, c_ptr5, #0x10\n"
1966                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
1967                     "str q30, [c_ptr6]\n"
1968                     "movi v30.4s, #0\n"
1969                     "add c_ptr6, c_ptr6, #0x10\n"
1970                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
1971                     "str q31, [c_ptr7]\n"
1972                     "movi v31.4s, #0\n"
1973                     "add c_ptr7, c_ptr7, #0x10\n"
1974                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
1975                     "add %[b_ptr0], %[b_ptr0], #0x60\n"
1976                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
1977                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1978                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
1979                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1980                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
1981                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1982                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
1983                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1984                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
1985                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1986                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
1987                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1988                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
1989                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1990                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
1991                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1992                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
1993                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
1994                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
1995                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
1996                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
1997                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
1998                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
1999                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2000                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2001                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2002                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2003                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2004                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2005                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2006                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2007                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2008                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2009                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2010                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2011                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2012                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2013                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2014                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2015                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2016                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2017                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2018                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2019                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2020                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2021                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2022                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2023                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2024                     "b.ne 8b\n"
2025                     "7:\n"
2026                     "str q24, [%[c_ptr0]]\n"
2027                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
2028                     "movi v24.4s, #0\n"
2029                     "ldr q16, [%[b_ptr0]]\n"
2030                     "ldr q17, [%[b_ptr0], #0x10]\n"
2031                     "str q25, [c_ptr1]\n"
2032                     "add c_ptr1, c_ptr1, #0x10\n"
2033                     "movi v25.4s, #0\n"
2034                     "ldr q18, [%[b_ptr0], #0x20]\n"
2035                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2036                     "str q26, [c_ptr2]\n"
2037                     "movi v26.4s, #0\n"
2038                     "ldr q19, [%[b_ptr0], #0x30]\n"
2039                     "ldr q20, [%[b_ptr0], #0x40]\n"
2040                     "add c_ptr2, c_ptr2, #0x10\n"
2041                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2042                     "str q27, [c_ptr3]\n"
2043                     "movi v27.4s, #0\n"
2044                     "ldr q21, [%[b_ptr0], #0x50]\n"
2045                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2046                     "add c_ptr3, c_ptr3, #0x10\n"
2047                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2048                     "str q28, [c_ptr4]\n"
2049                     "movi v28.4s, #0\n"
2050                     "add c_ptr4, c_ptr4, #0x10\n"
2051                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2052                     "str q29, [c_ptr5]\n"
2053                     "movi v29.4s, #0\n"
2054                     "add c_ptr5, c_ptr5, #0x10\n"
2055                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2056                     "str q30, [c_ptr6]\n"
2057                     "movi v30.4s, #0\n"
2058                     "add c_ptr6, c_ptr6, #0x10\n"
2059                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2060                     "str q31, [c_ptr7]\n"
2061                     "movi v31.4s, #0\n"
2062                     "add c_ptr7, c_ptr7, #0x10\n"
2063                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2064                     "add %[b_ptr0], %[b_ptr0], #0x60\n"
2065                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2066                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2067                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2068                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2069                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2070                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2071                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2072                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2073                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2074                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2075                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2076                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2077                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2078                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2079                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2080                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2081                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2082                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2083                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2084                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2085                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2086                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2087                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2088                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2089                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2090                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2091                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2092                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2093                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2094                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2095                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2096                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2097                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2098                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2099                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2100                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2101                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2102                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2103                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2104                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2105                     "b 9f\n"
2106                     "6:\n"
2107                     "movi v24.4s, #0\n"
2108                     "movi v25.4s, #0\n"
2109                     "movi v26.4s, #0\n"
2110                     "movi v27.4s, #0\n"
2111                     "movi v28.4s, #0\n"
2112                     "movi v29.4s, #0\n"
2113                     "movi v30.4s, #0\n"
2114                     "movi v31.4s, #0\n"
2115                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2116                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2117                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2118                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2119                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2120                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2121                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2122                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2123                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2124                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2125                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2126                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2127                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2128                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2129                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2130                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2131                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2132                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2133                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2134                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2135                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2136                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2137                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2138                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2139                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2140                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2141                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2142                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2143                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2144                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2145                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2146                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2147                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2148                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2149                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2150                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2151                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2152                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2153                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2154                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2155                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2156                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2157                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2158                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2159                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2160                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2161                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2162                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2163                     "9:\n"
2164                     "str q24, [%[c_ptr0]]\n"
2165                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
2166                     "str q25, [c_ptr1]\n"
2167                     "str q26, [c_ptr2]\n"
2168                     "str q27, [c_ptr3]\n"
2169                     "str q28, [c_ptr4]\n"
2170                     "str q29, [c_ptr5]\n"
2171                     "str q30, [c_ptr6]\n"
2172                     "str q31, [c_ptr7]\n"
2173                     ".unreq a_ptr1\n"
2174                     ".unreq a_ptr2\n"
2175                     ".unreq a_ptr3\n"
2176                     ".unreq a_ptr4\n"
2177                     ".unreq a_ptr5\n"
2178                     ".unreq a_ptr6\n"
2179                     ".unreq a_ptr7\n"
2180                     ".unreq c_ptr1\n"
2181                     ".unreq c_ptr2\n"
2182                     ".unreq c_ptr3\n"
2183                     ".unreq c_ptr4\n"
2184                     ".unreq c_ptr5\n"
2185                     ".unreq c_ptr6\n"
2186                     ".unreq c_ptr7\n"
2187                     : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
2188                     : [lda] "r" (ldab), [ldc] "r" (ldcb)
2189                     : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
2190                 );
2191                 break;
            // Variant for rows of A that are 7 four-byte groups long: each row is
            // read as 16 + 8 bytes plus a 1..4 byte tail, and every output element
            // is accumulated with 7 sdot steps.
            // NOTE(review): presumably selected by a switch on K after
            // K = (K + 3) / 4 in the function prologue - the switch header is not
            // visible here, confirm.
            case 7:
                __asm __volatile (
                    // Assembler aliases for the scratch registers holding the A and C
                    // pointers of rows 1..7 (row 0 uses the a_ptr0/c_ptr0 operands).
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    // Each row pointer is the previous row's pointer plus the byte
                    // stride passed in as the lda / ldc operand.
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    // Out-of-bounds rows: for each count in oob_rows, starting at row 7
                    // and working down, point that row's A and C pointers at row 0.
                    // Such a row then reads row 0's input and stores to row 0's
                    // output addresses instead of touching memory past the last row.
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    // 1: load the A data of all 8 rows once; it stays in v0-v15 for the
                    // whole column loop. Row r uses the register pair v(2r), v(2r+1):
                    // the even register takes bytes 0..15, the odd one bytes 16..23.
                    // The 64-bit "ldr d" loads clear the upper half of the odd
                    // registers, so tail bytes not loaded below remain zero.
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q2, [a_ptr1], #0x10\n"
                    "ldr q4, [a_ptr2], #0x10\n"
                    "ldr q6, [a_ptr3], #0x10\n"
                    "ldr d1, [%[a_ptr0]], #0x8\n"
                    "ldr q8, [a_ptr4], #0x10\n"
                    "ldr d3, [a_ptr1], #0x8\n"
                    "ldr q10, [a_ptr5], #0x10\n"
                    "ldr d5, [a_ptr2], #0x8\n"
                    "ldr q12, [a_ptr6], #0x10\n"
                    "ldr d7, [a_ptr3], #0x8\n"
                    "ldr q14, [a_ptr7], #0x10\n"
                    "ldr d9, [a_ptr4], #0x8\n"
                    "ldr d11, [a_ptr5], #0x8\n"
                    "ldr d13, [a_ptr6], #0x8\n"
                    "ldr d15, [a_ptr7], #0x8\n"
                    // Tail of each row (row bytes 24..27), selected by odds.
                    // NOTE(review): odds is presumably K % 4 of the caller's K
                    // (odds_count in the prologue) - its initialisation is not
                    // visible here.
                    // odds == 0: the last group is complete, load all 4 bytes.
                    "cbnz %[odds], 2f\n"
                    "ld1 {v1.s}[2], [%[a_ptr0]]\n"
                    "ld1 {v3.s}[2], [a_ptr1]\n"
                    "ld1 {v5.s}[2], [a_ptr2]\n"
                    "ld1 {v7.s}[2], [a_ptr3]\n"
                    "ld1 {v9.s}[2], [a_ptr4]\n"
                    "ld1 {v11.s}[2], [a_ptr5]\n"
                    "ld1 {v13.s}[2], [a_ptr6]\n"
                    "ld1 {v15.s}[2], [a_ptr7]\n"
                    "b 3f\n"
                    // 2: odds != 0. If odds was exactly 1, load a single byte.
                    "2:\n"
                    "subs %[odds], %[odds], #0x1\n"
                    "b.ne 4f\n"
                    "ld1 {v1.b}[8], [%[a_ptr0]]\n"
                    "ld1 {v3.b}[8], [a_ptr1]\n"
                    "ld1 {v5.b}[8], [a_ptr2]\n"
                    "ld1 {v7.b}[8], [a_ptr3]\n"
                    "ld1 {v9.b}[8], [a_ptr4]\n"
                    "ld1 {v11.b}[8], [a_ptr5]\n"
                    "ld1 {v13.b}[8], [a_ptr6]\n"
                    "ld1 {v15.b}[8], [a_ptr7]\n"
                    "b 3f\n"
                    // 4: odds was 2 or 3: load two bytes and step the pointers past them.
                    "4:\n"
                    "ld1 {v1.h}[4], [%[a_ptr0]], #2\n"
                    "ld1 {v3.h}[4], [a_ptr1], #2\n"
                    "ld1 {v5.h}[4], [a_ptr2], #2\n"
                    "ld1 {v7.h}[4], [a_ptr3], #2\n"
                    "ld1 {v9.h}[4], [a_ptr4], #2\n"
                    "ld1 {v11.h}[4], [a_ptr5], #2\n"
                    "ld1 {v13.h}[4], [a_ptr6], #2\n"
                    "ld1 {v15.h}[4], [a_ptr7], #2\n"
                    "subs %[odds], %[odds], #0x1\n"
                    "b.ne 5f\n"
                    "b 3f\n"
                    // 5: odds was 3: load the third tail byte.
                    "5:\n"
                    "ld1 {v1.b}[10], [%[a_ptr0]]\n"
                    "ld1 {v3.b}[10], [a_ptr1]\n"
                    "ld1 {v5.b}[10], [a_ptr2]\n"
                    "ld1 {v7.b}[10], [a_ptr3]\n"
                    "ld1 {v9.b}[10], [a_ptr4]\n"
                    "ld1 {v11.b}[10], [a_ptr5]\n"
                    "ld1 {v13.b}[10], [a_ptr6]\n"
                    "ld1 {v15.b}[10], [a_ptr7]\n"
                    // 3: load the 7 B vectors (0x70 bytes) of the first block of four
                    // output columns into v16-v22 and advance b_ptr0 past them.
                    // NOTE(review): all six interleaved prefetches use a_ptr7 as
                    // their base (+0x40 .. +0x180); confirm this is intended.
                    "3:\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                    // loops == 0 means there is only one column block: use the
                    // simple path at 6.
                    "cbz %[loops], 6f\n"
                    // First block of the pipelined path: clear the accumulators
                    // (v24-v31, one per row, 4 x int32 each) and accumulate the 7
                    // groups. The flags set by this subs are consumed by the b.eq
                    // below; none of the instructions in between modify them.
                    "movi v24.4s, #0\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "movi v25.4s, #0\n"
                    "movi v26.4s, #0\n"
                    "movi v27.4s, #0\n"
                    "movi v28.4s, #0\n"
                    "movi v29.4s, #0\n"
                    "movi v30.4s, #0\n"
                    "movi v31.4s, #0\n"
                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
                    // loops reached zero in the subs above: skip the steady-state loop.
                    "b.eq 7f\n"
                    // 8: steady-state loop. Stores the previous block's results
                    // (16 bytes per row), advances the C pointers, clears the
                    // accumulators, loads the next 7 B vectors and accumulates them,
                    // interleaved with store prefetches of the C rows. The subs at
                    // the top sets the flags tested by the b.ne at the bottom.
                    "8:\n"
                    "str q24, [%[c_ptr0]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "movi v24.4s, #0\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q25, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "movi v25.4s, #0\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                    "str q26, [c_ptr2]\n"
                    "movi v26.4s, #0\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                    "str q27, [c_ptr3]\n"
                    "movi v27.4s, #0\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                    "str q28, [c_ptr4]\n"
                    "movi v28.4s, #0\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                    "str q29, [c_ptr5]\n"
                    "movi v29.4s, #0\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                    "str q30, [c_ptr6]\n"
                    "movi v30.4s, #0\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                    "str q31, [c_ptr7]\n"
                    "movi v31.4s, #0\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
                    "b.ne 8b\n"
                    // 7: last block of the pipelined path. Same store / reload /
                    // accumulate sequence as the loop body, but without the counter
                    // update and the C prefetches; its results are stored at 9.
                    "7:\n"
                    "str q24, [%[c_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "movi v24.4s, #0\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "str q25, [c_ptr1]\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "movi v25.4s, #0\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                    "str q26, [c_ptr2]\n"
                    "movi v26.4s, #0\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                    "str q27, [c_ptr3]\n"
                    "movi v27.4s, #0\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                    "str q28, [c_ptr4]\n"
                    "movi v28.4s, #0\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                    "str q29, [c_ptr5]\n"
                    "movi v29.4s, #0\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                    "str q30, [c_ptr6]\n"
                    "movi v30.4s, #0\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                    "str q31, [c_ptr7]\n"
                    "movi v31.4s, #0\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
                    "b 9f\n"
                    // 6: single-block path (loops == 0 on entry): clear the
                    // accumulators and accumulate the B vectors already loaded at 3.
                    "6:\n"
                    "movi v24.4s, #0\n"
                    "movi v25.4s, #0\n"
                    "movi v26.4s, #0\n"
                    "movi v27.4s, #0\n"
                    "movi v28.4s, #0\n"
                    "movi v29.4s, #0\n"
                    "movi v30.4s, #0\n"
                    "movi v31.4s, #0\n"
                    ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
                    ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
                    ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
                    ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
                    ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
                    ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
                    ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
                    ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
                    ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
                    ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
                    ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
                    ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
                    ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
                    ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
                    ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
                    ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
                    ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
                    ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
                    ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
                    ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
                    ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
                    ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
                    ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
                    ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
                    ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
                    ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
                    ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
                    ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
                    ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
                    ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
                    ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
                    ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
                    ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
                    ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
                    ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
                    ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
                    ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
                    ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
                    ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
                    ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
                    ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
                    ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
                    ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
                    ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
                    ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
                    ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
                    ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
                    ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
                    ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
                    ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
                    ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
                    ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
                    ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
                    ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
                    ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
                    ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
                    // 9: store the final block of results. Only c_ptr0 is advanced
                    // here: it is an in/out operand, whereas the other C pointers
                    // live in scratch registers that are released below.
                    "9:\n"
                    "str q24, [%[c_ptr0]]\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "str q25, [c_ptr1]\n"
                    "str q26, [c_ptr2]\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    // Drop the register aliases so other asm statements in this
                    // file can define the same names again.
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    // In/out operands: all six are modified by the assembly above.
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
                    // Inputs: row strides of A and C in bytes (ldab / ldcb).
                    : [lda] "r" (ldab), [ldc] "r" (ldcb)
                    // Clobbers: x0-x13 back the pointer aliases; all vector registers are listed.
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
2675             default:
2676             case 8:
2677                 __asm __volatile (
2678                     "a_ptr1 .req X0\n"
2679                     "a_ptr2 .req X1\n"
2680                     "a_ptr3 .req X2\n"
2681                     "a_ptr4 .req X3\n"
2682                     "a_ptr5 .req X4\n"
2683                     "a_ptr6 .req X5\n"
2684                     "a_ptr7 .req X6\n"
2685                     "c_ptr1 .req X7\n"
2686                     "c_ptr2 .req X8\n"
2687                     "c_ptr3 .req X9\n"
2688                     "c_ptr4 .req X10\n"
2689                     "c_ptr5 .req X11\n"
2690                     "c_ptr6 .req X12\n"
2691                     "c_ptr7 .req X13\n"
2692                     "add a_ptr1, %[a_ptr0], %[lda]\n"
2693                     "add c_ptr1, %[c_ptr0], %[ldc]\n"
2694                     "add a_ptr2, a_ptr1, %[lda]\n"
2695                     "add c_ptr2, c_ptr1, %[ldc]\n"
2696                     "add a_ptr3, a_ptr2, %[lda]\n"
2697                     "add c_ptr3, c_ptr2, %[ldc]\n"
2698                     "add a_ptr4, a_ptr3, %[lda]\n"
2699                     "add c_ptr4, c_ptr3, %[ldc]\n"
2700                     "add a_ptr5, a_ptr4, %[lda]\n"
2701                     "add c_ptr5, c_ptr4, %[ldc]\n"
2702                     "add a_ptr6, a_ptr5, %[lda]\n"
2703                     "add c_ptr6, c_ptr5, %[ldc]\n"
2704                     "add a_ptr7, a_ptr6, %[lda]\n"
2705                     "add c_ptr7, c_ptr6, %[ldc]\n"
2706                     "cbz %[oob_rows], 1f\n"
2707                     "subs %[oob_rows], %[oob_rows], #0x1\n"
2708                     "add c_ptr7, %[c_ptr0], #0x0\n"
2709                     "add a_ptr7, %[a_ptr0], #0x0\n"
2710                     "b.eq 1f\n"
2711                     "subs %[oob_rows], %[oob_rows], #0x1\n"
2712                     "add c_ptr6, %[c_ptr0], #0x0\n"
2713                     "add a_ptr6, %[a_ptr0], #0x0\n"
2714                     "b.eq 1f\n"
2715                     "subs %[oob_rows], %[oob_rows], #0x1\n"
2716                     "add c_ptr5, %[c_ptr0], #0x0\n"
2717                     "add a_ptr5, %[a_ptr0], #0x0\n"
2718                     "b.eq 1f\n"
2719                     "subs %[oob_rows], %[oob_rows], #0x1\n"
2720                     "add c_ptr4, %[c_ptr0], #0x0\n"
2721                     "add a_ptr4, %[a_ptr0], #0x0\n"
2722                     "b.eq 1f\n"
2723                     "subs %[oob_rows], %[oob_rows], #0x1\n"
2724                     "add c_ptr3, %[c_ptr0], #0x0\n"
2725                     "add a_ptr3, %[a_ptr0], #0x0\n"
2726                     "b.eq 1f\n"
2727                     "subs %[oob_rows], %[oob_rows], #0x1\n"
2728                     "add c_ptr2, %[c_ptr0], #0x0\n"
2729                     "add a_ptr2, %[a_ptr0], #0x0\n"
2730                     "b.eq 1f\n"
2731                     "subs %[oob_rows], %[oob_rows], #0x1\n"
2732                     "add c_ptr1, %[c_ptr0], #0x0\n"
2733                     "add a_ptr1, %[a_ptr0], #0x0\n"
2734                     "1:\n"
2735                     "cbnz %[odds], 2f\n"
2736                     "ldr q0, [%[a_ptr0]], #0x10\n"
2737                     "ldr q2, [a_ptr1], #0x10\n"
2738                     "ldr q4, [a_ptr2], #0x10\n"
2739                     "ldr q6, [a_ptr3], #0x10\n"
2740                     "ldr q8, [a_ptr4], #0x10\n"
2741                     "ldr q10, [a_ptr5], #0x10\n"
2742                     "ldr q12, [a_ptr6], #0x10\n"
2743                     "ldr q14, [a_ptr7], #0x10\n"
2744                     "ldr q1, [%[a_ptr0]]\n"
2745                     "ldr q3, [a_ptr1]\n"
2746                     "ldr q5, [a_ptr2]\n"
2747                     "ldr q7, [a_ptr3]\n"
2748                     "ldr q9, [a_ptr4]\n"
2749                     "ldr q11, [a_ptr5]\n"
2750                     "ldr q13, [a_ptr6]\n"
2751                     "ldr q15, [a_ptr7]\n"
2752                     "b 3f\n"
2753                     "2:\n"
2754                     "ldr q0, [%[a_ptr0]], #0x10\n"
2755                     "subs %[odds], %[odds], #0x1\n"
2756                     "ldr q2, [a_ptr1], #0x10\n"
2757                     "ldr q4, [a_ptr2], #0x10\n"
2758                     "ldr d1, [%[a_ptr0]], #0x8\n"
2759                     "ldr q6, [a_ptr3], #0x10\n"
2760                     "ldr d3, [a_ptr1], #0x8\n"
2761                     "ldr q8, [a_ptr4], #0x10\n"
2762                     "ldr d5, [a_ptr2], #0x8\n"
2763                     "ldr q10, [a_ptr5], #0x10\n"
2764                     "ldr d7, [a_ptr3], #0x8\n"
2765                     "ldr q12, [a_ptr6], #0x10\n"
2766                     "ldr d9, [a_ptr4], #0x8\n"
2767                     "ldr q14, [a_ptr7], #0x10\n"
2768                     "ldr d11, [a_ptr5], #0x8\n"
2769                     "ldr d13, [a_ptr6], #0x8\n"
2770                     "ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
2771                     "ldr d15, [a_ptr7], #0x8\n"
2772                     "ld1 {v3.s}[2], [a_ptr1], #4\n"
2773                     "ld1 {v5.s}[2], [a_ptr2], #4\n"
2774                     "ld1 {v7.s}[2], [a_ptr3], #4\n"
2775                     "ld1 {v9.s}[2], [a_ptr4], #4\n"
2776                     "ld1 {v11.s}[2], [a_ptr5], #4\n"
2777                     "ld1 {v13.s}[2], [a_ptr6], #4\n"
2778                     "ld1 {v15.s}[2], [a_ptr7], #4\n"
2779                     "b.ne 4f\n"
2780                     "ld1 {v1.b}[12], [%[a_ptr0]]\n"
2781                     "ld1 {v3.b}[12], [a_ptr1]\n"
2782                     "ld1 {v5.b}[12], [a_ptr2]\n"
2783                     "ld1 {v7.b}[12], [a_ptr3]\n"
2784                     "ld1 {v9.b}[12], [a_ptr4]\n"
2785                     "ld1 {v11.b}[12], [a_ptr5]\n"
2786                     "ld1 {v13.b}[12], [a_ptr6]\n"
2787                     "ld1 {v15.b}[12], [a_ptr7]\n"
2788                     "b 3f\n"
2789                     "4:\n"
2790                     "ld1 {v1.h}[6], [%[a_ptr0]], #2\n"
2791                     "ld1 {v3.h}[6], [a_ptr1], #2\n"
2792                     "ld1 {v5.h}[6], [a_ptr2], #2\n"
2793                     "ld1 {v7.h}[6], [a_ptr3], #2\n"
2794                     "ld1 {v9.h}[6], [a_ptr4], #2\n"
2795                     "ld1 {v11.h}[6], [a_ptr5], #2\n"
2796                     "ld1 {v13.h}[6], [a_ptr6], #2\n"
2797                     "ld1 {v15.h}[6], [a_ptr7], #2\n"
2798                     "subs %[odds], %[odds], #0x1\n"
2799                     "b.ne 5f\n"
2800                     "b 3f\n"
2801                     "5:\n"
2802                     "ld1 {v1.b}[14], [%[a_ptr0]]\n"
2803                     "ld1 {v3.b}[14], [a_ptr1]\n"
2804                     "ld1 {v5.b}[14], [a_ptr2]\n"
2805                     "ld1 {v7.b}[14], [a_ptr3]\n"
2806                     "ld1 {v9.b}[14], [a_ptr4]\n"
2807                     "ld1 {v11.b}[14], [a_ptr5]\n"
2808                     "ld1 {v13.b}[14], [a_ptr6]\n"
2809                     "ld1 {v15.b}[14], [a_ptr7]\n"
2810                     "3:\n"
2811                     "ldr q16, [%[b_ptr0]]\n"
2812                     "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
2813                     "ldr q17, [%[b_ptr0], #0x10]\n"
2814                     "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
2815                     "ldr q18, [%[b_ptr0], #0x20]\n"
2816                     "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
2817                     "ldr q19, [%[b_ptr0], #0x30]\n"
2818                     "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
2819                     "ldr q20, [%[b_ptr0], #0x40]\n"
2820                     "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
2821                     "ldr q21, [%[b_ptr0], #0x50]\n"
2822                     "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
2823                     "ldr q22, [%[b_ptr0], #0x60]\n"
2824                     "ldr q23, [%[b_ptr0], #0x70]\n"
2825                     "add %[b_ptr0], %[b_ptr0], #0x80\n"
2826                     "cbz %[loops], 6f\n"
2827                     "movi v24.4s, #0\n"
2828                     "subs %[loops], %[loops], #0x1\n"
2829                     "movi v25.4s, #0\n"
2830                     "movi v26.4s, #0\n"
2831                     "movi v27.4s, #0\n"
2832                     "movi v28.4s, #0\n"
2833                     "movi v29.4s, #0\n"
2834                     "movi v30.4s, #0\n"
2835                     "movi v31.4s, #0\n"
2836                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2837                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2838                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2839                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2840                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2841                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2842                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2843                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2844                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2845                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2846                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2847                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2848                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2849                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2850                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2851                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2852                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2853                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2854                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2855                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2856                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2857                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2858                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2859                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2860                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2861                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2862                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2863                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2864                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2865                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2866                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2867                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2868                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2869                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2870                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2871                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2872                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2873                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2874                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2875                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2876                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2877                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2878                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2879                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2880                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2881                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2882                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2883                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2884                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
2885                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
2886                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
2887                     ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
2888                     ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
2889                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
2890                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
2891                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
2892                     ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
2893                     ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
2894                     ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
2895                     ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
2896                     ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
2897                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
2898                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
2899                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
2900                     "b.eq 7f\n"
2901                     "8:\n"
2902                     "str q24, [%[c_ptr0]]\n"
2903                     "subs %[loops], %[loops], #0x1\n"
2904                     "movi v24.4s, #0\n"
2905                     "ldr q16, [%[b_ptr0]]\n"
2906                     "ldr q17, [%[b_ptr0], #0x10]\n"
2907                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
2908                     "str q25, [c_ptr1]\n"
2909                     "add c_ptr1, c_ptr1, #0x10\n"
2910                     "movi v25.4s, #0\n"
2911                     "ldr q18, [%[b_ptr0], #0x20]\n"
2912                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
2913                     "str q26, [c_ptr2]\n"
2914                     "movi v26.4s, #0\n"
2915                     "ldr q19, [%[b_ptr0], #0x30]\n"
2916                     "ldr q20, [%[b_ptr0], #0x40]\n"
2917                     "add c_ptr2, c_ptr2, #0x10\n"
2918                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
2919                     "str q27, [c_ptr3]\n"
2920                     "movi v27.4s, #0\n"
2921                     "ldr q21, [%[b_ptr0], #0x50]\n"
2922                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
2923                     "ldr q22, [%[b_ptr0], #0x60]\n"
2924                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
2925                     "str q28, [c_ptr4]\n"
2926                     "movi v28.4s, #0\n"
2927                     "ldr q23, [%[b_ptr0], #0x70]\n"
2928                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
2929                     "add c_ptr3, c_ptr3, #0x10\n"
2930                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
2931                     "str q29, [c_ptr5]\n"
2932                     "movi v29.4s, #0\n"
2933                     "add c_ptr4, c_ptr4, #0x10\n"
2934                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
2935                     "str q30, [c_ptr6]\n"
2936                     "movi v30.4s, #0\n"
2937                     "add c_ptr5, c_ptr5, #0x10\n"
2938                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
2939                     "str q31, [c_ptr7]\n"
2940                     "movi v31.4s, #0\n"
2941                     "add c_ptr6, c_ptr6, #0x10\n"
2942                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
2943                     "add c_ptr7, c_ptr7, #0x10\n"
2944                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
2945                     "add %[b_ptr0], %[b_ptr0], #0x80\n"
2946                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
2947                     "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2948                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
2949                     "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2950                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
2951                     "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2952                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
2953                     "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2954                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
2955                     "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2956                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
2957                     "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2958                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
2959                     "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
2960                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
2961                     "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
2962                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
2963                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
2964                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
2965                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
2966                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
2967                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
2968                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
2969                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
2970                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
2971                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
2972                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
2973                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
2974                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
2975                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
2976                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
2977                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
2978                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
2979                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
2980                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
2981                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
2982                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
2983                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
2984                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
2985                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
2986                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
2987                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
2988                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
2989                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
2990                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
2991                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
2992                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
2993                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
2994                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
2995                     ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
2996                     ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
2997                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
2998                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
2999                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
3000                     ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
3001                     ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
3002                     ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
3003                     ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
3004                     ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
3005                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
3006                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
3007                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
3008                     "b.ne 8b\n"
3009                     "7:\n"
3010                     "str q24, [%[c_ptr0]]\n"
3011                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
3012                     "movi v24.4s, #0\n"
3013                     "ldr q16, [%[b_ptr0]]\n"
3014                     "ldr q17, [%[b_ptr0], #0x10]\n"
3015                     "str q25, [c_ptr1]\n"
3016                     "add c_ptr1, c_ptr1, #0x10\n"
3017                     "movi v25.4s, #0\n"
3018                     "ldr q18, [%[b_ptr0], #0x20]\n"
3019                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
3020                     "str q26, [c_ptr2]\n"
3021                     "movi v26.4s, #0\n"
3022                     "ldr q19, [%[b_ptr0], #0x30]\n"
3023                     "ldr q20, [%[b_ptr0], #0x40]\n"
3024                     "add c_ptr2, c_ptr2, #0x10\n"
3025                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
3026                     "str q27, [c_ptr3]\n"
3027                     "movi v27.4s, #0\n"
3028                     "ldr q21, [%[b_ptr0], #0x50]\n"
3029                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
3030                     "ldr q22, [%[b_ptr0], #0x60]\n"
3031                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
3032                     "str q28, [c_ptr4]\n"
3033                     "movi v28.4s, #0\n"
3034                     "ldr q23, [%[b_ptr0], #0x70]\n"
3035                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
3036                     "add c_ptr3, c_ptr3, #0x10\n"
3037                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
3038                     "str q29, [c_ptr5]\n"
3039                     "movi v29.4s, #0\n"
3040                     "add c_ptr4, c_ptr4, #0x10\n"
3041                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
3042                     "str q30, [c_ptr6]\n"
3043                     "movi v30.4s, #0\n"
3044                     "add c_ptr5, c_ptr5, #0x10\n"
3045                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
3046                     "str q31, [c_ptr7]\n"
3047                     "movi v31.4s, #0\n"
3048                     "add c_ptr6, c_ptr6, #0x10\n"
3049                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
3050                     "add c_ptr7, c_ptr7, #0x10\n"
3051                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
3052                     "add %[b_ptr0], %[b_ptr0], #0x80\n"
3053                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
3054                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
3055                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
3056                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
3057                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
3058                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
3059                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
3060                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
3061                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
3062                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
3063                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
3064                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
3065                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
3066                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
3067                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
3068                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
3069                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
3070                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
3071                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
3072                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
3073                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
3074                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
3075                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
3076                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
3077                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
3078                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
3079                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
3080                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
3081                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
3082                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
3083                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
3084                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
3085                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
3086                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
3087                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
3088                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
3089                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
3090                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
3091                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
3092                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
3093                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
3094                     ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
3095                     ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
3096                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
3097                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
3098                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
3099                     ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
3100                     ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
3101                     ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
3102                     ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
3103                     ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
3104                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
3105                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
3106                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
3107                     "b 9f\n"
3108                     "6:\n"
3109                     "movi v24.4s, #0\n"
3110                     "movi v25.4s, #0\n"
3111                     "movi v26.4s, #0\n"
3112                     "movi v27.4s, #0\n"
3113                     "movi v28.4s, #0\n"
3114                     "movi v29.4s, #0\n"
3115                     "movi v30.4s, #0\n"
3116                     "movi v31.4s, #0\n"
3117                     ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
3118                     ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
3119                     ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
3120                     ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
3121                     ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
3122                     ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
3123                     ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
3124                     ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
3125                     ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
3126                     ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
3127                     ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
3128                     ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
3129                     ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
3130                     ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
3131                     ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
3132                     ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
3133                     ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
3134                     ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
3135                     ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
3136                     ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
3137                     ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
3138                     ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
3139                     ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
3140                     ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
3141                     ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
3142                     ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
3143                     ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
3144                     ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
3145                     ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
3146                     ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
3147                     ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
3148                     ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
3149                     ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
3150                     ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
3151                     ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
3152                     ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
3153                     ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
3154                     ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
3155                     ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
3156                     ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
3157                     ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
3158                     ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
3159                     ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
3160                     ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
3161                     ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
3162                     ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
3163                     ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
3164                     ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
3165                     ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
3166                     ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
3167                     ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
3168                     ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
3169                     ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
3170                     ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
3171                     ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
3172                     ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
3173                     ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
3174                     ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
3175                     ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
3176                     ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
3177                     ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
3178                     ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
3179                     ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
3180                     ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
3181                     "9:\n"
3182                     "str q24, [%[c_ptr0]]\n"
3183                     "add %[c_ptr0], %[c_ptr0], #0x10\n"
3184                     "str q25, [c_ptr1]\n"
3185                     "str q26, [c_ptr2]\n"
3186                     "str q27, [c_ptr3]\n"
3187                     "str q28, [c_ptr4]\n"
3188                     "str q29, [c_ptr5]\n"
3189                     "str q30, [c_ptr6]\n"
3190                     "str q31, [c_ptr7]\n"
3191                     ".unreq a_ptr1\n"
3192                     ".unreq a_ptr2\n"
3193                     ".unreq a_ptr3\n"
3194                     ".unreq a_ptr4\n"
3195                     ".unreq a_ptr5\n"
3196                     ".unreq a_ptr6\n"
3197                     ".unreq a_ptr7\n"
3198                     ".unreq c_ptr1\n"
3199                     ".unreq c_ptr2\n"
3200                     ".unreq c_ptr3\n"
3201                     ".unreq c_ptr4\n"
3202                     ".unreq c_ptr5\n"
3203                     ".unreq c_ptr6\n"
3204                     ".unreq c_ptr7\n"
3205                     : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [odds] "+r" (odds)
3206                     : [lda] "r" (ldab), [ldc] "r" (ldcb)
3207                     : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
3208                 );
3209                 break;
3210         }
3211     }
3212 }
3213 
3214 } // namespace arm_gemm
3215 
3216 #endif // __aarch64__
3217