1 /*
2 * Copyright (c) 2019-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #ifdef __aarch64__
25
26 #include <algorithm>
27
28 #include "arm_gemm.hpp"
29
30
31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
33
34 namespace arm_gemm {
35
a64_smallK_hybrid_fp32_mla_6x4(const float * A,int lda,const float * B,float * C,int ldc,int M,int N,int K,const float * bias,Activation act,bool)36 void a64_smallK_hybrid_fp32_mla_6x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
37 const long loops_count = iceildiv(N, (int)4) - 1;
38 const long ldab = lda * sizeof(float);
39 const long ldcb = ldc * sizeof(float);
40 float nullbias[4];
41 if (!bias) {
42 memset(nullbias, 0, (4 * sizeof(float)));
43 }
44 float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
45 float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
46 const float * const minptr = &minval;
47 const float * const maxptr = &maxval;
48
49 switch(act.type)
50 {
51 default:
52 case Activation::Type::None:
53 break;
54 case Activation::Type::BoundedReLU:
55 maxval = static_cast<float>(act.param1);
56 /* fall through */
57 case Activation::Type::ReLU:
58 minval = 0.0f;
59 break;
60 }
61
62 for (int y0=0; y0<M; y0+=6) {
63 long loops = loops_count;
64 long oob_rows = std::max(6 - (M-y0), 0);
65 const float *b_ptr0 = B;
66 const float *biasptr = bias ? bias : nullbias;
67 const uint64_t biasinc = bias ? 4*sizeof(float) : 0;
68 const float *a_ptr0 = A + (y0 * lda);
69
70 float *c_ptr0 = C + (y0 * ldc);
71
72 switch(K) {
73 case 9:
74 __asm __volatile (
75 "a_ptr1 .req X0\n"
76 "a_ptr2 .req X1\n"
77 "a_ptr3 .req X2\n"
78 "a_ptr4 .req X3\n"
79 "a_ptr5 .req X4\n"
80 "c_ptr1 .req X5\n"
81 "c_ptr2 .req X6\n"
82 "c_ptr3 .req X7\n"
83 "c_ptr4 .req X8\n"
84 "c_ptr5 .req X9\n"
85 "add a_ptr1, %[a_ptr0], %[lda]\n"
86 "add c_ptr1, %[c_ptr0], %[ldc]\n"
87 "add a_ptr2, a_ptr1, %[lda]\n"
88 "add c_ptr2, c_ptr1, %[ldc]\n"
89 "add a_ptr3, a_ptr2, %[lda]\n"
90 "add c_ptr3, c_ptr2, %[ldc]\n"
91 "add a_ptr4, a_ptr3, %[lda]\n"
92 "add c_ptr4, c_ptr3, %[ldc]\n"
93 "add a_ptr5, a_ptr4, %[lda]\n"
94 "add c_ptr5, c_ptr4, %[ldc]\n"
95 "cbz %[oob_rows], 1f\n"
96 "subs %[oob_rows], %[oob_rows], #0x1\n"
97 "add c_ptr5, %[c_ptr0], #0x0\n"
98 "add a_ptr5, %[a_ptr0], #0x0\n"
99 "b.eq 1f\n"
100 "subs %[oob_rows], %[oob_rows], #0x1\n"
101 "add c_ptr4, %[c_ptr0], #0x0\n"
102 "add a_ptr4, %[a_ptr0], #0x0\n"
103 "b.eq 1f\n"
104 "subs %[oob_rows], %[oob_rows], #0x1\n"
105 "add c_ptr3, %[c_ptr0], #0x0\n"
106 "add a_ptr3, %[a_ptr0], #0x0\n"
107 "b.eq 1f\n"
108 "subs %[oob_rows], %[oob_rows], #0x1\n"
109 "add c_ptr2, %[c_ptr0], #0x0\n"
110 "add a_ptr2, %[a_ptr0], #0x0\n"
111 "b.eq 1f\n"
112 "subs %[oob_rows], %[oob_rows], #0x1\n"
113 "add c_ptr1, %[c_ptr0], #0x0\n"
114 "add a_ptr1, %[a_ptr0], #0x0\n"
115 "1:\n"
116 "ldr q0, [%[a_ptr0]], #0x10\n"
117 "ldr q3, [a_ptr1], #0x10\n"
118 "ldr q6, [a_ptr2], #0x10\n"
119 "ldr q9, [a_ptr3], #0x10\n"
120 "ldr q12, [a_ptr4], #0x10\n"
121 "ldr q15, [a_ptr5], #0x10\n"
122 "ldr q1, [%[a_ptr0]], #0x10\n"
123 "ldr q4, [a_ptr1], #0x10\n"
124 "ldr q7, [a_ptr2], #0x10\n"
125 "ldr q10, [a_ptr3], #0x10\n"
126 "ldr s2, [%[a_ptr0]]\n"
127 "ldr q13, [a_ptr4], #0x10\n"
128 "ldr s5, [a_ptr1]\n"
129 "ldr q16, [a_ptr5], #0x10\n"
130 "ldr s8, [a_ptr2]\n"
131 "ldr q18, [%[b_ptr0]]\n"
132 "ldr s11, [a_ptr3]\n"
133 "ldr q19, [%[b_ptr0], #0x10]\n"
134 "ldr s14, [a_ptr4]\n"
135 "ldr q20, [%[b_ptr0], #0x20]\n"
136 "ldr s17, [a_ptr5]\n"
137 "ldr q21, [%[b_ptr0], #0x30]\n"
138 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
139 "ldr q22, [%[b_ptr0], #0x40]\n"
140 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
141 "ldr q23, [%[b_ptr0], #0x50]\n"
142 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
143 "ldr q24, [%[b_ptr0], #0x60]\n"
144 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
145 "ldr q25, [%[b_ptr0], #0x70]\n"
146 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
147 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
148 "add %[b_ptr0], %[b_ptr0], #0x80\n"
149 "cbz %[loops], 2f\n"
150 "ldr q26, [%[biasptr]]\n"
151 "add %[biasptr], %[biasptr], %[biasinc]\n"
152 "subs %[loops], %[loops], #0x1\n"
153 "mov v27.16b, v26.16b\n"
154 "mov v28.16b, v26.16b\n"
155 "mov v29.16b, v26.16b\n"
156 "mov v30.16b, v26.16b\n"
157 "mov v31.16b, v26.16b\n"
158 "fmla v26.4s, v18.4s, v0.s[0]\n"
159 "fmla v27.4s, v18.4s, v3.s[0]\n"
160 "fmla v28.4s, v18.4s, v6.s[0]\n"
161 "fmla v29.4s, v18.4s, v9.s[0]\n"
162 "fmla v30.4s, v18.4s, v12.s[0]\n"
163 "fmla v31.4s, v18.4s, v15.s[0]\n"
164 "ldr q18, [%[b_ptr0]]\n"
165 "fmla v26.4s, v19.4s, v0.s[1]\n"
166 "add %[b_ptr0], %[b_ptr0], #0x10\n"
167 "fmla v27.4s, v19.4s, v3.s[1]\n"
168 "fmla v28.4s, v19.4s, v6.s[1]\n"
169 "fmla v29.4s, v19.4s, v9.s[1]\n"
170 "fmla v30.4s, v19.4s, v12.s[1]\n"
171 "fmla v31.4s, v19.4s, v15.s[1]\n"
172 "fmla v26.4s, v20.4s, v0.s[2]\n"
173 "fmla v27.4s, v20.4s, v3.s[2]\n"
174 "fmla v28.4s, v20.4s, v6.s[2]\n"
175 "fmla v29.4s, v20.4s, v9.s[2]\n"
176 "fmla v30.4s, v20.4s, v12.s[2]\n"
177 "fmla v31.4s, v20.4s, v15.s[2]\n"
178 "fmla v26.4s, v21.4s, v0.s[3]\n"
179 "fmla v27.4s, v21.4s, v3.s[3]\n"
180 "fmla v28.4s, v21.4s, v6.s[3]\n"
181 "fmla v29.4s, v21.4s, v9.s[3]\n"
182 "fmla v30.4s, v21.4s, v12.s[3]\n"
183 "fmla v31.4s, v21.4s, v15.s[3]\n"
184 "fmla v26.4s, v22.4s, v1.s[0]\n"
185 "fmla v27.4s, v22.4s, v4.s[0]\n"
186 "fmla v28.4s, v22.4s, v7.s[0]\n"
187 "fmla v29.4s, v22.4s, v10.s[0]\n"
188 "fmla v30.4s, v22.4s, v13.s[0]\n"
189 "fmla v31.4s, v22.4s, v16.s[0]\n"
190 "fmla v26.4s, v23.4s, v1.s[1]\n"
191 "fmla v27.4s, v23.4s, v4.s[1]\n"
192 "fmla v28.4s, v23.4s, v7.s[1]\n"
193 "fmla v29.4s, v23.4s, v10.s[1]\n"
194 "fmla v30.4s, v23.4s, v13.s[1]\n"
195 "fmla v31.4s, v23.4s, v16.s[1]\n"
196 "fmla v26.4s, v24.4s, v1.s[2]\n"
197 "fmla v27.4s, v24.4s, v4.s[2]\n"
198 "fmla v28.4s, v24.4s, v7.s[2]\n"
199 "fmla v29.4s, v24.4s, v10.s[2]\n"
200 "fmla v30.4s, v24.4s, v13.s[2]\n"
201 "fmla v31.4s, v24.4s, v16.s[2]\n"
202 "fmla v26.4s, v25.4s, v1.s[3]\n"
203 "fmla v27.4s, v25.4s, v4.s[3]\n"
204 "fmla v28.4s, v25.4s, v7.s[3]\n"
205 "fmla v29.4s, v25.4s, v10.s[3]\n"
206 "fmla v30.4s, v25.4s, v13.s[3]\n"
207 "fmla v31.4s, v25.4s, v16.s[3]\n"
208 "fmla v26.4s, v18.4s, v2.s[0]\n"
209 "fmla v27.4s, v18.4s, v5.s[0]\n"
210 "fmla v28.4s, v18.4s, v8.s[0]\n"
211 "fmla v29.4s, v18.4s, v11.s[0]\n"
212 "fmla v30.4s, v18.4s, v14.s[0]\n"
213 "fmla v31.4s, v18.4s, v17.s[0]\n"
214 "b.eq 3f\n"
215 "4:\n"
216 "ld1r {v24.4s}, [%[minptr]]\n"
217 "subs %[loops], %[loops], #0x1\n"
218 "ld1r {v25.4s}, [%[maxptr]]\n"
219 "ldr q18, [%[b_ptr0]]\n"
220 "fmax v26.4s, v26.4s, v24.4s\n"
221 "ldr q19, [%[b_ptr0], #0x10]\n"
222 "fmax v27.4s, v27.4s, v24.4s\n"
223 "ldr q20, [%[b_ptr0], #0x20]\n"
224 "fmax v28.4s, v28.4s, v24.4s\n"
225 "ldr q21, [%[b_ptr0], #0x30]\n"
226 "fmax v29.4s, v29.4s, v24.4s\n"
227 "ldr q22, [%[b_ptr0], #0x40]\n"
228 "fmin v26.4s, v26.4s, v25.4s\n"
229 "ldr q23, [%[b_ptr0], #0x50]\n"
230 "fmin v27.4s, v27.4s, v25.4s\n"
231 "fmin v28.4s, v28.4s, v25.4s\n"
232 "fmin v29.4s, v29.4s, v25.4s\n"
233 "str q26, [%[c_ptr0]]\n"
234 "fmax v30.4s, v30.4s, v24.4s\n"
235 "ldr q26, [%[biasptr]]\n"
236 "fmax v31.4s, v31.4s, v24.4s\n"
237 "ldr q24, [%[b_ptr0], #0x60]\n"
238 "add %[c_ptr0], %[c_ptr0], #0x10\n"
239 "str q27, [c_ptr1]\n"
240 "add c_ptr1, c_ptr1, #0x10\n"
241 "fmin v30.4s, v30.4s, v25.4s\n"
242 "add %[biasptr], %[biasptr], %[biasinc]\n"
243 "fmin v31.4s, v31.4s, v25.4s\n"
244 "str q28, [c_ptr2]\n"
245 "mov v27.16b, v26.16b\n"
246 "ldr q25, [%[b_ptr0], #0x70]\n"
247 "mov v28.16b, v26.16b\n"
248 "add c_ptr2, c_ptr2, #0x10\n"
249 "str q29, [c_ptr3]\n"
250 "add c_ptr3, c_ptr3, #0x10\n"
251 "mov v29.16b, v26.16b\n"
252 "add %[b_ptr0], %[b_ptr0], #0x80\n"
253 "fmla v27.4s, v18.4s, v3.s[0]\n"
254 "str q30, [c_ptr4]\n"
255 "mov v30.16b, v26.16b\n"
256 "add c_ptr4, c_ptr4, #0x10\n"
257 "fmla v28.4s, v18.4s, v6.s[0]\n"
258 "str q31, [c_ptr5]\n"
259 "mov v31.16b, v26.16b\n"
260 "add c_ptr5, c_ptr5, #0x10\n"
261 "fmla v26.4s, v18.4s, v0.s[0]\n"
262 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
263 "fmla v29.4s, v18.4s, v9.s[0]\n"
264 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
265 "fmla v30.4s, v18.4s, v12.s[0]\n"
266 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
267 "fmla v31.4s, v18.4s, v15.s[0]\n"
268 "ldr q18, [%[b_ptr0]]\n"
269 "fmla v26.4s, v19.4s, v0.s[1]\n"
270 "add %[b_ptr0], %[b_ptr0], #0x10\n"
271 "fmla v27.4s, v19.4s, v3.s[1]\n"
272 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
273 "fmla v28.4s, v19.4s, v6.s[1]\n"
274 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
275 "fmla v29.4s, v19.4s, v9.s[1]\n"
276 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
277 "fmla v30.4s, v19.4s, v12.s[1]\n"
278 "fmla v31.4s, v19.4s, v15.s[1]\n"
279 "fmla v26.4s, v20.4s, v0.s[2]\n"
280 "fmla v27.4s, v20.4s, v3.s[2]\n"
281 "fmla v28.4s, v20.4s, v6.s[2]\n"
282 "fmla v29.4s, v20.4s, v9.s[2]\n"
283 "fmla v30.4s, v20.4s, v12.s[2]\n"
284 "fmla v31.4s, v20.4s, v15.s[2]\n"
285 "fmla v26.4s, v21.4s, v0.s[3]\n"
286 "fmla v27.4s, v21.4s, v3.s[3]\n"
287 "fmla v28.4s, v21.4s, v6.s[3]\n"
288 "fmla v29.4s, v21.4s, v9.s[3]\n"
289 "fmla v30.4s, v21.4s, v12.s[3]\n"
290 "fmla v31.4s, v21.4s, v15.s[3]\n"
291 "fmla v26.4s, v22.4s, v1.s[0]\n"
292 "fmla v27.4s, v22.4s, v4.s[0]\n"
293 "fmla v28.4s, v22.4s, v7.s[0]\n"
294 "fmla v29.4s, v22.4s, v10.s[0]\n"
295 "fmla v30.4s, v22.4s, v13.s[0]\n"
296 "fmla v31.4s, v22.4s, v16.s[0]\n"
297 "fmla v26.4s, v23.4s, v1.s[1]\n"
298 "fmla v27.4s, v23.4s, v4.s[1]\n"
299 "fmla v28.4s, v23.4s, v7.s[1]\n"
300 "fmla v29.4s, v23.4s, v10.s[1]\n"
301 "fmla v30.4s, v23.4s, v13.s[1]\n"
302 "fmla v31.4s, v23.4s, v16.s[1]\n"
303 "fmla v26.4s, v24.4s, v1.s[2]\n"
304 "fmla v27.4s, v24.4s, v4.s[2]\n"
305 "fmla v28.4s, v24.4s, v7.s[2]\n"
306 "fmla v29.4s, v24.4s, v10.s[2]\n"
307 "fmla v30.4s, v24.4s, v13.s[2]\n"
308 "fmla v31.4s, v24.4s, v16.s[2]\n"
309 "fmla v26.4s, v25.4s, v1.s[3]\n"
310 "fmla v27.4s, v25.4s, v4.s[3]\n"
311 "fmla v28.4s, v25.4s, v7.s[3]\n"
312 "fmla v29.4s, v25.4s, v10.s[3]\n"
313 "fmla v30.4s, v25.4s, v13.s[3]\n"
314 "fmla v31.4s, v25.4s, v16.s[3]\n"
315 "fmla v26.4s, v18.4s, v2.s[0]\n"
316 "fmla v27.4s, v18.4s, v5.s[0]\n"
317 "fmla v28.4s, v18.4s, v8.s[0]\n"
318 "fmla v29.4s, v18.4s, v11.s[0]\n"
319 "fmla v30.4s, v18.4s, v14.s[0]\n"
320 "fmla v31.4s, v18.4s, v17.s[0]\n"
321 "b.ne 4b\n"
322 "3:\n"
323 "ld1r {v24.4s}, [%[minptr]]\n"
324 "ld1r {v25.4s}, [%[maxptr]]\n"
325 "ldr q18, [%[b_ptr0]]\n"
326 "ldr q19, [%[b_ptr0], #0x10]\n"
327 "fmax v26.4s, v26.4s, v24.4s\n"
328 "ldr q20, [%[b_ptr0], #0x20]\n"
329 "fmax v27.4s, v27.4s, v24.4s\n"
330 "ldr q21, [%[b_ptr0], #0x30]\n"
331 "fmax v28.4s, v28.4s, v24.4s\n"
332 "ldr q22, [%[b_ptr0], #0x40]\n"
333 "fmax v29.4s, v29.4s, v24.4s\n"
334 "ldr q23, [%[b_ptr0], #0x50]\n"
335 "fmin v26.4s, v26.4s, v25.4s\n"
336 "fmin v27.4s, v27.4s, v25.4s\n"
337 "fmin v28.4s, v28.4s, v25.4s\n"
338 "fmin v29.4s, v29.4s, v25.4s\n"
339 "str q26, [%[c_ptr0]]\n"
340 "fmax v30.4s, v30.4s, v24.4s\n"
341 "ldr q26, [%[biasptr]]\n"
342 "fmax v31.4s, v31.4s, v24.4s\n"
343 "ldr q24, [%[b_ptr0], #0x60]\n"
344 "add %[c_ptr0], %[c_ptr0], #0x10\n"
345 "str q27, [c_ptr1]\n"
346 "add c_ptr1, c_ptr1, #0x10\n"
347 "fmin v30.4s, v30.4s, v25.4s\n"
348 "add %[biasptr], %[biasptr], %[biasinc]\n"
349 "fmin v31.4s, v31.4s, v25.4s\n"
350 "str q28, [c_ptr2]\n"
351 "mov v27.16b, v26.16b\n"
352 "ldr q25, [%[b_ptr0], #0x70]\n"
353 "mov v28.16b, v26.16b\n"
354 "add c_ptr2, c_ptr2, #0x10\n"
355 "str q29, [c_ptr3]\n"
356 "add c_ptr3, c_ptr3, #0x10\n"
357 "mov v29.16b, v26.16b\n"
358 "add %[b_ptr0], %[b_ptr0], #0x80\n"
359 "fmla v27.4s, v18.4s, v3.s[0]\n"
360 "str q30, [c_ptr4]\n"
361 "mov v30.16b, v26.16b\n"
362 "add c_ptr4, c_ptr4, #0x10\n"
363 "fmla v28.4s, v18.4s, v6.s[0]\n"
364 "str q31, [c_ptr5]\n"
365 "mov v31.16b, v26.16b\n"
366 "add c_ptr5, c_ptr5, #0x10\n"
367 "fmla v26.4s, v18.4s, v0.s[0]\n"
368 "fmla v29.4s, v18.4s, v9.s[0]\n"
369 "fmla v30.4s, v18.4s, v12.s[0]\n"
370 "fmla v31.4s, v18.4s, v15.s[0]\n"
371 "ldr q18, [%[b_ptr0]]\n"
372 "fmla v26.4s, v19.4s, v0.s[1]\n"
373 "add %[b_ptr0], %[b_ptr0], #0x10\n"
374 "fmla v27.4s, v19.4s, v3.s[1]\n"
375 "fmla v28.4s, v19.4s, v6.s[1]\n"
376 "fmla v29.4s, v19.4s, v9.s[1]\n"
377 "fmla v30.4s, v19.4s, v12.s[1]\n"
378 "fmla v31.4s, v19.4s, v15.s[1]\n"
379 "fmla v26.4s, v20.4s, v0.s[2]\n"
380 "fmla v27.4s, v20.4s, v3.s[2]\n"
381 "fmla v28.4s, v20.4s, v6.s[2]\n"
382 "fmla v29.4s, v20.4s, v9.s[2]\n"
383 "fmla v30.4s, v20.4s, v12.s[2]\n"
384 "fmla v31.4s, v20.4s, v15.s[2]\n"
385 "fmla v26.4s, v21.4s, v0.s[3]\n"
386 "fmla v27.4s, v21.4s, v3.s[3]\n"
387 "fmla v28.4s, v21.4s, v6.s[3]\n"
388 "fmla v29.4s, v21.4s, v9.s[3]\n"
389 "fmla v30.4s, v21.4s, v12.s[3]\n"
390 "fmla v31.4s, v21.4s, v15.s[3]\n"
391 "fmla v26.4s, v22.4s, v1.s[0]\n"
392 "fmla v27.4s, v22.4s, v4.s[0]\n"
393 "fmla v28.4s, v22.4s, v7.s[0]\n"
394 "fmla v29.4s, v22.4s, v10.s[0]\n"
395 "fmla v30.4s, v22.4s, v13.s[0]\n"
396 "fmla v31.4s, v22.4s, v16.s[0]\n"
397 "fmla v26.4s, v23.4s, v1.s[1]\n"
398 "fmla v27.4s, v23.4s, v4.s[1]\n"
399 "fmla v28.4s, v23.4s, v7.s[1]\n"
400 "fmla v29.4s, v23.4s, v10.s[1]\n"
401 "fmla v30.4s, v23.4s, v13.s[1]\n"
402 "fmla v31.4s, v23.4s, v16.s[1]\n"
403 "fmla v26.4s, v24.4s, v1.s[2]\n"
404 "fmla v27.4s, v24.4s, v4.s[2]\n"
405 "fmla v28.4s, v24.4s, v7.s[2]\n"
406 "fmla v29.4s, v24.4s, v10.s[2]\n"
407 "fmla v30.4s, v24.4s, v13.s[2]\n"
408 "fmla v31.4s, v24.4s, v16.s[2]\n"
409 "fmla v26.4s, v25.4s, v1.s[3]\n"
410 "fmla v27.4s, v25.4s, v4.s[3]\n"
411 "fmla v28.4s, v25.4s, v7.s[3]\n"
412 "fmla v29.4s, v25.4s, v10.s[3]\n"
413 "fmla v30.4s, v25.4s, v13.s[3]\n"
414 "fmla v31.4s, v25.4s, v16.s[3]\n"
415 "fmla v26.4s, v18.4s, v2.s[0]\n"
416 "fmla v27.4s, v18.4s, v5.s[0]\n"
417 "fmla v28.4s, v18.4s, v8.s[0]\n"
418 "fmla v29.4s, v18.4s, v11.s[0]\n"
419 "fmla v30.4s, v18.4s, v14.s[0]\n"
420 "fmla v31.4s, v18.4s, v17.s[0]\n"
421 "b 5f\n"
422 "2:\n"
423 "ldr q26, [%[biasptr]]\n"
424 "add %[biasptr], %[biasptr], %[biasinc]\n"
425 "mov v27.16b, v26.16b\n"
426 "mov v28.16b, v26.16b\n"
427 "mov v29.16b, v26.16b\n"
428 "mov v30.16b, v26.16b\n"
429 "mov v31.16b, v26.16b\n"
430 "fmla v26.4s, v18.4s, v0.s[0]\n"
431 "fmla v27.4s, v18.4s, v3.s[0]\n"
432 "fmla v28.4s, v18.4s, v6.s[0]\n"
433 "fmla v29.4s, v18.4s, v9.s[0]\n"
434 "fmla v30.4s, v18.4s, v12.s[0]\n"
435 "fmla v31.4s, v18.4s, v15.s[0]\n"
436 "ldr q18, [%[b_ptr0]]\n"
437 "fmla v26.4s, v19.4s, v0.s[1]\n"
438 "add %[b_ptr0], %[b_ptr0], #0x10\n"
439 "fmla v27.4s, v19.4s, v3.s[1]\n"
440 "fmla v28.4s, v19.4s, v6.s[1]\n"
441 "fmla v29.4s, v19.4s, v9.s[1]\n"
442 "fmla v30.4s, v19.4s, v12.s[1]\n"
443 "fmla v31.4s, v19.4s, v15.s[1]\n"
444 "fmla v26.4s, v20.4s, v0.s[2]\n"
445 "fmla v27.4s, v20.4s, v3.s[2]\n"
446 "fmla v28.4s, v20.4s, v6.s[2]\n"
447 "fmla v29.4s, v20.4s, v9.s[2]\n"
448 "fmla v30.4s, v20.4s, v12.s[2]\n"
449 "fmla v31.4s, v20.4s, v15.s[2]\n"
450 "fmla v26.4s, v21.4s, v0.s[3]\n"
451 "fmla v27.4s, v21.4s, v3.s[3]\n"
452 "fmla v28.4s, v21.4s, v6.s[3]\n"
453 "fmla v29.4s, v21.4s, v9.s[3]\n"
454 "fmla v30.4s, v21.4s, v12.s[3]\n"
455 "fmla v31.4s, v21.4s, v15.s[3]\n"
456 "fmla v26.4s, v22.4s, v1.s[0]\n"
457 "fmla v27.4s, v22.4s, v4.s[0]\n"
458 "fmla v28.4s, v22.4s, v7.s[0]\n"
459 "fmla v29.4s, v22.4s, v10.s[0]\n"
460 "fmla v30.4s, v22.4s, v13.s[0]\n"
461 "fmla v31.4s, v22.4s, v16.s[0]\n"
462 "fmla v26.4s, v23.4s, v1.s[1]\n"
463 "fmla v27.4s, v23.4s, v4.s[1]\n"
464 "fmla v28.4s, v23.4s, v7.s[1]\n"
465 "fmla v29.4s, v23.4s, v10.s[1]\n"
466 "fmla v30.4s, v23.4s, v13.s[1]\n"
467 "fmla v31.4s, v23.4s, v16.s[1]\n"
468 "fmla v26.4s, v24.4s, v1.s[2]\n"
469 "fmla v27.4s, v24.4s, v4.s[2]\n"
470 "fmla v28.4s, v24.4s, v7.s[2]\n"
471 "fmla v29.4s, v24.4s, v10.s[2]\n"
472 "fmla v30.4s, v24.4s, v13.s[2]\n"
473 "fmla v31.4s, v24.4s, v16.s[2]\n"
474 "fmla v26.4s, v25.4s, v1.s[3]\n"
475 "fmla v27.4s, v25.4s, v4.s[3]\n"
476 "fmla v28.4s, v25.4s, v7.s[3]\n"
477 "fmla v29.4s, v25.4s, v10.s[3]\n"
478 "fmla v30.4s, v25.4s, v13.s[3]\n"
479 "fmla v31.4s, v25.4s, v16.s[3]\n"
480 "fmla v26.4s, v18.4s, v2.s[0]\n"
481 "fmla v27.4s, v18.4s, v5.s[0]\n"
482 "fmla v28.4s, v18.4s, v8.s[0]\n"
483 "fmla v29.4s, v18.4s, v11.s[0]\n"
484 "fmla v30.4s, v18.4s, v14.s[0]\n"
485 "fmla v31.4s, v18.4s, v17.s[0]\n"
486 "5:\n"
487 "ld1r {v24.4s}, [%[minptr]]\n"
488 "ld1r {v25.4s}, [%[maxptr]]\n"
489 "fmax v26.4s, v26.4s, v24.4s\n"
490 "fmax v27.4s, v27.4s, v24.4s\n"
491 "fmax v28.4s, v28.4s, v24.4s\n"
492 "fmax v29.4s, v29.4s, v24.4s\n"
493 "fmin v26.4s, v26.4s, v25.4s\n"
494 "fmin v27.4s, v27.4s, v25.4s\n"
495 "fmin v28.4s, v28.4s, v25.4s\n"
496 "fmin v29.4s, v29.4s, v25.4s\n"
497 "str q26, [%[c_ptr0]]\n"
498 "fmax v30.4s, v30.4s, v24.4s\n"
499 "add %[c_ptr0], %[c_ptr0], #0x10\n"
500 "fmax v31.4s, v31.4s, v24.4s\n"
501 "str q27, [c_ptr1]\n"
502 "fmin v30.4s, v30.4s, v25.4s\n"
503 "fmin v31.4s, v31.4s, v25.4s\n"
504 "str q28, [c_ptr2]\n"
505 "str q29, [c_ptr3]\n"
506 "str q30, [c_ptr4]\n"
507 "str q31, [c_ptr5]\n"
508 ".unreq a_ptr1\n"
509 ".unreq a_ptr2\n"
510 ".unreq a_ptr3\n"
511 ".unreq a_ptr4\n"
512 ".unreq a_ptr5\n"
513 ".unreq c_ptr1\n"
514 ".unreq c_ptr2\n"
515 ".unreq c_ptr3\n"
516 ".unreq c_ptr4\n"
517 ".unreq c_ptr5\n"
518 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
519 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
520 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
521 );
522 break;
523 case 10:
524 __asm __volatile (
525 "a_ptr1 .req X0\n"
526 "a_ptr2 .req X1\n"
527 "a_ptr3 .req X2\n"
528 "a_ptr4 .req X3\n"
529 "a_ptr5 .req X4\n"
530 "c_ptr1 .req X5\n"
531 "c_ptr2 .req X6\n"
532 "c_ptr3 .req X7\n"
533 "c_ptr4 .req X8\n"
534 "c_ptr5 .req X9\n"
535 "add a_ptr1, %[a_ptr0], %[lda]\n"
536 "add c_ptr1, %[c_ptr0], %[ldc]\n"
537 "add a_ptr2, a_ptr1, %[lda]\n"
538 "add c_ptr2, c_ptr1, %[ldc]\n"
539 "add a_ptr3, a_ptr2, %[lda]\n"
540 "add c_ptr3, c_ptr2, %[ldc]\n"
541 "add a_ptr4, a_ptr3, %[lda]\n"
542 "add c_ptr4, c_ptr3, %[ldc]\n"
543 "add a_ptr5, a_ptr4, %[lda]\n"
544 "add c_ptr5, c_ptr4, %[ldc]\n"
545 "cbz %[oob_rows], 1f\n"
546 "subs %[oob_rows], %[oob_rows], #0x1\n"
547 "add c_ptr5, %[c_ptr0], #0x0\n"
548 "add a_ptr5, %[a_ptr0], #0x0\n"
549 "b.eq 1f\n"
550 "subs %[oob_rows], %[oob_rows], #0x1\n"
551 "add c_ptr4, %[c_ptr0], #0x0\n"
552 "add a_ptr4, %[a_ptr0], #0x0\n"
553 "b.eq 1f\n"
554 "subs %[oob_rows], %[oob_rows], #0x1\n"
555 "add c_ptr3, %[c_ptr0], #0x0\n"
556 "add a_ptr3, %[a_ptr0], #0x0\n"
557 "b.eq 1f\n"
558 "subs %[oob_rows], %[oob_rows], #0x1\n"
559 "add c_ptr2, %[c_ptr0], #0x0\n"
560 "add a_ptr2, %[a_ptr0], #0x0\n"
561 "b.eq 1f\n"
562 "subs %[oob_rows], %[oob_rows], #0x1\n"
563 "add c_ptr1, %[c_ptr0], #0x0\n"
564 "add a_ptr1, %[a_ptr0], #0x0\n"
565 "1:\n"
566 "ldr q0, [%[a_ptr0]], #0x10\n"
567 "ldr q3, [a_ptr1], #0x10\n"
568 "ldr q6, [a_ptr2], #0x10\n"
569 "ldr q9, [a_ptr3], #0x10\n"
570 "ldr q12, [a_ptr4], #0x10\n"
571 "ldr q15, [a_ptr5], #0x10\n"
572 "ldr q1, [%[a_ptr0]], #0x10\n"
573 "ldr q4, [a_ptr1], #0x10\n"
574 "ldr q7, [a_ptr2], #0x10\n"
575 "ldr q10, [a_ptr3], #0x10\n"
576 "ldr d2, [%[a_ptr0]]\n"
577 "ldr q13, [a_ptr4], #0x10\n"
578 "ldr d5, [a_ptr1]\n"
579 "ldr q16, [a_ptr5], #0x10\n"
580 "ldr d8, [a_ptr2]\n"
581 "ldr q18, [%[b_ptr0]]\n"
582 "ldr d11, [a_ptr3]\n"
583 "ldr q19, [%[b_ptr0], #0x10]\n"
584 "ldr d14, [a_ptr4]\n"
585 "ldr q20, [%[b_ptr0], #0x20]\n"
586 "ldr d17, [a_ptr5]\n"
587 "ldr q21, [%[b_ptr0], #0x30]\n"
588 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
589 "ldr q22, [%[b_ptr0], #0x40]\n"
590 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
591 "ldr q23, [%[b_ptr0], #0x50]\n"
592 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
593 "ldr q24, [%[b_ptr0], #0x60]\n"
594 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
595 "ldr q25, [%[b_ptr0], #0x70]\n"
596 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
597 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
598 "add %[b_ptr0], %[b_ptr0], #0x80\n"
599 "cbz %[loops], 2f\n"
600 "ldr q26, [%[biasptr]]\n"
601 "add %[biasptr], %[biasptr], %[biasinc]\n"
602 "subs %[loops], %[loops], #0x1\n"
603 "mov v27.16b, v26.16b\n"
604 "mov v28.16b, v26.16b\n"
605 "mov v29.16b, v26.16b\n"
606 "mov v30.16b, v26.16b\n"
607 "mov v31.16b, v26.16b\n"
608 "fmla v26.4s, v18.4s, v0.s[0]\n"
609 "fmla v27.4s, v18.4s, v3.s[0]\n"
610 "fmla v28.4s, v18.4s, v6.s[0]\n"
611 "fmla v29.4s, v18.4s, v9.s[0]\n"
612 "fmla v30.4s, v18.4s, v12.s[0]\n"
613 "fmla v31.4s, v18.4s, v15.s[0]\n"
614 "ldr q18, [%[b_ptr0]]\n"
615 "fmla v26.4s, v19.4s, v0.s[1]\n"
616 "fmla v27.4s, v19.4s, v3.s[1]\n"
617 "fmla v28.4s, v19.4s, v6.s[1]\n"
618 "fmla v29.4s, v19.4s, v9.s[1]\n"
619 "fmla v30.4s, v19.4s, v12.s[1]\n"
620 "fmla v31.4s, v19.4s, v15.s[1]\n"
621 "ldr q19, [%[b_ptr0], #0x10]\n"
622 "fmla v26.4s, v20.4s, v0.s[2]\n"
623 "add %[b_ptr0], %[b_ptr0], #0x20\n"
624 "fmla v27.4s, v20.4s, v3.s[2]\n"
625 "fmla v28.4s, v20.4s, v6.s[2]\n"
626 "fmla v29.4s, v20.4s, v9.s[2]\n"
627 "fmla v30.4s, v20.4s, v12.s[2]\n"
628 "fmla v31.4s, v20.4s, v15.s[2]\n"
629 "fmla v26.4s, v21.4s, v0.s[3]\n"
630 "fmla v27.4s, v21.4s, v3.s[3]\n"
631 "fmla v28.4s, v21.4s, v6.s[3]\n"
632 "fmla v29.4s, v21.4s, v9.s[3]\n"
633 "fmla v30.4s, v21.4s, v12.s[3]\n"
634 "fmla v31.4s, v21.4s, v15.s[3]\n"
635 "fmla v26.4s, v22.4s, v1.s[0]\n"
636 "fmla v27.4s, v22.4s, v4.s[0]\n"
637 "fmla v28.4s, v22.4s, v7.s[0]\n"
638 "fmla v29.4s, v22.4s, v10.s[0]\n"
639 "fmla v30.4s, v22.4s, v13.s[0]\n"
640 "fmla v31.4s, v22.4s, v16.s[0]\n"
641 "fmla v26.4s, v23.4s, v1.s[1]\n"
642 "fmla v27.4s, v23.4s, v4.s[1]\n"
643 "fmla v28.4s, v23.4s, v7.s[1]\n"
644 "fmla v29.4s, v23.4s, v10.s[1]\n"
645 "fmla v30.4s, v23.4s, v13.s[1]\n"
646 "fmla v31.4s, v23.4s, v16.s[1]\n"
647 "fmla v26.4s, v24.4s, v1.s[2]\n"
648 "fmla v27.4s, v24.4s, v4.s[2]\n"
649 "fmla v28.4s, v24.4s, v7.s[2]\n"
650 "fmla v29.4s, v24.4s, v10.s[2]\n"
651 "fmla v30.4s, v24.4s, v13.s[2]\n"
652 "fmla v31.4s, v24.4s, v16.s[2]\n"
653 "fmla v26.4s, v25.4s, v1.s[3]\n"
654 "fmla v27.4s, v25.4s, v4.s[3]\n"
655 "fmla v28.4s, v25.4s, v7.s[3]\n"
656 "fmla v29.4s, v25.4s, v10.s[3]\n"
657 "fmla v30.4s, v25.4s, v13.s[3]\n"
658 "fmla v31.4s, v25.4s, v16.s[3]\n"
659 "fmla v26.4s, v18.4s, v2.s[0]\n"
660 "fmla v27.4s, v18.4s, v5.s[0]\n"
661 "fmla v28.4s, v18.4s, v8.s[0]\n"
662 "fmla v29.4s, v18.4s, v11.s[0]\n"
663 "fmla v30.4s, v18.4s, v14.s[0]\n"
664 "fmla v31.4s, v18.4s, v17.s[0]\n"
665 "fmla v26.4s, v19.4s, v2.s[1]\n"
666 "fmla v27.4s, v19.4s, v5.s[1]\n"
667 "fmla v28.4s, v19.4s, v8.s[1]\n"
668 "fmla v29.4s, v19.4s, v11.s[1]\n"
669 "fmla v30.4s, v19.4s, v14.s[1]\n"
670 "fmla v31.4s, v19.4s, v17.s[1]\n"
671 "b.eq 3f\n"
672 "4:\n"
673 "ld1r {v24.4s}, [%[minptr]]\n"
674 "subs %[loops], %[loops], #0x1\n"
675 "ld1r {v25.4s}, [%[maxptr]]\n"
676 "ldr q18, [%[b_ptr0]]\n"
677 "fmax v26.4s, v26.4s, v24.4s\n"
678 "ldr q19, [%[b_ptr0], #0x10]\n"
679 "fmax v27.4s, v27.4s, v24.4s\n"
680 "ldr q20, [%[b_ptr0], #0x20]\n"
681 "fmax v28.4s, v28.4s, v24.4s\n"
682 "ldr q21, [%[b_ptr0], #0x30]\n"
683 "fmax v29.4s, v29.4s, v24.4s\n"
684 "ldr q22, [%[b_ptr0], #0x40]\n"
685 "fmin v26.4s, v26.4s, v25.4s\n"
686 "ldr q23, [%[b_ptr0], #0x50]\n"
687 "fmin v27.4s, v27.4s, v25.4s\n"
688 "fmin v28.4s, v28.4s, v25.4s\n"
689 "fmin v29.4s, v29.4s, v25.4s\n"
690 "str q26, [%[c_ptr0]]\n"
691 "fmax v30.4s, v30.4s, v24.4s\n"
692 "ldr q26, [%[biasptr]]\n"
693 "fmax v31.4s, v31.4s, v24.4s\n"
694 "ldr q24, [%[b_ptr0], #0x60]\n"
695 "add %[c_ptr0], %[c_ptr0], #0x10\n"
696 "str q27, [c_ptr1]\n"
697 "add c_ptr1, c_ptr1, #0x10\n"
698 "fmin v30.4s, v30.4s, v25.4s\n"
699 "add %[biasptr], %[biasptr], %[biasinc]\n"
700 "fmin v31.4s, v31.4s, v25.4s\n"
701 "str q28, [c_ptr2]\n"
702 "mov v27.16b, v26.16b\n"
703 "ldr q25, [%[b_ptr0], #0x70]\n"
704 "mov v28.16b, v26.16b\n"
705 "add c_ptr2, c_ptr2, #0x10\n"
706 "str q29, [c_ptr3]\n"
707 "add c_ptr3, c_ptr3, #0x10\n"
708 "mov v29.16b, v26.16b\n"
709 "add %[b_ptr0], %[b_ptr0], #0x80\n"
710 "fmla v27.4s, v18.4s, v3.s[0]\n"
711 "str q30, [c_ptr4]\n"
712 "mov v30.16b, v26.16b\n"
713 "add c_ptr4, c_ptr4, #0x10\n"
714 "fmla v28.4s, v18.4s, v6.s[0]\n"
715 "str q31, [c_ptr5]\n"
716 "mov v31.16b, v26.16b\n"
717 "add c_ptr5, c_ptr5, #0x10\n"
718 "fmla v26.4s, v18.4s, v0.s[0]\n"
719 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
720 "fmla v29.4s, v18.4s, v9.s[0]\n"
721 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
722 "fmla v30.4s, v18.4s, v12.s[0]\n"
723 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
724 "fmla v31.4s, v18.4s, v15.s[0]\n"
725 "ldr q18, [%[b_ptr0]]\n"
726 "fmla v26.4s, v19.4s, v0.s[1]\n"
727 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
728 "fmla v27.4s, v19.4s, v3.s[1]\n"
729 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
730 "fmla v28.4s, v19.4s, v6.s[1]\n"
731 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
732 "fmla v29.4s, v19.4s, v9.s[1]\n"
733 "fmla v30.4s, v19.4s, v12.s[1]\n"
734 "fmla v31.4s, v19.4s, v15.s[1]\n"
735 "ldr q19, [%[b_ptr0], #0x10]\n"
736 "fmla v26.4s, v20.4s, v0.s[2]\n"
737 "add %[b_ptr0], %[b_ptr0], #0x20\n"
738 "fmla v27.4s, v20.4s, v3.s[2]\n"
739 "fmla v28.4s, v20.4s, v6.s[2]\n"
740 "fmla v29.4s, v20.4s, v9.s[2]\n"
741 "fmla v30.4s, v20.4s, v12.s[2]\n"
742 "fmla v31.4s, v20.4s, v15.s[2]\n"
743 "fmla v26.4s, v21.4s, v0.s[3]\n"
744 "fmla v27.4s, v21.4s, v3.s[3]\n"
745 "fmla v28.4s, v21.4s, v6.s[3]\n"
746 "fmla v29.4s, v21.4s, v9.s[3]\n"
747 "fmla v30.4s, v21.4s, v12.s[3]\n"
748 "fmla v31.4s, v21.4s, v15.s[3]\n"
749 "fmla v26.4s, v22.4s, v1.s[0]\n"
750 "fmla v27.4s, v22.4s, v4.s[0]\n"
751 "fmla v28.4s, v22.4s, v7.s[0]\n"
752 "fmla v29.4s, v22.4s, v10.s[0]\n"
753 "fmla v30.4s, v22.4s, v13.s[0]\n"
754 "fmla v31.4s, v22.4s, v16.s[0]\n"
755 "fmla v26.4s, v23.4s, v1.s[1]\n"
756 "fmla v27.4s, v23.4s, v4.s[1]\n"
757 "fmla v28.4s, v23.4s, v7.s[1]\n"
758 "fmla v29.4s, v23.4s, v10.s[1]\n"
759 "fmla v30.4s, v23.4s, v13.s[1]\n"
760 "fmla v31.4s, v23.4s, v16.s[1]\n"
761 "fmla v26.4s, v24.4s, v1.s[2]\n"
762 "fmla v27.4s, v24.4s, v4.s[2]\n"
763 "fmla v28.4s, v24.4s, v7.s[2]\n"
764 "fmla v29.4s, v24.4s, v10.s[2]\n"
765 "fmla v30.4s, v24.4s, v13.s[2]\n"
766 "fmla v31.4s, v24.4s, v16.s[2]\n"
767 "fmla v26.4s, v25.4s, v1.s[3]\n"
768 "fmla v27.4s, v25.4s, v4.s[3]\n"
769 "fmla v28.4s, v25.4s, v7.s[3]\n"
770 "fmla v29.4s, v25.4s, v10.s[3]\n"
771 "fmla v30.4s, v25.4s, v13.s[3]\n"
772 "fmla v31.4s, v25.4s, v16.s[3]\n"
773 "fmla v26.4s, v18.4s, v2.s[0]\n"
774 "fmla v27.4s, v18.4s, v5.s[0]\n"
775 "fmla v28.4s, v18.4s, v8.s[0]\n"
776 "fmla v29.4s, v18.4s, v11.s[0]\n"
777 "fmla v30.4s, v18.4s, v14.s[0]\n"
778 "fmla v31.4s, v18.4s, v17.s[0]\n"
779 "fmla v26.4s, v19.4s, v2.s[1]\n"
780 "fmla v27.4s, v19.4s, v5.s[1]\n"
781 "fmla v28.4s, v19.4s, v8.s[1]\n"
782 "fmla v29.4s, v19.4s, v11.s[1]\n"
783 "fmla v30.4s, v19.4s, v14.s[1]\n"
784 "fmla v31.4s, v19.4s, v17.s[1]\n"
785 "b.ne 4b\n"
786 "3:\n"
787 "ld1r {v24.4s}, [%[minptr]]\n"
788 "ld1r {v25.4s}, [%[maxptr]]\n"
789 "ldr q18, [%[b_ptr0]]\n"
790 "ldr q19, [%[b_ptr0], #0x10]\n"
791 "fmax v26.4s, v26.4s, v24.4s\n"
792 "ldr q20, [%[b_ptr0], #0x20]\n"
793 "fmax v27.4s, v27.4s, v24.4s\n"
794 "ldr q21, [%[b_ptr0], #0x30]\n"
795 "fmax v28.4s, v28.4s, v24.4s\n"
796 "ldr q22, [%[b_ptr0], #0x40]\n"
797 "fmax v29.4s, v29.4s, v24.4s\n"
798 "ldr q23, [%[b_ptr0], #0x50]\n"
799 "fmin v26.4s, v26.4s, v25.4s\n"
800 "fmin v27.4s, v27.4s, v25.4s\n"
801 "fmin v28.4s, v28.4s, v25.4s\n"
802 "fmin v29.4s, v29.4s, v25.4s\n"
803 "str q26, [%[c_ptr0]]\n"
804 "fmax v30.4s, v30.4s, v24.4s\n"
805 "ldr q26, [%[biasptr]]\n"
806 "fmax v31.4s, v31.4s, v24.4s\n"
807 "ldr q24, [%[b_ptr0], #0x60]\n"
808 "add %[c_ptr0], %[c_ptr0], #0x10\n"
809 "str q27, [c_ptr1]\n"
810 "add c_ptr1, c_ptr1, #0x10\n"
811 "fmin v30.4s, v30.4s, v25.4s\n"
812 "add %[biasptr], %[biasptr], %[biasinc]\n"
813 "fmin v31.4s, v31.4s, v25.4s\n"
814 "str q28, [c_ptr2]\n"
815 "mov v27.16b, v26.16b\n"
816 "ldr q25, [%[b_ptr0], #0x70]\n"
817 "mov v28.16b, v26.16b\n"
818 "add c_ptr2, c_ptr2, #0x10\n"
819 "str q29, [c_ptr3]\n"
820 "add c_ptr3, c_ptr3, #0x10\n"
821 "mov v29.16b, v26.16b\n"
822 "add %[b_ptr0], %[b_ptr0], #0x80\n"
823 "fmla v27.4s, v18.4s, v3.s[0]\n"
824 "str q30, [c_ptr4]\n"
825 "mov v30.16b, v26.16b\n"
826 "add c_ptr4, c_ptr4, #0x10\n"
827 "fmla v28.4s, v18.4s, v6.s[0]\n"
828 "str q31, [c_ptr5]\n"
829 "mov v31.16b, v26.16b\n"
830 "add c_ptr5, c_ptr5, #0x10\n"
831 "fmla v26.4s, v18.4s, v0.s[0]\n"
832 "fmla v29.4s, v18.4s, v9.s[0]\n"
833 "fmla v30.4s, v18.4s, v12.s[0]\n"
834 "fmla v31.4s, v18.4s, v15.s[0]\n"
835 "ldr q18, [%[b_ptr0]]\n"
836 "fmla v26.4s, v19.4s, v0.s[1]\n"
837 "fmla v27.4s, v19.4s, v3.s[1]\n"
838 "fmla v28.4s, v19.4s, v6.s[1]\n"
839 "fmla v29.4s, v19.4s, v9.s[1]\n"
840 "fmla v30.4s, v19.4s, v12.s[1]\n"
841 "fmla v31.4s, v19.4s, v15.s[1]\n"
842 "ldr q19, [%[b_ptr0], #0x10]\n"
843 "fmla v26.4s, v20.4s, v0.s[2]\n"
844 "add %[b_ptr0], %[b_ptr0], #0x20\n"
845 "fmla v27.4s, v20.4s, v3.s[2]\n"
846 "fmla v28.4s, v20.4s, v6.s[2]\n"
847 "fmla v29.4s, v20.4s, v9.s[2]\n"
848 "fmla v30.4s, v20.4s, v12.s[2]\n"
849 "fmla v31.4s, v20.4s, v15.s[2]\n"
850 "fmla v26.4s, v21.4s, v0.s[3]\n"
851 "fmla v27.4s, v21.4s, v3.s[3]\n"
852 "fmla v28.4s, v21.4s, v6.s[3]\n"
853 "fmla v29.4s, v21.4s, v9.s[3]\n"
854 "fmla v30.4s, v21.4s, v12.s[3]\n"
855 "fmla v31.4s, v21.4s, v15.s[3]\n"
856 "fmla v26.4s, v22.4s, v1.s[0]\n"
857 "fmla v27.4s, v22.4s, v4.s[0]\n"
858 "fmla v28.4s, v22.4s, v7.s[0]\n"
859 "fmla v29.4s, v22.4s, v10.s[0]\n"
860 "fmla v30.4s, v22.4s, v13.s[0]\n"
861 "fmla v31.4s, v22.4s, v16.s[0]\n"
862 "fmla v26.4s, v23.4s, v1.s[1]\n"
863 "fmla v27.4s, v23.4s, v4.s[1]\n"
864 "fmla v28.4s, v23.4s, v7.s[1]\n"
865 "fmla v29.4s, v23.4s, v10.s[1]\n"
866 "fmla v30.4s, v23.4s, v13.s[1]\n"
867 "fmla v31.4s, v23.4s, v16.s[1]\n"
868 "fmla v26.4s, v24.4s, v1.s[2]\n"
869 "fmla v27.4s, v24.4s, v4.s[2]\n"
870 "fmla v28.4s, v24.4s, v7.s[2]\n"
871 "fmla v29.4s, v24.4s, v10.s[2]\n"
872 "fmla v30.4s, v24.4s, v13.s[2]\n"
873 "fmla v31.4s, v24.4s, v16.s[2]\n"
874 "fmla v26.4s, v25.4s, v1.s[3]\n"
875 "fmla v27.4s, v25.4s, v4.s[3]\n"
876 "fmla v28.4s, v25.4s, v7.s[3]\n"
877 "fmla v29.4s, v25.4s, v10.s[3]\n"
878 "fmla v30.4s, v25.4s, v13.s[3]\n"
879 "fmla v31.4s, v25.4s, v16.s[3]\n"
880 "fmla v26.4s, v18.4s, v2.s[0]\n"
881 "fmla v27.4s, v18.4s, v5.s[0]\n"
882 "fmla v28.4s, v18.4s, v8.s[0]\n"
883 "fmla v29.4s, v18.4s, v11.s[0]\n"
884 "fmla v30.4s, v18.4s, v14.s[0]\n"
885 "fmla v31.4s, v18.4s, v17.s[0]\n"
886 "fmla v26.4s, v19.4s, v2.s[1]\n"
887 "fmla v27.4s, v19.4s, v5.s[1]\n"
888 "fmla v28.4s, v19.4s, v8.s[1]\n"
889 "fmla v29.4s, v19.4s, v11.s[1]\n"
890 "fmla v30.4s, v19.4s, v14.s[1]\n"
891 "fmla v31.4s, v19.4s, v17.s[1]\n"
892 "b 5f\n"
893 "2:\n"
894 "ldr q26, [%[biasptr]]\n"
895 "add %[biasptr], %[biasptr], %[biasinc]\n"
896 "mov v27.16b, v26.16b\n"
897 "mov v28.16b, v26.16b\n"
898 "mov v29.16b, v26.16b\n"
899 "mov v30.16b, v26.16b\n"
900 "mov v31.16b, v26.16b\n"
901 "fmla v26.4s, v18.4s, v0.s[0]\n"
902 "fmla v27.4s, v18.4s, v3.s[0]\n"
903 "fmla v28.4s, v18.4s, v6.s[0]\n"
904 "fmla v29.4s, v18.4s, v9.s[0]\n"
905 "fmla v30.4s, v18.4s, v12.s[0]\n"
906 "fmla v31.4s, v18.4s, v15.s[0]\n"
907 "ldr q18, [%[b_ptr0]]\n"
908 "fmla v26.4s, v19.4s, v0.s[1]\n"
909 "fmla v27.4s, v19.4s, v3.s[1]\n"
910 "fmla v28.4s, v19.4s, v6.s[1]\n"
911 "fmla v29.4s, v19.4s, v9.s[1]\n"
912 "fmla v30.4s, v19.4s, v12.s[1]\n"
913 "fmla v31.4s, v19.4s, v15.s[1]\n"
914 "ldr q19, [%[b_ptr0], #0x10]\n"
915 "fmla v26.4s, v20.4s, v0.s[2]\n"
916 "add %[b_ptr0], %[b_ptr0], #0x20\n"
917 "fmla v27.4s, v20.4s, v3.s[2]\n"
918 "fmla v28.4s, v20.4s, v6.s[2]\n"
919 "fmla v29.4s, v20.4s, v9.s[2]\n"
920 "fmla v30.4s, v20.4s, v12.s[2]\n"
921 "fmla v31.4s, v20.4s, v15.s[2]\n"
922 "fmla v26.4s, v21.4s, v0.s[3]\n"
923 "fmla v27.4s, v21.4s, v3.s[3]\n"
924 "fmla v28.4s, v21.4s, v6.s[3]\n"
925 "fmla v29.4s, v21.4s, v9.s[3]\n"
926 "fmla v30.4s, v21.4s, v12.s[3]\n"
927 "fmla v31.4s, v21.4s, v15.s[3]\n"
928 "fmla v26.4s, v22.4s, v1.s[0]\n"
929 "fmla v27.4s, v22.4s, v4.s[0]\n"
930 "fmla v28.4s, v22.4s, v7.s[0]\n"
931 "fmla v29.4s, v22.4s, v10.s[0]\n"
932 "fmla v30.4s, v22.4s, v13.s[0]\n"
933 "fmla v31.4s, v22.4s, v16.s[0]\n"
934 "fmla v26.4s, v23.4s, v1.s[1]\n"
935 "fmla v27.4s, v23.4s, v4.s[1]\n"
936 "fmla v28.4s, v23.4s, v7.s[1]\n"
937 "fmla v29.4s, v23.4s, v10.s[1]\n"
938 "fmla v30.4s, v23.4s, v13.s[1]\n"
939 "fmla v31.4s, v23.4s, v16.s[1]\n"
940 "fmla v26.4s, v24.4s, v1.s[2]\n"
941 "fmla v27.4s, v24.4s, v4.s[2]\n"
942 "fmla v28.4s, v24.4s, v7.s[2]\n"
943 "fmla v29.4s, v24.4s, v10.s[2]\n"
944 "fmla v30.4s, v24.4s, v13.s[2]\n"
945 "fmla v31.4s, v24.4s, v16.s[2]\n"
946 "fmla v26.4s, v25.4s, v1.s[3]\n"
947 "fmla v27.4s, v25.4s, v4.s[3]\n"
948 "fmla v28.4s, v25.4s, v7.s[3]\n"
949 "fmla v29.4s, v25.4s, v10.s[3]\n"
950 "fmla v30.4s, v25.4s, v13.s[3]\n"
951 "fmla v31.4s, v25.4s, v16.s[3]\n"
952 "fmla v26.4s, v18.4s, v2.s[0]\n"
953 "fmla v27.4s, v18.4s, v5.s[0]\n"
954 "fmla v28.4s, v18.4s, v8.s[0]\n"
955 "fmla v29.4s, v18.4s, v11.s[0]\n"
956 "fmla v30.4s, v18.4s, v14.s[0]\n"
957 "fmla v31.4s, v18.4s, v17.s[0]\n"
958 "fmla v26.4s, v19.4s, v2.s[1]\n"
959 "fmla v27.4s, v19.4s, v5.s[1]\n"
960 "fmla v28.4s, v19.4s, v8.s[1]\n"
961 "fmla v29.4s, v19.4s, v11.s[1]\n"
962 "fmla v30.4s, v19.4s, v14.s[1]\n"
963 "fmla v31.4s, v19.4s, v17.s[1]\n"
964 "5:\n"
965 "ld1r {v24.4s}, [%[minptr]]\n"
966 "ld1r {v25.4s}, [%[maxptr]]\n"
967 "fmax v26.4s, v26.4s, v24.4s\n"
968 "fmax v27.4s, v27.4s, v24.4s\n"
969 "fmax v28.4s, v28.4s, v24.4s\n"
970 "fmax v29.4s, v29.4s, v24.4s\n"
971 "fmin v26.4s, v26.4s, v25.4s\n"
972 "fmin v27.4s, v27.4s, v25.4s\n"
973 "fmin v28.4s, v28.4s, v25.4s\n"
974 "fmin v29.4s, v29.4s, v25.4s\n"
975 "str q26, [%[c_ptr0]]\n"
976 "fmax v30.4s, v30.4s, v24.4s\n"
977 "add %[c_ptr0], %[c_ptr0], #0x10\n"
978 "fmax v31.4s, v31.4s, v24.4s\n"
979 "str q27, [c_ptr1]\n"
980 "fmin v30.4s, v30.4s, v25.4s\n"
981 "fmin v31.4s, v31.4s, v25.4s\n"
982 "str q28, [c_ptr2]\n"
983 "str q29, [c_ptr3]\n"
984 "str q30, [c_ptr4]\n"
985 "str q31, [c_ptr5]\n"
986 ".unreq a_ptr1\n"
987 ".unreq a_ptr2\n"
988 ".unreq a_ptr3\n"
989 ".unreq a_ptr4\n"
990 ".unreq a_ptr5\n"
991 ".unreq c_ptr1\n"
992 ".unreq c_ptr2\n"
993 ".unreq c_ptr3\n"
994 ".unreq c_ptr4\n"
995 ".unreq c_ptr5\n"
996 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
997 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
998 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
999 );
1000 break;
1001 case 11:
// Variant of the kernel that consumes 11 floats from each of six A rows and
// 11 four-float B vectors per output tile, producing a 6-row x 4-column tile
// of C in v26-v31. (Presumably selected when K == 11; the switch expression
// is outside this view - confirm.)
//
// Register usage inside the asm:
//   v0-v2 / v3-v5 / v6-v8 / v9-v11 / v12-v14 / v15-v17 : A data for rows 0..5
//   v18-v25 : B vectors (v18-v20 are reloaded for the last three B vectors)
//   v26-v31 : accumulators for rows 0..5; v24/v25 double as min/max when clamping
1002 __asm __volatile (
// Symbolic names for the scratch registers holding the row 1..5 pointers.
1003 "a_ptr1 .req X0\n"
1004 "a_ptr2 .req X1\n"
1005 "a_ptr3 .req X2\n"
1006 "a_ptr4 .req X3\n"
1007 "a_ptr5 .req X4\n"
1008 "c_ptr1 .req X5\n"
1009 "c_ptr2 .req X6\n"
1010 "c_ptr3 .req X7\n"
1011 "c_ptr4 .req X8\n"
1012 "c_ptr5 .req X9\n"
// Derive each row pointer from the previous one by adding the lda / ldc
// operands (bound to ldab / ldcb, i.e. strides already scaled to bytes).
1013 "add a_ptr1, %[a_ptr0], %[lda]\n"
1014 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1015 "add a_ptr2, a_ptr1, %[lda]\n"
1016 "add c_ptr2, c_ptr1, %[ldc]\n"
1017 "add a_ptr3, a_ptr2, %[lda]\n"
1018 "add c_ptr3, c_ptr2, %[ldc]\n"
1019 "add a_ptr4, a_ptr3, %[lda]\n"
1020 "add c_ptr4, c_ptr3, %[ldc]\n"
1021 "add a_ptr5, a_ptr4, %[lda]\n"
1022 "add c_ptr5, c_ptr4, %[ldc]\n"
// Out-of-bounds row handling: for every row counted by oob_rows, starting
// with row 5 and working towards row 1, point that row's A and C pointers
// at row 0 instead. Such rows then load the same A data as row 0 and store
// to the same C address as row 0. The plain "add" instructions do not touch
// the flags, so each b.eq tests the preceding subs.
1023 "cbz %[oob_rows], 1f\n"
1024 "subs %[oob_rows], %[oob_rows], #0x1\n"
1025 "add c_ptr5, %[c_ptr0], #0x0\n"
1026 "add a_ptr5, %[a_ptr0], #0x0\n"
1027 "b.eq 1f\n"
1028 "subs %[oob_rows], %[oob_rows], #0x1\n"
1029 "add c_ptr4, %[c_ptr0], #0x0\n"
1030 "add a_ptr4, %[a_ptr0], #0x0\n"
1031 "b.eq 1f\n"
1032 "subs %[oob_rows], %[oob_rows], #0x1\n"
1033 "add c_ptr3, %[c_ptr0], #0x0\n"
1034 "add a_ptr3, %[a_ptr0], #0x0\n"
1035 "b.eq 1f\n"
1036 "subs %[oob_rows], %[oob_rows], #0x1\n"
1037 "add c_ptr2, %[c_ptr0], #0x0\n"
1038 "add a_ptr2, %[a_ptr0], #0x0\n"
1039 "b.eq 1f\n"
1040 "subs %[oob_rows], %[oob_rows], #0x1\n"
1041 "add c_ptr1, %[c_ptr0], #0x0\n"
1042 "add a_ptr1, %[a_ptr0], #0x0\n"
1043 "1:\n"
// Load the A data once; it stays in v0-v17 for every tile computed below.
// Per row: two 16-byte loads (8 floats), one 8-byte load (2 floats) and a
// single-lane load into lane 2 (1 float) = 11 floats. Lane 3 of
// v2/v5/v8/v11/v14/v17 is never read by the FMLAs in this case.
// Interleaved with these are the loads of the first eight B vectors
// into v18-v25.
1044 "ldr q0, [%[a_ptr0]], #0x10\n"
1045 "ldr q3, [a_ptr1], #0x10\n"
1046 "ldr q6, [a_ptr2], #0x10\n"
1047 "ldr q9, [a_ptr3], #0x10\n"
1048 "ldr q12, [a_ptr4], #0x10\n"
1049 "ldr q15, [a_ptr5], #0x10\n"
1050 "ldr q1, [%[a_ptr0]], #0x10\n"
1051 "ldr q4, [a_ptr1], #0x10\n"
1052 "ldr q7, [a_ptr2], #0x10\n"
1053 "ldr q10, [a_ptr3], #0x10\n"
1054 "ldr d2, [%[a_ptr0]], #0x8\n"
1055 "ldr q13, [a_ptr4], #0x10\n"
1056 "ldr d5, [a_ptr1], #0x8\n"
1057 "ldr q16, [a_ptr5], #0x10\n"
1058 "ldr d8, [a_ptr2], #0x8\n"
1059 "ldr q18, [%[b_ptr0]]\n"
1060 "ldr d11, [a_ptr3], #0x8\n"
1061 "ldr q19, [%[b_ptr0], #0x10]\n"
1062 "ldr d14, [a_ptr4], #0x8\n"
1063 "ldr q20, [%[b_ptr0], #0x20]\n"
1064 "ldr d17, [a_ptr5], #0x8\n"
1065 "ldr q21, [%[b_ptr0], #0x30]\n"
1066 "ld1 {v2.s}[2], [%[a_ptr0]]\n"
1067 "ldr q22, [%[b_ptr0], #0x40]\n"
1068 "ld1 {v5.s}[2], [a_ptr1]\n"
1069 "ldr q23, [%[b_ptr0], #0x50]\n"
1070 "ld1 {v8.s}[2], [a_ptr2]\n"
1071 "ldr q24, [%[b_ptr0], #0x60]\n"
1072 "ld1 {v11.s}[2], [a_ptr3]\n"
1073 "ldr q25, [%[b_ptr0], #0x70]\n"
1074 "ld1 {v14.s}[2], [a_ptr4]\n"
1075 "ld1 {v17.s}[2], [a_ptr5]\n"
// Prefetch-for-load hints at increasing offsets beyond the (already
// advanced) row 5 A pointer.
1076 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1077 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1078 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1079 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1080 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1081 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
// Step past the eight B vectors just loaded.
1082 "add %[b_ptr0], %[b_ptr0], #0x80\n"
// With a zero loop count only one tile is produced: take the path at 2:.
1083 "cbz %[loops], 2f\n"
// First tile: seed all six accumulators with the bias vector, then
// accumulate. The flags set by this subs are consumed by the b.eq just
// before label 4; nothing in between modifies them.
1084 "ldr q26, [%[biasptr]]\n"
1085 "add %[biasptr], %[biasptr], %[biasinc]\n"
1086 "subs %[loops], %[loops], #0x1\n"
1087 "mov v27.16b, v26.16b\n"
1088 "mov v28.16b, v26.16b\n"
1089 "mov v29.16b, v26.16b\n"
1090 "mov v30.16b, v26.16b\n"
1091 "mov v31.16b, v26.16b\n"
// Each group of six FMLAs multiplies one B vector by the matching A
// element of each row. v18, v19 and v20 are reloaded with the 9th, 10th
// and 11th B vectors as soon as their first use is complete.
1092 "fmla v26.4s, v18.4s, v0.s[0]\n"
1093 "fmla v27.4s, v18.4s, v3.s[0]\n"
1094 "fmla v28.4s, v18.4s, v6.s[0]\n"
1095 "fmla v29.4s, v18.4s, v9.s[0]\n"
1096 "fmla v30.4s, v18.4s, v12.s[0]\n"
1097 "fmla v31.4s, v18.4s, v15.s[0]\n"
1098 "ldr q18, [%[b_ptr0]]\n"
1099 "fmla v26.4s, v19.4s, v0.s[1]\n"
1100 "fmla v27.4s, v19.4s, v3.s[1]\n"
1101 "fmla v28.4s, v19.4s, v6.s[1]\n"
1102 "fmla v29.4s, v19.4s, v9.s[1]\n"
1103 "fmla v30.4s, v19.4s, v12.s[1]\n"
1104 "fmla v31.4s, v19.4s, v15.s[1]\n"
1105 "ldr q19, [%[b_ptr0], #0x10]\n"
1106 "fmla v26.4s, v20.4s, v0.s[2]\n"
1107 "fmla v27.4s, v20.4s, v3.s[2]\n"
1108 "fmla v28.4s, v20.4s, v6.s[2]\n"
1109 "fmla v29.4s, v20.4s, v9.s[2]\n"
1110 "fmla v30.4s, v20.4s, v12.s[2]\n"
1111 "fmla v31.4s, v20.4s, v15.s[2]\n"
1112 "ldr q20, [%[b_ptr0], #0x20]\n"
1113 "fmla v26.4s, v21.4s, v0.s[3]\n"
1114 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1115 "fmla v27.4s, v21.4s, v3.s[3]\n"
1116 "fmla v28.4s, v21.4s, v6.s[3]\n"
1117 "fmla v29.4s, v21.4s, v9.s[3]\n"
1118 "fmla v30.4s, v21.4s, v12.s[3]\n"
1119 "fmla v31.4s, v21.4s, v15.s[3]\n"
1120 "fmla v26.4s, v22.4s, v1.s[0]\n"
1121 "fmla v27.4s, v22.4s, v4.s[0]\n"
1122 "fmla v28.4s, v22.4s, v7.s[0]\n"
1123 "fmla v29.4s, v22.4s, v10.s[0]\n"
1124 "fmla v30.4s, v22.4s, v13.s[0]\n"
1125 "fmla v31.4s, v22.4s, v16.s[0]\n"
1126 "fmla v26.4s, v23.4s, v1.s[1]\n"
1127 "fmla v27.4s, v23.4s, v4.s[1]\n"
1128 "fmla v28.4s, v23.4s, v7.s[1]\n"
1129 "fmla v29.4s, v23.4s, v10.s[1]\n"
1130 "fmla v30.4s, v23.4s, v13.s[1]\n"
1131 "fmla v31.4s, v23.4s, v16.s[1]\n"
1132 "fmla v26.4s, v24.4s, v1.s[2]\n"
1133 "fmla v27.4s, v24.4s, v4.s[2]\n"
1134 "fmla v28.4s, v24.4s, v7.s[2]\n"
1135 "fmla v29.4s, v24.4s, v10.s[2]\n"
1136 "fmla v30.4s, v24.4s, v13.s[2]\n"
1137 "fmla v31.4s, v24.4s, v16.s[2]\n"
1138 "fmla v26.4s, v25.4s, v1.s[3]\n"
1139 "fmla v27.4s, v25.4s, v4.s[3]\n"
1140 "fmla v28.4s, v25.4s, v7.s[3]\n"
1141 "fmla v29.4s, v25.4s, v10.s[3]\n"
1142 "fmla v30.4s, v25.4s, v13.s[3]\n"
1143 "fmla v31.4s, v25.4s, v16.s[3]\n"
1144 "fmla v26.4s, v18.4s, v2.s[0]\n"
1145 "fmla v27.4s, v18.4s, v5.s[0]\n"
1146 "fmla v28.4s, v18.4s, v8.s[0]\n"
1147 "fmla v29.4s, v18.4s, v11.s[0]\n"
1148 "fmla v30.4s, v18.4s, v14.s[0]\n"
1149 "fmla v31.4s, v18.4s, v17.s[0]\n"
1150 "fmla v26.4s, v19.4s, v2.s[1]\n"
1151 "fmla v27.4s, v19.4s, v5.s[1]\n"
1152 "fmla v28.4s, v19.4s, v8.s[1]\n"
1153 "fmla v29.4s, v19.4s, v11.s[1]\n"
1154 "fmla v30.4s, v19.4s, v14.s[1]\n"
1155 "fmla v31.4s, v19.4s, v17.s[1]\n"
1156 "fmla v26.4s, v20.4s, v2.s[2]\n"
1157 "fmla v27.4s, v20.4s, v5.s[2]\n"
1158 "fmla v28.4s, v20.4s, v8.s[2]\n"
1159 "fmla v29.4s, v20.4s, v11.s[2]\n"
1160 "fmla v30.4s, v20.4s, v14.s[2]\n"
1161 "fmla v31.4s, v20.4s, v17.s[2]\n"
// Counter reached zero in the subs above: skip the loop body.
1162 "b.eq 3f\n"
1163 "4:\n"
// Loop body: clamp and store the tile just computed while loading the B
// vectors and bias for the next tile and starting its accumulation.
// v24/v25 hold the broadcast min/max until the last fmax/fmin that needs
// them, after which they are overwritten with B data. Each accumulator is
// re-seeded from the new bias only after its old value has been stored.
// The flags from this subs are tested by the b.ne at the bottom of the loop.
1164 "ld1r {v24.4s}, [%[minptr]]\n"
1165 "subs %[loops], %[loops], #0x1\n"
1166 "ld1r {v25.4s}, [%[maxptr]]\n"
1167 "ldr q18, [%[b_ptr0]]\n"
1168 "fmax v26.4s, v26.4s, v24.4s\n"
1169 "ldr q19, [%[b_ptr0], #0x10]\n"
1170 "fmax v27.4s, v27.4s, v24.4s\n"
1171 "ldr q20, [%[b_ptr0], #0x20]\n"
1172 "fmax v28.4s, v28.4s, v24.4s\n"
1173 "ldr q21, [%[b_ptr0], #0x30]\n"
1174 "fmax v29.4s, v29.4s, v24.4s\n"
1175 "ldr q22, [%[b_ptr0], #0x40]\n"
1176 "fmin v26.4s, v26.4s, v25.4s\n"
1177 "ldr q23, [%[b_ptr0], #0x50]\n"
1178 "fmin v27.4s, v27.4s, v25.4s\n"
1179 "fmin v28.4s, v28.4s, v25.4s\n"
1180 "fmin v29.4s, v29.4s, v25.4s\n"
1181 "str q26, [%[c_ptr0]]\n"
1182 "fmax v30.4s, v30.4s, v24.4s\n"
1183 "ldr q26, [%[biasptr]]\n"
1184 "fmax v31.4s, v31.4s, v24.4s\n"
1185 "ldr q24, [%[b_ptr0], #0x60]\n"
1186 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1187 "str q27, [c_ptr1]\n"
1188 "add c_ptr1, c_ptr1, #0x10\n"
1189 "fmin v30.4s, v30.4s, v25.4s\n"
1190 "add %[biasptr], %[biasptr], %[biasinc]\n"
1191 "fmin v31.4s, v31.4s, v25.4s\n"
1192 "str q28, [c_ptr2]\n"
1193 "mov v27.16b, v26.16b\n"
1194 "ldr q25, [%[b_ptr0], #0x70]\n"
1195 "mov v28.16b, v26.16b\n"
1196 "add c_ptr2, c_ptr2, #0x10\n"
1197 "str q29, [c_ptr3]\n"
1198 "add c_ptr3, c_ptr3, #0x10\n"
1199 "mov v29.16b, v26.16b\n"
1200 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1201 "fmla v27.4s, v18.4s, v3.s[0]\n"
1202 "str q30, [c_ptr4]\n"
1203 "mov v30.16b, v26.16b\n"
1204 "add c_ptr4, c_ptr4, #0x10\n"
1205 "fmla v28.4s, v18.4s, v6.s[0]\n"
1206 "str q31, [c_ptr5]\n"
1207 "mov v31.16b, v26.16b\n"
1208 "add c_ptr5, c_ptr5, #0x10\n"
1209 "fmla v26.4s, v18.4s, v0.s[0]\n"
// Prefetch-for-store hints ahead of each (already advanced) C row pointer.
1210 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1211 "fmla v29.4s, v18.4s, v9.s[0]\n"
1212 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1213 "fmla v30.4s, v18.4s, v12.s[0]\n"
1214 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1215 "fmla v31.4s, v18.4s, v15.s[0]\n"
1216 "ldr q18, [%[b_ptr0]]\n"
1217 "fmla v26.4s, v19.4s, v0.s[1]\n"
1218 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1219 "fmla v27.4s, v19.4s, v3.s[1]\n"
1220 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1221 "fmla v28.4s, v19.4s, v6.s[1]\n"
1222 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1223 "fmla v29.4s, v19.4s, v9.s[1]\n"
1224 "fmla v30.4s, v19.4s, v12.s[1]\n"
1225 "fmla v31.4s, v19.4s, v15.s[1]\n"
1226 "ldr q19, [%[b_ptr0], #0x10]\n"
1227 "fmla v26.4s, v20.4s, v0.s[2]\n"
1228 "fmla v27.4s, v20.4s, v3.s[2]\n"
1229 "fmla v28.4s, v20.4s, v6.s[2]\n"
1230 "fmla v29.4s, v20.4s, v9.s[2]\n"
1231 "fmla v30.4s, v20.4s, v12.s[2]\n"
1232 "fmla v31.4s, v20.4s, v15.s[2]\n"
1233 "ldr q20, [%[b_ptr0], #0x20]\n"
1234 "fmla v26.4s, v21.4s, v0.s[3]\n"
1235 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1236 "fmla v27.4s, v21.4s, v3.s[3]\n"
1237 "fmla v28.4s, v21.4s, v6.s[3]\n"
1238 "fmla v29.4s, v21.4s, v9.s[3]\n"
1239 "fmla v30.4s, v21.4s, v12.s[3]\n"
1240 "fmla v31.4s, v21.4s, v15.s[3]\n"
1241 "fmla v26.4s, v22.4s, v1.s[0]\n"
1242 "fmla v27.4s, v22.4s, v4.s[0]\n"
1243 "fmla v28.4s, v22.4s, v7.s[0]\n"
1244 "fmla v29.4s, v22.4s, v10.s[0]\n"
1245 "fmla v30.4s, v22.4s, v13.s[0]\n"
1246 "fmla v31.4s, v22.4s, v16.s[0]\n"
1247 "fmla v26.4s, v23.4s, v1.s[1]\n"
1248 "fmla v27.4s, v23.4s, v4.s[1]\n"
1249 "fmla v28.4s, v23.4s, v7.s[1]\n"
1250 "fmla v29.4s, v23.4s, v10.s[1]\n"
1251 "fmla v30.4s, v23.4s, v13.s[1]\n"
1252 "fmla v31.4s, v23.4s, v16.s[1]\n"
1253 "fmla v26.4s, v24.4s, v1.s[2]\n"
1254 "fmla v27.4s, v24.4s, v4.s[2]\n"
1255 "fmla v28.4s, v24.4s, v7.s[2]\n"
1256 "fmla v29.4s, v24.4s, v10.s[2]\n"
1257 "fmla v30.4s, v24.4s, v13.s[2]\n"
1258 "fmla v31.4s, v24.4s, v16.s[2]\n"
1259 "fmla v26.4s, v25.4s, v1.s[3]\n"
1260 "fmla v27.4s, v25.4s, v4.s[3]\n"
1261 "fmla v28.4s, v25.4s, v7.s[3]\n"
1262 "fmla v29.4s, v25.4s, v10.s[3]\n"
1263 "fmla v30.4s, v25.4s, v13.s[3]\n"
1264 "fmla v31.4s, v25.4s, v16.s[3]\n"
1265 "fmla v26.4s, v18.4s, v2.s[0]\n"
1266 "fmla v27.4s, v18.4s, v5.s[0]\n"
1267 "fmla v28.4s, v18.4s, v8.s[0]\n"
1268 "fmla v29.4s, v18.4s, v11.s[0]\n"
1269 "fmla v30.4s, v18.4s, v14.s[0]\n"
1270 "fmla v31.4s, v18.4s, v17.s[0]\n"
1271 "fmla v26.4s, v19.4s, v2.s[1]\n"
1272 "fmla v27.4s, v19.4s, v5.s[1]\n"
1273 "fmla v28.4s, v19.4s, v8.s[1]\n"
1274 "fmla v29.4s, v19.4s, v11.s[1]\n"
1275 "fmla v30.4s, v19.4s, v14.s[1]\n"
1276 "fmla v31.4s, v19.4s, v17.s[1]\n"
1277 "fmla v26.4s, v20.4s, v2.s[2]\n"
1278 "fmla v27.4s, v20.4s, v5.s[2]\n"
1279 "fmla v28.4s, v20.4s, v8.s[2]\n"
1280 "fmla v29.4s, v20.4s, v11.s[2]\n"
1281 "fmla v30.4s, v20.4s, v14.s[2]\n"
1282 "fmla v31.4s, v20.4s, v17.s[2]\n"
1283 "b.ne 4b\n"
1284 "3:\n"
// Loop exit: same clamp/store-then-compute sequence as the loop body, but
// without the counter decrement and store prefetches. The tile computed
// here is the last one; it is clamped and stored at label 5.
1285 "ld1r {v24.4s}, [%[minptr]]\n"
1286 "ld1r {v25.4s}, [%[maxptr]]\n"
1287 "ldr q18, [%[b_ptr0]]\n"
1288 "ldr q19, [%[b_ptr0], #0x10]\n"
1289 "fmax v26.4s, v26.4s, v24.4s\n"
1290 "ldr q20, [%[b_ptr0], #0x20]\n"
1291 "fmax v27.4s, v27.4s, v24.4s\n"
1292 "ldr q21, [%[b_ptr0], #0x30]\n"
1293 "fmax v28.4s, v28.4s, v24.4s\n"
1294 "ldr q22, [%[b_ptr0], #0x40]\n"
1295 "fmax v29.4s, v29.4s, v24.4s\n"
1296 "ldr q23, [%[b_ptr0], #0x50]\n"
1297 "fmin v26.4s, v26.4s, v25.4s\n"
1298 "fmin v27.4s, v27.4s, v25.4s\n"
1299 "fmin v28.4s, v28.4s, v25.4s\n"
1300 "fmin v29.4s, v29.4s, v25.4s\n"
1301 "str q26, [%[c_ptr0]]\n"
1302 "fmax v30.4s, v30.4s, v24.4s\n"
1303 "ldr q26, [%[biasptr]]\n"
1304 "fmax v31.4s, v31.4s, v24.4s\n"
1305 "ldr q24, [%[b_ptr0], #0x60]\n"
1306 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1307 "str q27, [c_ptr1]\n"
1308 "add c_ptr1, c_ptr1, #0x10\n"
1309 "fmin v30.4s, v30.4s, v25.4s\n"
1310 "add %[biasptr], %[biasptr], %[biasinc]\n"
1311 "fmin v31.4s, v31.4s, v25.4s\n"
1312 "str q28, [c_ptr2]\n"
1313 "mov v27.16b, v26.16b\n"
1314 "ldr q25, [%[b_ptr0], #0x70]\n"
1315 "mov v28.16b, v26.16b\n"
1316 "add c_ptr2, c_ptr2, #0x10\n"
1317 "str q29, [c_ptr3]\n"
1318 "add c_ptr3, c_ptr3, #0x10\n"
1319 "mov v29.16b, v26.16b\n"
1320 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1321 "fmla v27.4s, v18.4s, v3.s[0]\n"
1322 "str q30, [c_ptr4]\n"
1323 "mov v30.16b, v26.16b\n"
1324 "add c_ptr4, c_ptr4, #0x10\n"
1325 "fmla v28.4s, v18.4s, v6.s[0]\n"
1326 "str q31, [c_ptr5]\n"
1327 "mov v31.16b, v26.16b\n"
1328 "add c_ptr5, c_ptr5, #0x10\n"
1329 "fmla v26.4s, v18.4s, v0.s[0]\n"
1330 "fmla v29.4s, v18.4s, v9.s[0]\n"
1331 "fmla v30.4s, v18.4s, v12.s[0]\n"
1332 "fmla v31.4s, v18.4s, v15.s[0]\n"
1333 "ldr q18, [%[b_ptr0]]\n"
1334 "fmla v26.4s, v19.4s, v0.s[1]\n"
1335 "fmla v27.4s, v19.4s, v3.s[1]\n"
1336 "fmla v28.4s, v19.4s, v6.s[1]\n"
1337 "fmla v29.4s, v19.4s, v9.s[1]\n"
1338 "fmla v30.4s, v19.4s, v12.s[1]\n"
1339 "fmla v31.4s, v19.4s, v15.s[1]\n"
1340 "ldr q19, [%[b_ptr0], #0x10]\n"
1341 "fmla v26.4s, v20.4s, v0.s[2]\n"
1342 "fmla v27.4s, v20.4s, v3.s[2]\n"
1343 "fmla v28.4s, v20.4s, v6.s[2]\n"
1344 "fmla v29.4s, v20.4s, v9.s[2]\n"
1345 "fmla v30.4s, v20.4s, v12.s[2]\n"
1346 "fmla v31.4s, v20.4s, v15.s[2]\n"
1347 "ldr q20, [%[b_ptr0], #0x20]\n"
1348 "fmla v26.4s, v21.4s, v0.s[3]\n"
1349 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1350 "fmla v27.4s, v21.4s, v3.s[3]\n"
1351 "fmla v28.4s, v21.4s, v6.s[3]\n"
1352 "fmla v29.4s, v21.4s, v9.s[3]\n"
1353 "fmla v30.4s, v21.4s, v12.s[3]\n"
1354 "fmla v31.4s, v21.4s, v15.s[3]\n"
1355 "fmla v26.4s, v22.4s, v1.s[0]\n"
1356 "fmla v27.4s, v22.4s, v4.s[0]\n"
1357 "fmla v28.4s, v22.4s, v7.s[0]\n"
1358 "fmla v29.4s, v22.4s, v10.s[0]\n"
1359 "fmla v30.4s, v22.4s, v13.s[0]\n"
1360 "fmla v31.4s, v22.4s, v16.s[0]\n"
1361 "fmla v26.4s, v23.4s, v1.s[1]\n"
1362 "fmla v27.4s, v23.4s, v4.s[1]\n"
1363 "fmla v28.4s, v23.4s, v7.s[1]\n"
1364 "fmla v29.4s, v23.4s, v10.s[1]\n"
1365 "fmla v30.4s, v23.4s, v13.s[1]\n"
1366 "fmla v31.4s, v23.4s, v16.s[1]\n"
1367 "fmla v26.4s, v24.4s, v1.s[2]\n"
1368 "fmla v27.4s, v24.4s, v4.s[2]\n"
1369 "fmla v28.4s, v24.4s, v7.s[2]\n"
1370 "fmla v29.4s, v24.4s, v10.s[2]\n"
1371 "fmla v30.4s, v24.4s, v13.s[2]\n"
1372 "fmla v31.4s, v24.4s, v16.s[2]\n"
1373 "fmla v26.4s, v25.4s, v1.s[3]\n"
1374 "fmla v27.4s, v25.4s, v4.s[3]\n"
1375 "fmla v28.4s, v25.4s, v7.s[3]\n"
1376 "fmla v29.4s, v25.4s, v10.s[3]\n"
1377 "fmla v30.4s, v25.4s, v13.s[3]\n"
1378 "fmla v31.4s, v25.4s, v16.s[3]\n"
1379 "fmla v26.4s, v18.4s, v2.s[0]\n"
1380 "fmla v27.4s, v18.4s, v5.s[0]\n"
1381 "fmla v28.4s, v18.4s, v8.s[0]\n"
1382 "fmla v29.4s, v18.4s, v11.s[0]\n"
1383 "fmla v30.4s, v18.4s, v14.s[0]\n"
1384 "fmla v31.4s, v18.4s, v17.s[0]\n"
1385 "fmla v26.4s, v19.4s, v2.s[1]\n"
1386 "fmla v27.4s, v19.4s, v5.s[1]\n"
1387 "fmla v28.4s, v19.4s, v8.s[1]\n"
1388 "fmla v29.4s, v19.4s, v11.s[1]\n"
1389 "fmla v30.4s, v19.4s, v14.s[1]\n"
1390 "fmla v31.4s, v19.4s, v17.s[1]\n"
1391 "fmla v26.4s, v20.4s, v2.s[2]\n"
1392 "fmla v27.4s, v20.4s, v5.s[2]\n"
1393 "fmla v28.4s, v20.4s, v8.s[2]\n"
1394 "fmla v29.4s, v20.4s, v11.s[2]\n"
1395 "fmla v30.4s, v20.4s, v14.s[2]\n"
1396 "fmla v31.4s, v20.4s, v17.s[2]\n"
1397 "b 5f\n"
1398 "2:\n"
// Single-tile path (loop count was zero on entry): seed the accumulators
// with the bias and accumulate all 11 products, using the B vectors that
// were preloaded at label 1 plus three more loaded here.
1399 "ldr q26, [%[biasptr]]\n"
1400 "add %[biasptr], %[biasptr], %[biasinc]\n"
1401 "mov v27.16b, v26.16b\n"
1402 "mov v28.16b, v26.16b\n"
1403 "mov v29.16b, v26.16b\n"
1404 "mov v30.16b, v26.16b\n"
1405 "mov v31.16b, v26.16b\n"
1406 "fmla v26.4s, v18.4s, v0.s[0]\n"
1407 "fmla v27.4s, v18.4s, v3.s[0]\n"
1408 "fmla v28.4s, v18.4s, v6.s[0]\n"
1409 "fmla v29.4s, v18.4s, v9.s[0]\n"
1410 "fmla v30.4s, v18.4s, v12.s[0]\n"
1411 "fmla v31.4s, v18.4s, v15.s[0]\n"
1412 "ldr q18, [%[b_ptr0]]\n"
1413 "fmla v26.4s, v19.4s, v0.s[1]\n"
1414 "fmla v27.4s, v19.4s, v3.s[1]\n"
1415 "fmla v28.4s, v19.4s, v6.s[1]\n"
1416 "fmla v29.4s, v19.4s, v9.s[1]\n"
1417 "fmla v30.4s, v19.4s, v12.s[1]\n"
1418 "fmla v31.4s, v19.4s, v15.s[1]\n"
1419 "ldr q19, [%[b_ptr0], #0x10]\n"
1420 "fmla v26.4s, v20.4s, v0.s[2]\n"
1421 "fmla v27.4s, v20.4s, v3.s[2]\n"
1422 "fmla v28.4s, v20.4s, v6.s[2]\n"
1423 "fmla v29.4s, v20.4s, v9.s[2]\n"
1424 "fmla v30.4s, v20.4s, v12.s[2]\n"
1425 "fmla v31.4s, v20.4s, v15.s[2]\n"
1426 "ldr q20, [%[b_ptr0], #0x20]\n"
1427 "fmla v26.4s, v21.4s, v0.s[3]\n"
1428 "add %[b_ptr0], %[b_ptr0], #0x30\n"
1429 "fmla v27.4s, v21.4s, v3.s[3]\n"
1430 "fmla v28.4s, v21.4s, v6.s[3]\n"
1431 "fmla v29.4s, v21.4s, v9.s[3]\n"
1432 "fmla v30.4s, v21.4s, v12.s[3]\n"
1433 "fmla v31.4s, v21.4s, v15.s[3]\n"
1434 "fmla v26.4s, v22.4s, v1.s[0]\n"
1435 "fmla v27.4s, v22.4s, v4.s[0]\n"
1436 "fmla v28.4s, v22.4s, v7.s[0]\n"
1437 "fmla v29.4s, v22.4s, v10.s[0]\n"
1438 "fmla v30.4s, v22.4s, v13.s[0]\n"
1439 "fmla v31.4s, v22.4s, v16.s[0]\n"
1440 "fmla v26.4s, v23.4s, v1.s[1]\n"
1441 "fmla v27.4s, v23.4s, v4.s[1]\n"
1442 "fmla v28.4s, v23.4s, v7.s[1]\n"
1443 "fmla v29.4s, v23.4s, v10.s[1]\n"
1444 "fmla v30.4s, v23.4s, v13.s[1]\n"
1445 "fmla v31.4s, v23.4s, v16.s[1]\n"
1446 "fmla v26.4s, v24.4s, v1.s[2]\n"
1447 "fmla v27.4s, v24.4s, v4.s[2]\n"
1448 "fmla v28.4s, v24.4s, v7.s[2]\n"
1449 "fmla v29.4s, v24.4s, v10.s[2]\n"
1450 "fmla v30.4s, v24.4s, v13.s[2]\n"
1451 "fmla v31.4s, v24.4s, v16.s[2]\n"
1452 "fmla v26.4s, v25.4s, v1.s[3]\n"
1453 "fmla v27.4s, v25.4s, v4.s[3]\n"
1454 "fmla v28.4s, v25.4s, v7.s[3]\n"
1455 "fmla v29.4s, v25.4s, v10.s[3]\n"
1456 "fmla v30.4s, v25.4s, v13.s[3]\n"
1457 "fmla v31.4s, v25.4s, v16.s[3]\n"
1458 "fmla v26.4s, v18.4s, v2.s[0]\n"
1459 "fmla v27.4s, v18.4s, v5.s[0]\n"
1460 "fmla v28.4s, v18.4s, v8.s[0]\n"
1461 "fmla v29.4s, v18.4s, v11.s[0]\n"
1462 "fmla v30.4s, v18.4s, v14.s[0]\n"
1463 "fmla v31.4s, v18.4s, v17.s[0]\n"
1464 "fmla v26.4s, v19.4s, v2.s[1]\n"
1465 "fmla v27.4s, v19.4s, v5.s[1]\n"
1466 "fmla v28.4s, v19.4s, v8.s[1]\n"
1467 "fmla v29.4s, v19.4s, v11.s[1]\n"
1468 "fmla v30.4s, v19.4s, v14.s[1]\n"
1469 "fmla v31.4s, v19.4s, v17.s[1]\n"
1470 "fmla v26.4s, v20.4s, v2.s[2]\n"
1471 "fmla v27.4s, v20.4s, v5.s[2]\n"
1472 "fmla v28.4s, v20.4s, v8.s[2]\n"
1473 "fmla v29.4s, v20.4s, v11.s[2]\n"
1474 "fmla v30.4s, v20.4s, v14.s[2]\n"
1475 "fmla v31.4s, v20.4s, v17.s[2]\n"
1476 "5:\n"
// Final tile: broadcast the floats at minptr / maxptr, clamp every
// accumulator to [min, max] and store the six rows. Only c_ptr0 is advanced
// here; the row 1..5 pointers are scratch and are discarded.
1477 "ld1r {v24.4s}, [%[minptr]]\n"
1478 "ld1r {v25.4s}, [%[maxptr]]\n"
1479 "fmax v26.4s, v26.4s, v24.4s\n"
1480 "fmax v27.4s, v27.4s, v24.4s\n"
1481 "fmax v28.4s, v28.4s, v24.4s\n"
1482 "fmax v29.4s, v29.4s, v24.4s\n"
1483 "fmin v26.4s, v26.4s, v25.4s\n"
1484 "fmin v27.4s, v27.4s, v25.4s\n"
1485 "fmin v28.4s, v28.4s, v25.4s\n"
1486 "fmin v29.4s, v29.4s, v25.4s\n"
1487 "str q26, [%[c_ptr0]]\n"
1488 "fmax v30.4s, v30.4s, v24.4s\n"
1489 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1490 "fmax v31.4s, v31.4s, v24.4s\n"
1491 "str q27, [c_ptr1]\n"
1492 "fmin v30.4s, v30.4s, v25.4s\n"
1493 "fmin v31.4s, v31.4s, v25.4s\n"
1494 "str q28, [c_ptr2]\n"
1495 "str q29, [c_ptr3]\n"
1496 "str q30, [c_ptr4]\n"
1497 "str q31, [c_ptr5]\n"
// Drop the register aliases so the same names can be defined again by
// another asm block.
1498 ".unreq a_ptr1\n"
1499 ".unreq a_ptr2\n"
1500 ".unreq a_ptr3\n"
1501 ".unreq a_ptr4\n"
1502 ".unreq a_ptr5\n"
1503 ".unreq c_ptr1\n"
1504 ".unreq c_ptr2\n"
1505 ".unreq c_ptr3\n"
1506 ".unreq c_ptr4\n"
1507 ".unreq c_ptr5\n"
// Read-write operands: pointers and counters that the asm modifies.
1508 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
// Read-only operands: byte strides, bias step and pointers to the clamp bounds.
1509 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
// Clobbers: x0-x9 back the .req aliases, all 32 vector registers are listed,
// the subs instructions change the flags, and C is written through pointers.
1510 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
1511 );
1512 break;
1513 case 12:
1514 __asm __volatile (
1515 "a_ptr1 .req X0\n"
1516 "a_ptr2 .req X1\n"
1517 "a_ptr3 .req X2\n"
1518 "a_ptr4 .req X3\n"
1519 "a_ptr5 .req X4\n"
1520 "c_ptr1 .req X5\n"
1521 "c_ptr2 .req X6\n"
1522 "c_ptr3 .req X7\n"
1523 "c_ptr4 .req X8\n"
1524 "c_ptr5 .req X9\n"
1525 "add a_ptr1, %[a_ptr0], %[lda]\n"
1526 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1527 "add a_ptr2, a_ptr1, %[lda]\n"
1528 "add c_ptr2, c_ptr1, %[ldc]\n"
1529 "add a_ptr3, a_ptr2, %[lda]\n"
1530 "add c_ptr3, c_ptr2, %[ldc]\n"
1531 "add a_ptr4, a_ptr3, %[lda]\n"
1532 "add c_ptr4, c_ptr3, %[ldc]\n"
1533 "add a_ptr5, a_ptr4, %[lda]\n"
1534 "add c_ptr5, c_ptr4, %[ldc]\n"
1535 "cbz %[oob_rows], 1f\n"
1536 "subs %[oob_rows], %[oob_rows], #0x1\n"
1537 "add c_ptr5, %[c_ptr0], #0x0\n"
1538 "add a_ptr5, %[a_ptr0], #0x0\n"
1539 "b.eq 1f\n"
1540 "subs %[oob_rows], %[oob_rows], #0x1\n"
1541 "add c_ptr4, %[c_ptr0], #0x0\n"
1542 "add a_ptr4, %[a_ptr0], #0x0\n"
1543 "b.eq 1f\n"
1544 "subs %[oob_rows], %[oob_rows], #0x1\n"
1545 "add c_ptr3, %[c_ptr0], #0x0\n"
1546 "add a_ptr3, %[a_ptr0], #0x0\n"
1547 "b.eq 1f\n"
1548 "subs %[oob_rows], %[oob_rows], #0x1\n"
1549 "add c_ptr2, %[c_ptr0], #0x0\n"
1550 "add a_ptr2, %[a_ptr0], #0x0\n"
1551 "b.eq 1f\n"
1552 "subs %[oob_rows], %[oob_rows], #0x1\n"
1553 "add c_ptr1, %[c_ptr0], #0x0\n"
1554 "add a_ptr1, %[a_ptr0], #0x0\n"
1555 "1:\n"
1556 "ldr q0, [%[a_ptr0]], #0x10\n"
1557 "ldr q3, [a_ptr1], #0x10\n"
1558 "ldr q6, [a_ptr2], #0x10\n"
1559 "ldr q9, [a_ptr3], #0x10\n"
1560 "ldr q12, [a_ptr4], #0x10\n"
1561 "ldr q15, [a_ptr5], #0x10\n"
1562 "ldr q1, [%[a_ptr0]], #0x10\n"
1563 "ldr q4, [a_ptr1], #0x10\n"
1564 "ldr q7, [a_ptr2], #0x10\n"
1565 "ldr q10, [a_ptr3], #0x10\n"
1566 "ldr q13, [a_ptr4], #0x10\n"
1567 "ldr q16, [a_ptr5], #0x10\n"
1568 "ldr q2, [%[a_ptr0]]\n"
1569 "ldr q5, [a_ptr1]\n"
1570 "ldr q8, [a_ptr2]\n"
1571 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
1572 "ldr q11, [a_ptr3]\n"
1573 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
1574 "ldr q14, [a_ptr4]\n"
1575 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
1576 "ldr q17, [a_ptr5]\n"
1577 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
1578 "ldr q18, [%[b_ptr0]]\n"
1579 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
1580 "ldr q19, [%[b_ptr0], #0x10]\n"
1581 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
1582 "ldr q20, [%[b_ptr0], #0x20]\n"
1583 "ldr q21, [%[b_ptr0], #0x30]\n"
1584 "ldr q22, [%[b_ptr0], #0x40]\n"
1585 "ldr q23, [%[b_ptr0], #0x50]\n"
1586 "ldr q24, [%[b_ptr0], #0x60]\n"
1587 "ldr q25, [%[b_ptr0], #0x70]\n"
1588 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1589 "cbz %[loops], 2f\n"
1590 "ldr q26, [%[biasptr]]\n"
1591 "add %[biasptr], %[biasptr], %[biasinc]\n"
1592 "subs %[loops], %[loops], #0x1\n"
1593 "mov v27.16b, v26.16b\n"
1594 "mov v28.16b, v26.16b\n"
1595 "mov v29.16b, v26.16b\n"
1596 "mov v30.16b, v26.16b\n"
1597 "mov v31.16b, v26.16b\n"
1598 "fmla v26.4s, v18.4s, v0.s[0]\n"
1599 "fmla v27.4s, v18.4s, v3.s[0]\n"
1600 "fmla v28.4s, v18.4s, v6.s[0]\n"
1601 "fmla v29.4s, v18.4s, v9.s[0]\n"
1602 "fmla v30.4s, v18.4s, v12.s[0]\n"
1603 "fmla v31.4s, v18.4s, v15.s[0]\n"
1604 "ldr q18, [%[b_ptr0]]\n"
1605 "fmla v26.4s, v19.4s, v0.s[1]\n"
1606 "fmla v27.4s, v19.4s, v3.s[1]\n"
1607 "fmla v28.4s, v19.4s, v6.s[1]\n"
1608 "fmla v29.4s, v19.4s, v9.s[1]\n"
1609 "fmla v30.4s, v19.4s, v12.s[1]\n"
1610 "fmla v31.4s, v19.4s, v15.s[1]\n"
1611 "ldr q19, [%[b_ptr0], #0x10]\n"
1612 "fmla v26.4s, v20.4s, v0.s[2]\n"
1613 "fmla v27.4s, v20.4s, v3.s[2]\n"
1614 "fmla v28.4s, v20.4s, v6.s[2]\n"
1615 "fmla v29.4s, v20.4s, v9.s[2]\n"
1616 "fmla v30.4s, v20.4s, v12.s[2]\n"
1617 "fmla v31.4s, v20.4s, v15.s[2]\n"
1618 "ldr q20, [%[b_ptr0], #0x20]\n"
1619 "fmla v26.4s, v21.4s, v0.s[3]\n"
1620 "fmla v27.4s, v21.4s, v3.s[3]\n"
1621 "fmla v28.4s, v21.4s, v6.s[3]\n"
1622 "fmla v29.4s, v21.4s, v9.s[3]\n"
1623 "fmla v30.4s, v21.4s, v12.s[3]\n"
1624 "fmla v31.4s, v21.4s, v15.s[3]\n"
1625 "ldr q21, [%[b_ptr0], #0x30]\n"
1626 "fmla v26.4s, v22.4s, v1.s[0]\n"
1627 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1628 "fmla v27.4s, v22.4s, v4.s[0]\n"
1629 "fmla v28.4s, v22.4s, v7.s[0]\n"
1630 "fmla v29.4s, v22.4s, v10.s[0]\n"
1631 "fmla v30.4s, v22.4s, v13.s[0]\n"
1632 "fmla v31.4s, v22.4s, v16.s[0]\n"
1633 "fmla v26.4s, v23.4s, v1.s[1]\n"
1634 "fmla v27.4s, v23.4s, v4.s[1]\n"
1635 "fmla v28.4s, v23.4s, v7.s[1]\n"
1636 "fmla v29.4s, v23.4s, v10.s[1]\n"
1637 "fmla v30.4s, v23.4s, v13.s[1]\n"
1638 "fmla v31.4s, v23.4s, v16.s[1]\n"
1639 "fmla v26.4s, v24.4s, v1.s[2]\n"
1640 "fmla v27.4s, v24.4s, v4.s[2]\n"
1641 "fmla v28.4s, v24.4s, v7.s[2]\n"
1642 "fmla v29.4s, v24.4s, v10.s[2]\n"
1643 "fmla v30.4s, v24.4s, v13.s[2]\n"
1644 "fmla v31.4s, v24.4s, v16.s[2]\n"
1645 "fmla v26.4s, v25.4s, v1.s[3]\n"
1646 "fmla v27.4s, v25.4s, v4.s[3]\n"
1647 "fmla v28.4s, v25.4s, v7.s[3]\n"
1648 "fmla v29.4s, v25.4s, v10.s[3]\n"
1649 "fmla v30.4s, v25.4s, v13.s[3]\n"
1650 "fmla v31.4s, v25.4s, v16.s[3]\n"
1651 "fmla v26.4s, v18.4s, v2.s[0]\n"
1652 "fmla v27.4s, v18.4s, v5.s[0]\n"
1653 "fmla v28.4s, v18.4s, v8.s[0]\n"
1654 "fmla v29.4s, v18.4s, v11.s[0]\n"
1655 "fmla v30.4s, v18.4s, v14.s[0]\n"
1656 "fmla v31.4s, v18.4s, v17.s[0]\n"
1657 "fmla v26.4s, v19.4s, v2.s[1]\n"
1658 "fmla v27.4s, v19.4s, v5.s[1]\n"
1659 "fmla v28.4s, v19.4s, v8.s[1]\n"
1660 "fmla v29.4s, v19.4s, v11.s[1]\n"
1661 "fmla v30.4s, v19.4s, v14.s[1]\n"
1662 "fmla v31.4s, v19.4s, v17.s[1]\n"
1663 "fmla v26.4s, v20.4s, v2.s[2]\n"
1664 "fmla v27.4s, v20.4s, v5.s[2]\n"
1665 "fmla v28.4s, v20.4s, v8.s[2]\n"
1666 "fmla v29.4s, v20.4s, v11.s[2]\n"
1667 "fmla v30.4s, v20.4s, v14.s[2]\n"
1668 "fmla v31.4s, v20.4s, v17.s[2]\n"
1669 "fmla v26.4s, v21.4s, v2.s[3]\n"
1670 "fmla v27.4s, v21.4s, v5.s[3]\n"
1671 "fmla v28.4s, v21.4s, v8.s[3]\n"
1672 "fmla v29.4s, v21.4s, v11.s[3]\n"
1673 "fmla v30.4s, v21.4s, v14.s[3]\n"
1674 "fmla v31.4s, v21.4s, v17.s[3]\n"
1675 "b.eq 3f\n"
1676 "4:\n"
1677 "ld1r {v24.4s}, [%[minptr]]\n"
1678 "subs %[loops], %[loops], #0x1\n"
1679 "ld1r {v25.4s}, [%[maxptr]]\n"
1680 "ldr q18, [%[b_ptr0]]\n"
1681 "fmax v26.4s, v26.4s, v24.4s\n"
1682 "ldr q19, [%[b_ptr0], #0x10]\n"
1683 "fmax v27.4s, v27.4s, v24.4s\n"
1684 "ldr q20, [%[b_ptr0], #0x20]\n"
1685 "fmax v28.4s, v28.4s, v24.4s\n"
1686 "ldr q21, [%[b_ptr0], #0x30]\n"
1687 "fmax v29.4s, v29.4s, v24.4s\n"
1688 "ldr q22, [%[b_ptr0], #0x40]\n"
1689 "fmin v26.4s, v26.4s, v25.4s\n"
1690 "ldr q23, [%[b_ptr0], #0x50]\n"
1691 "fmin v27.4s, v27.4s, v25.4s\n"
1692 "fmin v28.4s, v28.4s, v25.4s\n"
1693 "fmin v29.4s, v29.4s, v25.4s\n"
1694 "str q26, [%[c_ptr0]]\n"
1695 "fmax v30.4s, v30.4s, v24.4s\n"
1696 "ldr q26, [%[biasptr]]\n"
1697 "fmax v31.4s, v31.4s, v24.4s\n"
1698 "ldr q24, [%[b_ptr0], #0x60]\n"
1699 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1700 "str q27, [c_ptr1]\n"
1701 "add c_ptr1, c_ptr1, #0x10\n"
1702 "fmin v30.4s, v30.4s, v25.4s\n"
1703 "add %[biasptr], %[biasptr], %[biasinc]\n"
1704 "fmin v31.4s, v31.4s, v25.4s\n"
1705 "str q28, [c_ptr2]\n"
1706 "mov v27.16b, v26.16b\n"
1707 "ldr q25, [%[b_ptr0], #0x70]\n"
1708 "mov v28.16b, v26.16b\n"
1709 "add c_ptr2, c_ptr2, #0x10\n"
1710 "str q29, [c_ptr3]\n"
1711 "add c_ptr3, c_ptr3, #0x10\n"
1712 "mov v29.16b, v26.16b\n"
1713 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1714 "fmla v27.4s, v18.4s, v3.s[0]\n"
1715 "str q30, [c_ptr4]\n"
1716 "mov v30.16b, v26.16b\n"
1717 "add c_ptr4, c_ptr4, #0x10\n"
1718 "fmla v28.4s, v18.4s, v6.s[0]\n"
1719 "str q31, [c_ptr5]\n"
1720 "mov v31.16b, v26.16b\n"
1721 "add c_ptr5, c_ptr5, #0x10\n"
1722 "fmla v26.4s, v18.4s, v0.s[0]\n"
1723 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1724 "fmla v29.4s, v18.4s, v9.s[0]\n"
1725 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1726 "fmla v30.4s, v18.4s, v12.s[0]\n"
1727 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1728 "fmla v31.4s, v18.4s, v15.s[0]\n"
1729 "ldr q18, [%[b_ptr0]]\n"
1730 "fmla v26.4s, v19.4s, v0.s[1]\n"
1731 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1732 "fmla v27.4s, v19.4s, v3.s[1]\n"
1733 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1734 "fmla v28.4s, v19.4s, v6.s[1]\n"
1735 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1736 "fmla v29.4s, v19.4s, v9.s[1]\n"
1737 "fmla v30.4s, v19.4s, v12.s[1]\n"
1738 "fmla v31.4s, v19.4s, v15.s[1]\n"
1739 "ldr q19, [%[b_ptr0], #0x10]\n"
1740 "fmla v26.4s, v20.4s, v0.s[2]\n"
1741 "fmla v27.4s, v20.4s, v3.s[2]\n"
1742 "fmla v28.4s, v20.4s, v6.s[2]\n"
1743 "fmla v29.4s, v20.4s, v9.s[2]\n"
1744 "fmla v30.4s, v20.4s, v12.s[2]\n"
1745 "fmla v31.4s, v20.4s, v15.s[2]\n"
1746 "ldr q20, [%[b_ptr0], #0x20]\n"
1747 "fmla v26.4s, v21.4s, v0.s[3]\n"
1748 "fmla v27.4s, v21.4s, v3.s[3]\n"
1749 "fmla v28.4s, v21.4s, v6.s[3]\n"
1750 "fmla v29.4s, v21.4s, v9.s[3]\n"
1751 "fmla v30.4s, v21.4s, v12.s[3]\n"
1752 "fmla v31.4s, v21.4s, v15.s[3]\n"
1753 "ldr q21, [%[b_ptr0], #0x30]\n"
1754 "fmla v26.4s, v22.4s, v1.s[0]\n"
1755 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1756 "fmla v27.4s, v22.4s, v4.s[0]\n"
1757 "fmla v28.4s, v22.4s, v7.s[0]\n"
1758 "fmla v29.4s, v22.4s, v10.s[0]\n"
1759 "fmla v30.4s, v22.4s, v13.s[0]\n"
1760 "fmla v31.4s, v22.4s, v16.s[0]\n"
1761 "fmla v26.4s, v23.4s, v1.s[1]\n"
1762 "fmla v27.4s, v23.4s, v4.s[1]\n"
1763 "fmla v28.4s, v23.4s, v7.s[1]\n"
1764 "fmla v29.4s, v23.4s, v10.s[1]\n"
1765 "fmla v30.4s, v23.4s, v13.s[1]\n"
1766 "fmla v31.4s, v23.4s, v16.s[1]\n"
1767 "fmla v26.4s, v24.4s, v1.s[2]\n"
1768 "fmla v27.4s, v24.4s, v4.s[2]\n"
1769 "fmla v28.4s, v24.4s, v7.s[2]\n"
1770 "fmla v29.4s, v24.4s, v10.s[2]\n"
1771 "fmla v30.4s, v24.4s, v13.s[2]\n"
1772 "fmla v31.4s, v24.4s, v16.s[2]\n"
1773 "fmla v26.4s, v25.4s, v1.s[3]\n"
1774 "fmla v27.4s, v25.4s, v4.s[3]\n"
1775 "fmla v28.4s, v25.4s, v7.s[3]\n"
1776 "fmla v29.4s, v25.4s, v10.s[3]\n"
1777 "fmla v30.4s, v25.4s, v13.s[3]\n"
1778 "fmla v31.4s, v25.4s, v16.s[3]\n"
1779 "fmla v26.4s, v18.4s, v2.s[0]\n"
1780 "fmla v27.4s, v18.4s, v5.s[0]\n"
1781 "fmla v28.4s, v18.4s, v8.s[0]\n"
1782 "fmla v29.4s, v18.4s, v11.s[0]\n"
1783 "fmla v30.4s, v18.4s, v14.s[0]\n"
1784 "fmla v31.4s, v18.4s, v17.s[0]\n"
1785 "fmla v26.4s, v19.4s, v2.s[1]\n"
1786 "fmla v27.4s, v19.4s, v5.s[1]\n"
1787 "fmla v28.4s, v19.4s, v8.s[1]\n"
1788 "fmla v29.4s, v19.4s, v11.s[1]\n"
1789 "fmla v30.4s, v19.4s, v14.s[1]\n"
1790 "fmla v31.4s, v19.4s, v17.s[1]\n"
1791 "fmla v26.4s, v20.4s, v2.s[2]\n"
1792 "fmla v27.4s, v20.4s, v5.s[2]\n"
1793 "fmla v28.4s, v20.4s, v8.s[2]\n"
1794 "fmla v29.4s, v20.4s, v11.s[2]\n"
1795 "fmla v30.4s, v20.4s, v14.s[2]\n"
1796 "fmla v31.4s, v20.4s, v17.s[2]\n"
1797 "fmla v26.4s, v21.4s, v2.s[3]\n"
1798 "fmla v27.4s, v21.4s, v5.s[3]\n"
1799 "fmla v28.4s, v21.4s, v8.s[3]\n"
1800 "fmla v29.4s, v21.4s, v11.s[3]\n"
1801 "fmla v30.4s, v21.4s, v14.s[3]\n"
1802 "fmla v31.4s, v21.4s, v17.s[3]\n"
1803 "b.ne 4b\n"
1804 "3:\n"
1805 "ld1r {v24.4s}, [%[minptr]]\n"
1806 "ld1r {v25.4s}, [%[maxptr]]\n"
1807 "ldr q18, [%[b_ptr0]]\n"
1808 "ldr q19, [%[b_ptr0], #0x10]\n"
1809 "fmax v26.4s, v26.4s, v24.4s\n"
1810 "ldr q20, [%[b_ptr0], #0x20]\n"
1811 "fmax v27.4s, v27.4s, v24.4s\n"
1812 "ldr q21, [%[b_ptr0], #0x30]\n"
1813 "fmax v28.4s, v28.4s, v24.4s\n"
1814 "ldr q22, [%[b_ptr0], #0x40]\n"
1815 "fmax v29.4s, v29.4s, v24.4s\n"
1816 "ldr q23, [%[b_ptr0], #0x50]\n"
1817 "fmin v26.4s, v26.4s, v25.4s\n"
1818 "fmin v27.4s, v27.4s, v25.4s\n"
1819 "fmin v28.4s, v28.4s, v25.4s\n"
1820 "fmin v29.4s, v29.4s, v25.4s\n"
1821 "str q26, [%[c_ptr0]]\n"
1822 "fmax v30.4s, v30.4s, v24.4s\n"
1823 "ldr q26, [%[biasptr]]\n"
1824 "fmax v31.4s, v31.4s, v24.4s\n"
1825 "ldr q24, [%[b_ptr0], #0x60]\n"
1826 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1827 "str q27, [c_ptr1]\n"
1828 "add c_ptr1, c_ptr1, #0x10\n"
1829 "fmin v30.4s, v30.4s, v25.4s\n"
1830 "add %[biasptr], %[biasptr], %[biasinc]\n"
1831 "fmin v31.4s, v31.4s, v25.4s\n"
1832 "str q28, [c_ptr2]\n"
1833 "mov v27.16b, v26.16b\n"
1834 "ldr q25, [%[b_ptr0], #0x70]\n"
1835 "mov v28.16b, v26.16b\n"
1836 "add c_ptr2, c_ptr2, #0x10\n"
1837 "str q29, [c_ptr3]\n"
1838 "add c_ptr3, c_ptr3, #0x10\n"
1839 "mov v29.16b, v26.16b\n"
1840 "add %[b_ptr0], %[b_ptr0], #0x80\n"
1841 "fmla v27.4s, v18.4s, v3.s[0]\n"
1842 "str q30, [c_ptr4]\n"
1843 "mov v30.16b, v26.16b\n"
1844 "add c_ptr4, c_ptr4, #0x10\n"
1845 "fmla v28.4s, v18.4s, v6.s[0]\n"
1846 "str q31, [c_ptr5]\n"
1847 "mov v31.16b, v26.16b\n"
1848 "add c_ptr5, c_ptr5, #0x10\n"
1849 "fmla v26.4s, v18.4s, v0.s[0]\n"
1850 "fmla v29.4s, v18.4s, v9.s[0]\n"
1851 "fmla v30.4s, v18.4s, v12.s[0]\n"
1852 "fmla v31.4s, v18.4s, v15.s[0]\n"
1853 "ldr q18, [%[b_ptr0]]\n"
1854 "fmla v26.4s, v19.4s, v0.s[1]\n"
1855 "fmla v27.4s, v19.4s, v3.s[1]\n"
1856 "fmla v28.4s, v19.4s, v6.s[1]\n"
1857 "fmla v29.4s, v19.4s, v9.s[1]\n"
1858 "fmla v30.4s, v19.4s, v12.s[1]\n"
1859 "fmla v31.4s, v19.4s, v15.s[1]\n"
1860 "ldr q19, [%[b_ptr0], #0x10]\n"
1861 "fmla v26.4s, v20.4s, v0.s[2]\n"
1862 "fmla v27.4s, v20.4s, v3.s[2]\n"
1863 "fmla v28.4s, v20.4s, v6.s[2]\n"
1864 "fmla v29.4s, v20.4s, v9.s[2]\n"
1865 "fmla v30.4s, v20.4s, v12.s[2]\n"
1866 "fmla v31.4s, v20.4s, v15.s[2]\n"
1867 "ldr q20, [%[b_ptr0], #0x20]\n"
1868 "fmla v26.4s, v21.4s, v0.s[3]\n"
1869 "fmla v27.4s, v21.4s, v3.s[3]\n"
1870 "fmla v28.4s, v21.4s, v6.s[3]\n"
1871 "fmla v29.4s, v21.4s, v9.s[3]\n"
1872 "fmla v30.4s, v21.4s, v12.s[3]\n"
1873 "fmla v31.4s, v21.4s, v15.s[3]\n"
1874 "ldr q21, [%[b_ptr0], #0x30]\n"
1875 "fmla v26.4s, v22.4s, v1.s[0]\n"
1876 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1877 "fmla v27.4s, v22.4s, v4.s[0]\n"
1878 "fmla v28.4s, v22.4s, v7.s[0]\n"
1879 "fmla v29.4s, v22.4s, v10.s[0]\n"
1880 "fmla v30.4s, v22.4s, v13.s[0]\n"
1881 "fmla v31.4s, v22.4s, v16.s[0]\n"
1882 "fmla v26.4s, v23.4s, v1.s[1]\n"
1883 "fmla v27.4s, v23.4s, v4.s[1]\n"
1884 "fmla v28.4s, v23.4s, v7.s[1]\n"
1885 "fmla v29.4s, v23.4s, v10.s[1]\n"
1886 "fmla v30.4s, v23.4s, v13.s[1]\n"
1887 "fmla v31.4s, v23.4s, v16.s[1]\n"
1888 "fmla v26.4s, v24.4s, v1.s[2]\n"
1889 "fmla v27.4s, v24.4s, v4.s[2]\n"
1890 "fmla v28.4s, v24.4s, v7.s[2]\n"
1891 "fmla v29.4s, v24.4s, v10.s[2]\n"
1892 "fmla v30.4s, v24.4s, v13.s[2]\n"
1893 "fmla v31.4s, v24.4s, v16.s[2]\n"
1894 "fmla v26.4s, v25.4s, v1.s[3]\n"
1895 "fmla v27.4s, v25.4s, v4.s[3]\n"
1896 "fmla v28.4s, v25.4s, v7.s[3]\n"
1897 "fmla v29.4s, v25.4s, v10.s[3]\n"
1898 "fmla v30.4s, v25.4s, v13.s[3]\n"
1899 "fmla v31.4s, v25.4s, v16.s[3]\n"
1900 "fmla v26.4s, v18.4s, v2.s[0]\n"
1901 "fmla v27.4s, v18.4s, v5.s[0]\n"
1902 "fmla v28.4s, v18.4s, v8.s[0]\n"
1903 "fmla v29.4s, v18.4s, v11.s[0]\n"
1904 "fmla v30.4s, v18.4s, v14.s[0]\n"
1905 "fmla v31.4s, v18.4s, v17.s[0]\n"
1906 "fmla v26.4s, v19.4s, v2.s[1]\n"
1907 "fmla v27.4s, v19.4s, v5.s[1]\n"
1908 "fmla v28.4s, v19.4s, v8.s[1]\n"
1909 "fmla v29.4s, v19.4s, v11.s[1]\n"
1910 "fmla v30.4s, v19.4s, v14.s[1]\n"
1911 "fmla v31.4s, v19.4s, v17.s[1]\n"
1912 "fmla v26.4s, v20.4s, v2.s[2]\n"
1913 "fmla v27.4s, v20.4s, v5.s[2]\n"
1914 "fmla v28.4s, v20.4s, v8.s[2]\n"
1915 "fmla v29.4s, v20.4s, v11.s[2]\n"
1916 "fmla v30.4s, v20.4s, v14.s[2]\n"
1917 "fmla v31.4s, v20.4s, v17.s[2]\n"
1918 "fmla v26.4s, v21.4s, v2.s[3]\n"
1919 "fmla v27.4s, v21.4s, v5.s[3]\n"
1920 "fmla v28.4s, v21.4s, v8.s[3]\n"
1921 "fmla v29.4s, v21.4s, v11.s[3]\n"
1922 "fmla v30.4s, v21.4s, v14.s[3]\n"
1923 "fmla v31.4s, v21.4s, v17.s[3]\n"
1924 "b 5f\n"
1925 "2:\n"
1926 "ldr q26, [%[biasptr]]\n"
1927 "add %[biasptr], %[biasptr], %[biasinc]\n"
1928 "mov v27.16b, v26.16b\n"
1929 "mov v28.16b, v26.16b\n"
1930 "mov v29.16b, v26.16b\n"
1931 "mov v30.16b, v26.16b\n"
1932 "mov v31.16b, v26.16b\n"
1933 "fmla v26.4s, v18.4s, v0.s[0]\n"
1934 "fmla v27.4s, v18.4s, v3.s[0]\n"
1935 "fmla v28.4s, v18.4s, v6.s[0]\n"
1936 "fmla v29.4s, v18.4s, v9.s[0]\n"
1937 "fmla v30.4s, v18.4s, v12.s[0]\n"
1938 "fmla v31.4s, v18.4s, v15.s[0]\n"
1939 "ldr q18, [%[b_ptr0]]\n"
1940 "fmla v26.4s, v19.4s, v0.s[1]\n"
1941 "fmla v27.4s, v19.4s, v3.s[1]\n"
1942 "fmla v28.4s, v19.4s, v6.s[1]\n"
1943 "fmla v29.4s, v19.4s, v9.s[1]\n"
1944 "fmla v30.4s, v19.4s, v12.s[1]\n"
1945 "fmla v31.4s, v19.4s, v15.s[1]\n"
1946 "ldr q19, [%[b_ptr0], #0x10]\n"
1947 "fmla v26.4s, v20.4s, v0.s[2]\n"
1948 "fmla v27.4s, v20.4s, v3.s[2]\n"
1949 "fmla v28.4s, v20.4s, v6.s[2]\n"
1950 "fmla v29.4s, v20.4s, v9.s[2]\n"
1951 "fmla v30.4s, v20.4s, v12.s[2]\n"
1952 "fmla v31.4s, v20.4s, v15.s[2]\n"
1953 "ldr q20, [%[b_ptr0], #0x20]\n"
1954 "fmla v26.4s, v21.4s, v0.s[3]\n"
1955 "fmla v27.4s, v21.4s, v3.s[3]\n"
1956 "fmla v28.4s, v21.4s, v6.s[3]\n"
1957 "fmla v29.4s, v21.4s, v9.s[3]\n"
1958 "fmla v30.4s, v21.4s, v12.s[3]\n"
1959 "fmla v31.4s, v21.4s, v15.s[3]\n"
1960 "ldr q21, [%[b_ptr0], #0x30]\n"
1961 "fmla v26.4s, v22.4s, v1.s[0]\n"
1962 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1963 "fmla v27.4s, v22.4s, v4.s[0]\n"
1964 "fmla v28.4s, v22.4s, v7.s[0]\n"
1965 "fmla v29.4s, v22.4s, v10.s[0]\n"
1966 "fmla v30.4s, v22.4s, v13.s[0]\n"
1967 "fmla v31.4s, v22.4s, v16.s[0]\n"
1968 "fmla v26.4s, v23.4s, v1.s[1]\n"
1969 "fmla v27.4s, v23.4s, v4.s[1]\n"
1970 "fmla v28.4s, v23.4s, v7.s[1]\n"
1971 "fmla v29.4s, v23.4s, v10.s[1]\n"
1972 "fmla v30.4s, v23.4s, v13.s[1]\n"
1973 "fmla v31.4s, v23.4s, v16.s[1]\n"
1974 "fmla v26.4s, v24.4s, v1.s[2]\n"
1975 "fmla v27.4s, v24.4s, v4.s[2]\n"
1976 "fmla v28.4s, v24.4s, v7.s[2]\n"
1977 "fmla v29.4s, v24.4s, v10.s[2]\n"
1978 "fmla v30.4s, v24.4s, v13.s[2]\n"
1979 "fmla v31.4s, v24.4s, v16.s[2]\n"
1980 "fmla v26.4s, v25.4s, v1.s[3]\n"
1981 "fmla v27.4s, v25.4s, v4.s[3]\n"
1982 "fmla v28.4s, v25.4s, v7.s[3]\n"
1983 "fmla v29.4s, v25.4s, v10.s[3]\n"
1984 "fmla v30.4s, v25.4s, v13.s[3]\n"
1985 "fmla v31.4s, v25.4s, v16.s[3]\n"
1986 "fmla v26.4s, v18.4s, v2.s[0]\n"
1987 "fmla v27.4s, v18.4s, v5.s[0]\n"
1988 "fmla v28.4s, v18.4s, v8.s[0]\n"
1989 "fmla v29.4s, v18.4s, v11.s[0]\n"
1990 "fmla v30.4s, v18.4s, v14.s[0]\n"
1991 "fmla v31.4s, v18.4s, v17.s[0]\n"
1992 "fmla v26.4s, v19.4s, v2.s[1]\n"
1993 "fmla v27.4s, v19.4s, v5.s[1]\n"
1994 "fmla v28.4s, v19.4s, v8.s[1]\n"
1995 "fmla v29.4s, v19.4s, v11.s[1]\n"
1996 "fmla v30.4s, v19.4s, v14.s[1]\n"
1997 "fmla v31.4s, v19.4s, v17.s[1]\n"
1998 "fmla v26.4s, v20.4s, v2.s[2]\n"
1999 "fmla v27.4s, v20.4s, v5.s[2]\n"
2000 "fmla v28.4s, v20.4s, v8.s[2]\n"
2001 "fmla v29.4s, v20.4s, v11.s[2]\n"
2002 "fmla v30.4s, v20.4s, v14.s[2]\n"
2003 "fmla v31.4s, v20.4s, v17.s[2]\n"
2004 "fmla v26.4s, v21.4s, v2.s[3]\n"
2005 "fmla v27.4s, v21.4s, v5.s[3]\n"
2006 "fmla v28.4s, v21.4s, v8.s[3]\n"
2007 "fmla v29.4s, v21.4s, v11.s[3]\n"
2008 "fmla v30.4s, v21.4s, v14.s[3]\n"
2009 "fmla v31.4s, v21.4s, v17.s[3]\n"
2010 "5:\n"
2011 "ld1r {v24.4s}, [%[minptr]]\n"
2012 "ld1r {v25.4s}, [%[maxptr]]\n"
2013 "fmax v26.4s, v26.4s, v24.4s\n"
2014 "fmax v27.4s, v27.4s, v24.4s\n"
2015 "fmax v28.4s, v28.4s, v24.4s\n"
2016 "fmax v29.4s, v29.4s, v24.4s\n"
2017 "fmin v26.4s, v26.4s, v25.4s\n"
2018 "fmin v27.4s, v27.4s, v25.4s\n"
2019 "fmin v28.4s, v28.4s, v25.4s\n"
2020 "fmin v29.4s, v29.4s, v25.4s\n"
2021 "str q26, [%[c_ptr0]]\n"
2022 "fmax v30.4s, v30.4s, v24.4s\n"
2023 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2024 "fmax v31.4s, v31.4s, v24.4s\n"
2025 "str q27, [c_ptr1]\n"
2026 "fmin v30.4s, v30.4s, v25.4s\n"
2027 "fmin v31.4s, v31.4s, v25.4s\n"
2028 "str q28, [c_ptr2]\n"
2029 "str q29, [c_ptr3]\n"
2030 "str q30, [c_ptr4]\n"
2031 "str q31, [c_ptr5]\n"
2032 ".unreq a_ptr1\n"
2033 ".unreq a_ptr2\n"
2034 ".unreq a_ptr3\n"
2035 ".unreq a_ptr4\n"
2036 ".unreq a_ptr5\n"
2037 ".unreq c_ptr1\n"
2038 ".unreq c_ptr2\n"
2039 ".unreq c_ptr3\n"
2040 ".unreq c_ptr4\n"
2041 ".unreq c_ptr5\n"
2042 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
2043 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
2044 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
2045 );
2046 break;
2047 case 13:
2048 __asm __volatile (
2049 "a_ptr1 .req X0\n"
2050 "a_ptr2 .req X1\n"
2051 "a_ptr3 .req X2\n"
2052 "a_ptr4 .req X3\n"
2053 "a_ptr5 .req X4\n"
2054 "c_ptr1 .req X5\n"
2055 "c_ptr2 .req X6\n"
2056 "c_ptr3 .req X7\n"
2057 "c_ptr4 .req X8\n"
2058 "c_ptr5 .req X9\n"
2059 "add a_ptr1, %[a_ptr0], %[lda]\n"
2060 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2061 "add a_ptr2, a_ptr1, %[lda]\n"
2062 "add c_ptr2, c_ptr1, %[ldc]\n"
2063 "add a_ptr3, a_ptr2, %[lda]\n"
2064 "add c_ptr3, c_ptr2, %[ldc]\n"
2065 "add a_ptr4, a_ptr3, %[lda]\n"
2066 "add c_ptr4, c_ptr3, %[ldc]\n"
2067 "add a_ptr5, a_ptr4, %[lda]\n"
2068 "add c_ptr5, c_ptr4, %[ldc]\n"
2069 "cbz %[oob_rows], 1f\n"
2070 "subs %[oob_rows], %[oob_rows], #0x1\n"
2071 "add c_ptr5, %[c_ptr0], #0x0\n"
2072 "add a_ptr5, %[a_ptr0], #0x0\n"
2073 "b.eq 1f\n"
2074 "subs %[oob_rows], %[oob_rows], #0x1\n"
2075 "add c_ptr4, %[c_ptr0], #0x0\n"
2076 "add a_ptr4, %[a_ptr0], #0x0\n"
2077 "b.eq 1f\n"
2078 "subs %[oob_rows], %[oob_rows], #0x1\n"
2079 "add c_ptr3, %[c_ptr0], #0x0\n"
2080 "add a_ptr3, %[a_ptr0], #0x0\n"
2081 "b.eq 1f\n"
2082 "subs %[oob_rows], %[oob_rows], #0x1\n"
2083 "add c_ptr2, %[c_ptr0], #0x0\n"
2084 "add a_ptr2, %[a_ptr0], #0x0\n"
2085 "b.eq 1f\n"
2086 "subs %[oob_rows], %[oob_rows], #0x1\n"
2087 "add c_ptr1, %[c_ptr0], #0x0\n"
2088 "add a_ptr1, %[a_ptr0], #0x0\n"
2089 "1:\n"
2090 "ldr q0, [%[a_ptr0]], #0x10\n"
2091 "ldr q4, [a_ptr1], #0x10\n"
2092 "ldr q8, [a_ptr2], #0x10\n"
2093 "ldr q12, [a_ptr3], #0x10\n"
2094 "ldr q16, [a_ptr4], #0x10\n"
2095 "ldr q20, [a_ptr5], #0x10\n"
2096 "ldr q1, [%[a_ptr0]], #0x10\n"
2097 "ldr q5, [a_ptr1], #0x10\n"
2098 "ldr q9, [a_ptr2], #0x10\n"
2099 "ldr q13, [a_ptr3], #0x10\n"
2100 "ldr q17, [a_ptr4], #0x10\n"
2101 "ldr q21, [a_ptr5], #0x10\n"
2102 "ldr q2, [%[a_ptr0]], #0x10\n"
2103 "ldr q6, [a_ptr1], #0x10\n"
2104 "ldr q10, [a_ptr2], #0x10\n"
2105 "ldr q14, [a_ptr3], #0x10\n"
2106 "ldr s3, [%[a_ptr0]]\n"
2107 "ldr q18, [a_ptr4], #0x10\n"
2108 "ldr s7, [a_ptr1]\n"
2109 "ldr q22, [a_ptr5], #0x10\n"
2110 "ldr s11, [a_ptr2]\n"
2111 "ldr q24, [%[b_ptr0]]\n"
2112 "ldr s15, [a_ptr3]\n"
2113 "ldr q25, [%[b_ptr0], #0x10]\n"
2114 "ldr s19, [a_ptr4]\n"
2115 "ldr s23, [a_ptr5]\n"
2116 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2117 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2118 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2119 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2120 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2121 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2122 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2123 "cbz %[loops], 2f\n"
2124 "ldr q26, [%[biasptr]]\n"
2125 "add %[biasptr], %[biasptr], %[biasinc]\n"
2126 "subs %[loops], %[loops], #0x1\n"
2127 "mov v27.16b, v26.16b\n"
2128 "mov v28.16b, v26.16b\n"
2129 "mov v29.16b, v26.16b\n"
2130 "mov v30.16b, v26.16b\n"
2131 "mov v31.16b, v26.16b\n"
2132 "fmla v26.4s, v24.4s, v0.s[0]\n"
2133 "fmla v27.4s, v24.4s, v4.s[0]\n"
2134 "fmla v28.4s, v24.4s, v8.s[0]\n"
2135 "fmla v29.4s, v24.4s, v12.s[0]\n"
2136 "fmla v30.4s, v24.4s, v16.s[0]\n"
2137 "fmla v31.4s, v24.4s, v20.s[0]\n"
2138 "ldr q24, [%[b_ptr0]]\n"
2139 "fmla v26.4s, v25.4s, v0.s[1]\n"
2140 "fmla v27.4s, v25.4s, v4.s[1]\n"
2141 "fmla v28.4s, v25.4s, v8.s[1]\n"
2142 "fmla v29.4s, v25.4s, v12.s[1]\n"
2143 "fmla v30.4s, v25.4s, v16.s[1]\n"
2144 "fmla v31.4s, v25.4s, v20.s[1]\n"
2145 "ldr q25, [%[b_ptr0], #0x10]\n"
2146 "fmla v26.4s, v24.4s, v0.s[2]\n"
2147 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2148 "fmla v27.4s, v24.4s, v4.s[2]\n"
2149 "fmla v28.4s, v24.4s, v8.s[2]\n"
2150 "fmla v29.4s, v24.4s, v12.s[2]\n"
2151 "fmla v30.4s, v24.4s, v16.s[2]\n"
2152 "fmla v31.4s, v24.4s, v20.s[2]\n"
2153 "ldr q24, [%[b_ptr0]]\n"
2154 "fmla v26.4s, v25.4s, v0.s[3]\n"
2155 "fmla v27.4s, v25.4s, v4.s[3]\n"
2156 "fmla v28.4s, v25.4s, v8.s[3]\n"
2157 "fmla v29.4s, v25.4s, v12.s[3]\n"
2158 "fmla v30.4s, v25.4s, v16.s[3]\n"
2159 "fmla v31.4s, v25.4s, v20.s[3]\n"
2160 "ldr q25, [%[b_ptr0], #0x10]\n"
2161 "fmla v26.4s, v24.4s, v1.s[0]\n"
2162 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2163 "fmla v27.4s, v24.4s, v5.s[0]\n"
2164 "fmla v28.4s, v24.4s, v9.s[0]\n"
2165 "fmla v29.4s, v24.4s, v13.s[0]\n"
2166 "fmla v30.4s, v24.4s, v17.s[0]\n"
2167 "fmla v31.4s, v24.4s, v21.s[0]\n"
2168 "ldr q24, [%[b_ptr0]]\n"
2169 "fmla v26.4s, v25.4s, v1.s[1]\n"
2170 "fmla v27.4s, v25.4s, v5.s[1]\n"
2171 "fmla v28.4s, v25.4s, v9.s[1]\n"
2172 "fmla v29.4s, v25.4s, v13.s[1]\n"
2173 "fmla v30.4s, v25.4s, v17.s[1]\n"
2174 "fmla v31.4s, v25.4s, v21.s[1]\n"
2175 "ldr q25, [%[b_ptr0], #0x10]\n"
2176 "fmla v26.4s, v24.4s, v1.s[2]\n"
2177 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2178 "fmla v27.4s, v24.4s, v5.s[2]\n"
2179 "fmla v28.4s, v24.4s, v9.s[2]\n"
2180 "fmla v29.4s, v24.4s, v13.s[2]\n"
2181 "fmla v30.4s, v24.4s, v17.s[2]\n"
2182 "fmla v31.4s, v24.4s, v21.s[2]\n"
2183 "ldr q24, [%[b_ptr0]]\n"
2184 "fmla v26.4s, v25.4s, v1.s[3]\n"
2185 "fmla v27.4s, v25.4s, v5.s[3]\n"
2186 "fmla v28.4s, v25.4s, v9.s[3]\n"
2187 "fmla v29.4s, v25.4s, v13.s[3]\n"
2188 "fmla v30.4s, v25.4s, v17.s[3]\n"
2189 "fmla v31.4s, v25.4s, v21.s[3]\n"
2190 "ldr q25, [%[b_ptr0], #0x10]\n"
2191 "fmla v26.4s, v24.4s, v2.s[0]\n"
2192 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2193 "fmla v27.4s, v24.4s, v6.s[0]\n"
2194 "fmla v28.4s, v24.4s, v10.s[0]\n"
2195 "fmla v29.4s, v24.4s, v14.s[0]\n"
2196 "fmla v30.4s, v24.4s, v18.s[0]\n"
2197 "fmla v31.4s, v24.4s, v22.s[0]\n"
2198 "ldr q24, [%[b_ptr0]]\n"
2199 "fmla v26.4s, v25.4s, v2.s[1]\n"
2200 "fmla v27.4s, v25.4s, v6.s[1]\n"
2201 "fmla v28.4s, v25.4s, v10.s[1]\n"
2202 "fmla v29.4s, v25.4s, v14.s[1]\n"
2203 "fmla v30.4s, v25.4s, v18.s[1]\n"
2204 "fmla v31.4s, v25.4s, v22.s[1]\n"
2205 "ldr q25, [%[b_ptr0], #0x10]\n"
2206 "fmla v26.4s, v24.4s, v2.s[2]\n"
2207 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2208 "fmla v27.4s, v24.4s, v6.s[2]\n"
2209 "fmla v28.4s, v24.4s, v10.s[2]\n"
2210 "fmla v29.4s, v24.4s, v14.s[2]\n"
2211 "fmla v30.4s, v24.4s, v18.s[2]\n"
2212 "fmla v31.4s, v24.4s, v22.s[2]\n"
2213 "ldr q24, [%[b_ptr0]]\n"
2214 "fmla v26.4s, v25.4s, v2.s[3]\n"
2215 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2216 "fmla v27.4s, v25.4s, v6.s[3]\n"
2217 "fmla v28.4s, v25.4s, v10.s[3]\n"
2218 "fmla v29.4s, v25.4s, v14.s[3]\n"
2219 "fmla v30.4s, v25.4s, v18.s[3]\n"
2220 "fmla v31.4s, v25.4s, v22.s[3]\n"
2221 "fmla v26.4s, v24.4s, v3.s[0]\n"
2222 "fmla v27.4s, v24.4s, v7.s[0]\n"
2223 "fmla v28.4s, v24.4s, v11.s[0]\n"
2224 "fmla v29.4s, v24.4s, v15.s[0]\n"
2225 "fmla v30.4s, v24.4s, v19.s[0]\n"
2226 "fmla v31.4s, v24.4s, v23.s[0]\n"
2227 "b.eq 3f\n"
2228 "4:\n"
2229 "ld1r {v24.4s}, [%[minptr]]\n"
2230 "subs %[loops], %[loops], #0x1\n"
2231 "ld1r {v25.4s}, [%[maxptr]]\n"
2232 "fmax v26.4s, v26.4s, v24.4s\n"
2233 "fmax v27.4s, v27.4s, v24.4s\n"
2234 "fmax v28.4s, v28.4s, v24.4s\n"
2235 "fmax v29.4s, v29.4s, v24.4s\n"
2236 "fmin v26.4s, v26.4s, v25.4s\n"
2237 "fmin v27.4s, v27.4s, v25.4s\n"
2238 "fmin v28.4s, v28.4s, v25.4s\n"
2239 "fmin v29.4s, v29.4s, v25.4s\n"
2240 "str q26, [%[c_ptr0]]\n"
2241 "fmax v30.4s, v30.4s, v24.4s\n"
2242 "ldr q26, [%[biasptr]]\n"
2243 "fmax v31.4s, v31.4s, v24.4s\n"
2244 "ldr q24, [%[b_ptr0]]\n"
2245 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2246 "str q27, [c_ptr1]\n"
2247 "add c_ptr1, c_ptr1, #0x10\n"
2248 "fmin v30.4s, v30.4s, v25.4s\n"
2249 "add %[biasptr], %[biasptr], %[biasinc]\n"
2250 "fmin v31.4s, v31.4s, v25.4s\n"
2251 "str q28, [c_ptr2]\n"
2252 "mov v27.16b, v26.16b\n"
2253 "ldr q25, [%[b_ptr0], #0x10]\n"
2254 "mov v28.16b, v26.16b\n"
2255 "add c_ptr2, c_ptr2, #0x10\n"
2256 "str q29, [c_ptr3]\n"
2257 "add c_ptr3, c_ptr3, #0x10\n"
2258 "mov v29.16b, v26.16b\n"
2259 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2260 "fmla v27.4s, v24.4s, v4.s[0]\n"
2261 "str q30, [c_ptr4]\n"
2262 "mov v30.16b, v26.16b\n"
2263 "add c_ptr4, c_ptr4, #0x10\n"
2264 "fmla v28.4s, v24.4s, v8.s[0]\n"
2265 "str q31, [c_ptr5]\n"
2266 "mov v31.16b, v26.16b\n"
2267 "add c_ptr5, c_ptr5, #0x10\n"
2268 "fmla v26.4s, v24.4s, v0.s[0]\n"
2269 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2270 "fmla v29.4s, v24.4s, v12.s[0]\n"
2271 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2272 "fmla v30.4s, v24.4s, v16.s[0]\n"
2273 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2274 "fmla v31.4s, v24.4s, v20.s[0]\n"
2275 "ldr q24, [%[b_ptr0]]\n"
2276 "fmla v26.4s, v25.4s, v0.s[1]\n"
2277 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2278 "fmla v27.4s, v25.4s, v4.s[1]\n"
2279 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2280 "fmla v28.4s, v25.4s, v8.s[1]\n"
2281 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2282 "fmla v29.4s, v25.4s, v12.s[1]\n"
2283 "fmla v30.4s, v25.4s, v16.s[1]\n"
2284 "fmla v31.4s, v25.4s, v20.s[1]\n"
2285 "ldr q25, [%[b_ptr0], #0x10]\n"
2286 "fmla v26.4s, v24.4s, v0.s[2]\n"
2287 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2288 "fmla v27.4s, v24.4s, v4.s[2]\n"
2289 "fmla v28.4s, v24.4s, v8.s[2]\n"
2290 "fmla v29.4s, v24.4s, v12.s[2]\n"
2291 "fmla v30.4s, v24.4s, v16.s[2]\n"
2292 "fmla v31.4s, v24.4s, v20.s[2]\n"
2293 "ldr q24, [%[b_ptr0]]\n"
2294 "fmla v26.4s, v25.4s, v0.s[3]\n"
2295 "fmla v27.4s, v25.4s, v4.s[3]\n"
2296 "fmla v28.4s, v25.4s, v8.s[3]\n"
2297 "fmla v29.4s, v25.4s, v12.s[3]\n"
2298 "fmla v30.4s, v25.4s, v16.s[3]\n"
2299 "fmla v31.4s, v25.4s, v20.s[3]\n"
2300 "ldr q25, [%[b_ptr0], #0x10]\n"
2301 "fmla v26.4s, v24.4s, v1.s[0]\n"
2302 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2303 "fmla v27.4s, v24.4s, v5.s[0]\n"
2304 "fmla v28.4s, v24.4s, v9.s[0]\n"
2305 "fmla v29.4s, v24.4s, v13.s[0]\n"
2306 "fmla v30.4s, v24.4s, v17.s[0]\n"
2307 "fmla v31.4s, v24.4s, v21.s[0]\n"
2308 "ldr q24, [%[b_ptr0]]\n"
2309 "fmla v26.4s, v25.4s, v1.s[1]\n"
2310 "fmla v27.4s, v25.4s, v5.s[1]\n"
2311 "fmla v28.4s, v25.4s, v9.s[1]\n"
2312 "fmla v29.4s, v25.4s, v13.s[1]\n"
2313 "fmla v30.4s, v25.4s, v17.s[1]\n"
2314 "fmla v31.4s, v25.4s, v21.s[1]\n"
2315 "ldr q25, [%[b_ptr0], #0x10]\n"
2316 "fmla v26.4s, v24.4s, v1.s[2]\n"
2317 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2318 "fmla v27.4s, v24.4s, v5.s[2]\n"
2319 "fmla v28.4s, v24.4s, v9.s[2]\n"
2320 "fmla v29.4s, v24.4s, v13.s[2]\n"
2321 "fmla v30.4s, v24.4s, v17.s[2]\n"
2322 "fmla v31.4s, v24.4s, v21.s[2]\n"
2323 "ldr q24, [%[b_ptr0]]\n"
2324 "fmla v26.4s, v25.4s, v1.s[3]\n"
2325 "fmla v27.4s, v25.4s, v5.s[3]\n"
2326 "fmla v28.4s, v25.4s, v9.s[3]\n"
2327 "fmla v29.4s, v25.4s, v13.s[3]\n"
2328 "fmla v30.4s, v25.4s, v17.s[3]\n"
2329 "fmla v31.4s, v25.4s, v21.s[3]\n"
2330 "ldr q25, [%[b_ptr0], #0x10]\n"
2331 "fmla v26.4s, v24.4s, v2.s[0]\n"
2332 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2333 "fmla v27.4s, v24.4s, v6.s[0]\n"
2334 "fmla v28.4s, v24.4s, v10.s[0]\n"
2335 "fmla v29.4s, v24.4s, v14.s[0]\n"
2336 "fmla v30.4s, v24.4s, v18.s[0]\n"
2337 "fmla v31.4s, v24.4s, v22.s[0]\n"
2338 "ldr q24, [%[b_ptr0]]\n"
2339 "fmla v26.4s, v25.4s, v2.s[1]\n"
2340 "fmla v27.4s, v25.4s, v6.s[1]\n"
2341 "fmla v28.4s, v25.4s, v10.s[1]\n"
2342 "fmla v29.4s, v25.4s, v14.s[1]\n"
2343 "fmla v30.4s, v25.4s, v18.s[1]\n"
2344 "fmla v31.4s, v25.4s, v22.s[1]\n"
2345 "ldr q25, [%[b_ptr0], #0x10]\n"
2346 "fmla v26.4s, v24.4s, v2.s[2]\n"
2347 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2348 "fmla v27.4s, v24.4s, v6.s[2]\n"
2349 "fmla v28.4s, v24.4s, v10.s[2]\n"
2350 "fmla v29.4s, v24.4s, v14.s[2]\n"
2351 "fmla v30.4s, v24.4s, v18.s[2]\n"
2352 "fmla v31.4s, v24.4s, v22.s[2]\n"
2353 "ldr q24, [%[b_ptr0]]\n"
2354 "fmla v26.4s, v25.4s, v2.s[3]\n"
2355 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2356 "fmla v27.4s, v25.4s, v6.s[3]\n"
2357 "fmla v28.4s, v25.4s, v10.s[3]\n"
2358 "fmla v29.4s, v25.4s, v14.s[3]\n"
2359 "fmla v30.4s, v25.4s, v18.s[3]\n"
2360 "fmla v31.4s, v25.4s, v22.s[3]\n"
2361 "fmla v26.4s, v24.4s, v3.s[0]\n"
2362 "fmla v27.4s, v24.4s, v7.s[0]\n"
2363 "fmla v28.4s, v24.4s, v11.s[0]\n"
2364 "fmla v29.4s, v24.4s, v15.s[0]\n"
2365 "fmla v30.4s, v24.4s, v19.s[0]\n"
2366 "fmla v31.4s, v24.4s, v23.s[0]\n"
2367 "b.ne 4b\n"
2368 "3:\n"
2369 "ld1r {v24.4s}, [%[minptr]]\n"
2370 "ld1r {v25.4s}, [%[maxptr]]\n"
2371 "fmax v26.4s, v26.4s, v24.4s\n"
2372 "fmax v27.4s, v27.4s, v24.4s\n"
2373 "fmax v28.4s, v28.4s, v24.4s\n"
2374 "fmax v29.4s, v29.4s, v24.4s\n"
2375 "fmin v26.4s, v26.4s, v25.4s\n"
2376 "fmin v27.4s, v27.4s, v25.4s\n"
2377 "fmin v28.4s, v28.4s, v25.4s\n"
2378 "fmin v29.4s, v29.4s, v25.4s\n"
2379 "str q26, [%[c_ptr0]]\n"
2380 "fmax v30.4s, v30.4s, v24.4s\n"
2381 "ldr q26, [%[biasptr]]\n"
2382 "fmax v31.4s, v31.4s, v24.4s\n"
2383 "ldr q24, [%[b_ptr0]]\n"
2384 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2385 "str q27, [c_ptr1]\n"
2386 "add c_ptr1, c_ptr1, #0x10\n"
2387 "fmin v30.4s, v30.4s, v25.4s\n"
2388 "add %[biasptr], %[biasptr], %[biasinc]\n"
2389 "fmin v31.4s, v31.4s, v25.4s\n"
2390 "str q28, [c_ptr2]\n"
2391 "mov v27.16b, v26.16b\n"
2392 "ldr q25, [%[b_ptr0], #0x10]\n"
2393 "mov v28.16b, v26.16b\n"
2394 "add c_ptr2, c_ptr2, #0x10\n"
2395 "str q29, [c_ptr3]\n"
2396 "add c_ptr3, c_ptr3, #0x10\n"
2397 "mov v29.16b, v26.16b\n"
2398 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2399 "fmla v27.4s, v24.4s, v4.s[0]\n"
2400 "str q30, [c_ptr4]\n"
2401 "mov v30.16b, v26.16b\n"
2402 "add c_ptr4, c_ptr4, #0x10\n"
2403 "fmla v28.4s, v24.4s, v8.s[0]\n"
2404 "str q31, [c_ptr5]\n"
2405 "mov v31.16b, v26.16b\n"
2406 "add c_ptr5, c_ptr5, #0x10\n"
2407 "fmla v26.4s, v24.4s, v0.s[0]\n"
2408 "fmla v29.4s, v24.4s, v12.s[0]\n"
2409 "fmla v30.4s, v24.4s, v16.s[0]\n"
2410 "fmla v31.4s, v24.4s, v20.s[0]\n"
2411 "ldr q24, [%[b_ptr0]]\n"
2412 "fmla v26.4s, v25.4s, v0.s[1]\n"
2413 "fmla v27.4s, v25.4s, v4.s[1]\n"
2414 "fmla v28.4s, v25.4s, v8.s[1]\n"
2415 "fmla v29.4s, v25.4s, v12.s[1]\n"
2416 "fmla v30.4s, v25.4s, v16.s[1]\n"
2417 "fmla v31.4s, v25.4s, v20.s[1]\n"
2418 "ldr q25, [%[b_ptr0], #0x10]\n"
2419 "fmla v26.4s, v24.4s, v0.s[2]\n"
2420 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2421 "fmla v27.4s, v24.4s, v4.s[2]\n"
2422 "fmla v28.4s, v24.4s, v8.s[2]\n"
2423 "fmla v29.4s, v24.4s, v12.s[2]\n"
2424 "fmla v30.4s, v24.4s, v16.s[2]\n"
2425 "fmla v31.4s, v24.4s, v20.s[2]\n"
2426 "ldr q24, [%[b_ptr0]]\n"
2427 "fmla v26.4s, v25.4s, v0.s[3]\n"
2428 "fmla v27.4s, v25.4s, v4.s[3]\n"
2429 "fmla v28.4s, v25.4s, v8.s[3]\n"
2430 "fmla v29.4s, v25.4s, v12.s[3]\n"
2431 "fmla v30.4s, v25.4s, v16.s[3]\n"
2432 "fmla v31.4s, v25.4s, v20.s[3]\n"
2433 "ldr q25, [%[b_ptr0], #0x10]\n"
2434 "fmla v26.4s, v24.4s, v1.s[0]\n"
2435 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2436 "fmla v27.4s, v24.4s, v5.s[0]\n"
2437 "fmla v28.4s, v24.4s, v9.s[0]\n"
2438 "fmla v29.4s, v24.4s, v13.s[0]\n"
2439 "fmla v30.4s, v24.4s, v17.s[0]\n"
2440 "fmla v31.4s, v24.4s, v21.s[0]\n"
2441 "ldr q24, [%[b_ptr0]]\n"
2442 "fmla v26.4s, v25.4s, v1.s[1]\n"
2443 "fmla v27.4s, v25.4s, v5.s[1]\n"
2444 "fmla v28.4s, v25.4s, v9.s[1]\n"
2445 "fmla v29.4s, v25.4s, v13.s[1]\n"
2446 "fmla v30.4s, v25.4s, v17.s[1]\n"
2447 "fmla v31.4s, v25.4s, v21.s[1]\n"
2448 "ldr q25, [%[b_ptr0], #0x10]\n"
2449 "fmla v26.4s, v24.4s, v1.s[2]\n"
2450 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2451 "fmla v27.4s, v24.4s, v5.s[2]\n"
2452 "fmla v28.4s, v24.4s, v9.s[2]\n"
2453 "fmla v29.4s, v24.4s, v13.s[2]\n"
2454 "fmla v30.4s, v24.4s, v17.s[2]\n"
2455 "fmla v31.4s, v24.4s, v21.s[2]\n"
2456 "ldr q24, [%[b_ptr0]]\n"
2457 "fmla v26.4s, v25.4s, v1.s[3]\n"
2458 "fmla v27.4s, v25.4s, v5.s[3]\n"
2459 "fmla v28.4s, v25.4s, v9.s[3]\n"
2460 "fmla v29.4s, v25.4s, v13.s[3]\n"
2461 "fmla v30.4s, v25.4s, v17.s[3]\n"
2462 "fmla v31.4s, v25.4s, v21.s[3]\n"
2463 "ldr q25, [%[b_ptr0], #0x10]\n"
2464 "fmla v26.4s, v24.4s, v2.s[0]\n"
2465 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2466 "fmla v27.4s, v24.4s, v6.s[0]\n"
2467 "fmla v28.4s, v24.4s, v10.s[0]\n"
2468 "fmla v29.4s, v24.4s, v14.s[0]\n"
2469 "fmla v30.4s, v24.4s, v18.s[0]\n"
2470 "fmla v31.4s, v24.4s, v22.s[0]\n"
2471 "ldr q24, [%[b_ptr0]]\n"
2472 "fmla v26.4s, v25.4s, v2.s[1]\n"
2473 "fmla v27.4s, v25.4s, v6.s[1]\n"
2474 "fmla v28.4s, v25.4s, v10.s[1]\n"
2475 "fmla v29.4s, v25.4s, v14.s[1]\n"
2476 "fmla v30.4s, v25.4s, v18.s[1]\n"
2477 "fmla v31.4s, v25.4s, v22.s[1]\n"
2478 "ldr q25, [%[b_ptr0], #0x10]\n"
2479 "fmla v26.4s, v24.4s, v2.s[2]\n"
2480 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2481 "fmla v27.4s, v24.4s, v6.s[2]\n"
2482 "fmla v28.4s, v24.4s, v10.s[2]\n"
2483 "fmla v29.4s, v24.4s, v14.s[2]\n"
2484 "fmla v30.4s, v24.4s, v18.s[2]\n"
2485 "fmla v31.4s, v24.4s, v22.s[2]\n"
2486 "ldr q24, [%[b_ptr0]]\n"
2487 "fmla v26.4s, v25.4s, v2.s[3]\n"
2488 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2489 "fmla v27.4s, v25.4s, v6.s[3]\n"
2490 "fmla v28.4s, v25.4s, v10.s[3]\n"
2491 "fmla v29.4s, v25.4s, v14.s[3]\n"
2492 "fmla v30.4s, v25.4s, v18.s[3]\n"
2493 "fmla v31.4s, v25.4s, v22.s[3]\n"
2494 "fmla v26.4s, v24.4s, v3.s[0]\n"
2495 "fmla v27.4s, v24.4s, v7.s[0]\n"
2496 "fmla v28.4s, v24.4s, v11.s[0]\n"
2497 "fmla v29.4s, v24.4s, v15.s[0]\n"
2498 "fmla v30.4s, v24.4s, v19.s[0]\n"
2499 "fmla v31.4s, v24.4s, v23.s[0]\n"
2500 "b 5f\n"
2501 "2:\n"
2502 "ldr q26, [%[biasptr]]\n"
2503 "add %[biasptr], %[biasptr], %[biasinc]\n"
2504 "mov v27.16b, v26.16b\n"
2505 "mov v28.16b, v26.16b\n"
2506 "mov v29.16b, v26.16b\n"
2507 "mov v30.16b, v26.16b\n"
2508 "mov v31.16b, v26.16b\n"
2509 "fmla v26.4s, v24.4s, v0.s[0]\n"
2510 "fmla v27.4s, v24.4s, v4.s[0]\n"
2511 "fmla v28.4s, v24.4s, v8.s[0]\n"
2512 "fmla v29.4s, v24.4s, v12.s[0]\n"
2513 "fmla v30.4s, v24.4s, v16.s[0]\n"
2514 "fmla v31.4s, v24.4s, v20.s[0]\n"
2515 "ldr q24, [%[b_ptr0]]\n"
2516 "fmla v26.4s, v25.4s, v0.s[1]\n"
2517 "fmla v27.4s, v25.4s, v4.s[1]\n"
2518 "fmla v28.4s, v25.4s, v8.s[1]\n"
2519 "fmla v29.4s, v25.4s, v12.s[1]\n"
2520 "fmla v30.4s, v25.4s, v16.s[1]\n"
2521 "fmla v31.4s, v25.4s, v20.s[1]\n"
2522 "ldr q25, [%[b_ptr0], #0x10]\n"
2523 "fmla v26.4s, v24.4s, v0.s[2]\n"
2524 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2525 "fmla v27.4s, v24.4s, v4.s[2]\n"
2526 "fmla v28.4s, v24.4s, v8.s[2]\n"
2527 "fmla v29.4s, v24.4s, v12.s[2]\n"
2528 "fmla v30.4s, v24.4s, v16.s[2]\n"
2529 "fmla v31.4s, v24.4s, v20.s[2]\n"
2530 "ldr q24, [%[b_ptr0]]\n"
2531 "fmla v26.4s, v25.4s, v0.s[3]\n"
2532 "fmla v27.4s, v25.4s, v4.s[3]\n"
2533 "fmla v28.4s, v25.4s, v8.s[3]\n"
2534 "fmla v29.4s, v25.4s, v12.s[3]\n"
2535 "fmla v30.4s, v25.4s, v16.s[3]\n"
2536 "fmla v31.4s, v25.4s, v20.s[3]\n"
2537 "ldr q25, [%[b_ptr0], #0x10]\n"
2538 "fmla v26.4s, v24.4s, v1.s[0]\n"
2539 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2540 "fmla v27.4s, v24.4s, v5.s[0]\n"
2541 "fmla v28.4s, v24.4s, v9.s[0]\n"
2542 "fmla v29.4s, v24.4s, v13.s[0]\n"
2543 "fmla v30.4s, v24.4s, v17.s[0]\n"
2544 "fmla v31.4s, v24.4s, v21.s[0]\n"
2545 "ldr q24, [%[b_ptr0]]\n"
2546 "fmla v26.4s, v25.4s, v1.s[1]\n"
2547 "fmla v27.4s, v25.4s, v5.s[1]\n"
2548 "fmla v28.4s, v25.4s, v9.s[1]\n"
2549 "fmla v29.4s, v25.4s, v13.s[1]\n"
2550 "fmla v30.4s, v25.4s, v17.s[1]\n"
2551 "fmla v31.4s, v25.4s, v21.s[1]\n"
2552 "ldr q25, [%[b_ptr0], #0x10]\n"
2553 "fmla v26.4s, v24.4s, v1.s[2]\n"
2554 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2555 "fmla v27.4s, v24.4s, v5.s[2]\n"
2556 "fmla v28.4s, v24.4s, v9.s[2]\n"
2557 "fmla v29.4s, v24.4s, v13.s[2]\n"
2558 "fmla v30.4s, v24.4s, v17.s[2]\n"
2559 "fmla v31.4s, v24.4s, v21.s[2]\n"
2560 "ldr q24, [%[b_ptr0]]\n"
2561 "fmla v26.4s, v25.4s, v1.s[3]\n"
2562 "fmla v27.4s, v25.4s, v5.s[3]\n"
2563 "fmla v28.4s, v25.4s, v9.s[3]\n"
2564 "fmla v29.4s, v25.4s, v13.s[3]\n"
2565 "fmla v30.4s, v25.4s, v17.s[3]\n"
2566 "fmla v31.4s, v25.4s, v21.s[3]\n"
2567 "ldr q25, [%[b_ptr0], #0x10]\n"
2568 "fmla v26.4s, v24.4s, v2.s[0]\n"
2569 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2570 "fmla v27.4s, v24.4s, v6.s[0]\n"
2571 "fmla v28.4s, v24.4s, v10.s[0]\n"
2572 "fmla v29.4s, v24.4s, v14.s[0]\n"
2573 "fmla v30.4s, v24.4s, v18.s[0]\n"
2574 "fmla v31.4s, v24.4s, v22.s[0]\n"
2575 "ldr q24, [%[b_ptr0]]\n"
2576 "fmla v26.4s, v25.4s, v2.s[1]\n"
2577 "fmla v27.4s, v25.4s, v6.s[1]\n"
2578 "fmla v28.4s, v25.4s, v10.s[1]\n"
2579 "fmla v29.4s, v25.4s, v14.s[1]\n"
2580 "fmla v30.4s, v25.4s, v18.s[1]\n"
2581 "fmla v31.4s, v25.4s, v22.s[1]\n"
2582 "ldr q25, [%[b_ptr0], #0x10]\n"
2583 "fmla v26.4s, v24.4s, v2.s[2]\n"
2584 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2585 "fmla v27.4s, v24.4s, v6.s[2]\n"
2586 "fmla v28.4s, v24.4s, v10.s[2]\n"
2587 "fmla v29.4s, v24.4s, v14.s[2]\n"
2588 "fmla v30.4s, v24.4s, v18.s[2]\n"
2589 "fmla v31.4s, v24.4s, v22.s[2]\n"
2590 "ldr q24, [%[b_ptr0]]\n"
2591 "fmla v26.4s, v25.4s, v2.s[3]\n"
2592 "add %[b_ptr0], %[b_ptr0], #0x10\n"
2593 "fmla v27.4s, v25.4s, v6.s[3]\n"
2594 "fmla v28.4s, v25.4s, v10.s[3]\n"
2595 "fmla v29.4s, v25.4s, v14.s[3]\n"
2596 "fmla v30.4s, v25.4s, v18.s[3]\n"
2597 "fmla v31.4s, v25.4s, v22.s[3]\n"
2598 "fmla v26.4s, v24.4s, v3.s[0]\n"
2599 "fmla v27.4s, v24.4s, v7.s[0]\n"
2600 "fmla v28.4s, v24.4s, v11.s[0]\n"
2601 "fmla v29.4s, v24.4s, v15.s[0]\n"
2602 "fmla v30.4s, v24.4s, v19.s[0]\n"
2603 "fmla v31.4s, v24.4s, v23.s[0]\n"
2604 "5:\n"
2605 "ld1r {v24.4s}, [%[minptr]]\n"
2606 "ld1r {v25.4s}, [%[maxptr]]\n"
2607 "fmax v26.4s, v26.4s, v24.4s\n"
2608 "fmax v27.4s, v27.4s, v24.4s\n"
2609 "fmax v28.4s, v28.4s, v24.4s\n"
2610 "fmax v29.4s, v29.4s, v24.4s\n"
2611 "fmin v26.4s, v26.4s, v25.4s\n"
2612 "fmin v27.4s, v27.4s, v25.4s\n"
2613 "fmin v28.4s, v28.4s, v25.4s\n"
2614 "fmin v29.4s, v29.4s, v25.4s\n"
2615 "str q26, [%[c_ptr0]]\n"
2616 "fmax v30.4s, v30.4s, v24.4s\n"
2617 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2618 "fmax v31.4s, v31.4s, v24.4s\n"
2619 "str q27, [c_ptr1]\n"
2620 "fmin v30.4s, v30.4s, v25.4s\n"
2621 "fmin v31.4s, v31.4s, v25.4s\n"
2622 "str q28, [c_ptr2]\n"
2623 "str q29, [c_ptr3]\n"
2624 "str q30, [c_ptr4]\n"
2625 "str q31, [c_ptr5]\n"
2626 ".unreq a_ptr1\n"
2627 ".unreq a_ptr2\n"
2628 ".unreq a_ptr3\n"
2629 ".unreq a_ptr4\n"
2630 ".unreq a_ptr5\n"
2631 ".unreq c_ptr1\n"
2632 ".unreq c_ptr2\n"
2633 ".unreq c_ptr3\n"
2634 ".unreq c_ptr4\n"
2635 ".unreq c_ptr5\n"
2636 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
2637 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
2638 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
2639 );
2640 break;
// Switch arm for label 14 (presumably selected when K == 14; the switch
// expression is outside this view - TODO confirm).
// Each output tile is 6 rows x 4 floats, held in accumulators v26-v31.
// Every accumulator starts from the bias vector and receives 14
// multiply-accumulate steps (lanes 0-3 of three q registers plus lanes 0-1
// of one d register per A row), is clamped to [*minptr, *maxptr] and stored.
// B is consumed as 14 consecutive 16-byte vectors (0xE0 bytes) per tile.
2641 case 14:
2642 __asm __volatile (
// Symbolic names for the scratch registers holding the A and C pointers of
// rows 1-5; row 0 uses the %[a_ptr0] / %[c_ptr0] operands directly.
2643 "a_ptr1 .req X0\n"
2644 "a_ptr2 .req X1\n"
2645 "a_ptr3 .req X2\n"
2646 "a_ptr4 .req X3\n"
2647 "a_ptr5 .req X4\n"
2648 "c_ptr1 .req X5\n"
2649 "c_ptr2 .req X6\n"
2650 "c_ptr3 .req X7\n"
2651 "c_ptr4 .req X8\n"
2652 "c_ptr5 .req X9\n"
// Row pointers are derived by repeatedly adding the row strides; the
// %[lda] / %[ldc] operands are bound to ldab / ldcb (strides in bytes).
2653 "add a_ptr1, %[a_ptr0], %[lda]\n"
2654 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2655 "add a_ptr2, a_ptr1, %[lda]\n"
2656 "add c_ptr2, c_ptr1, %[ldc]\n"
2657 "add a_ptr3, a_ptr2, %[lda]\n"
2658 "add c_ptr3, c_ptr2, %[ldc]\n"
2659 "add a_ptr4, a_ptr3, %[lda]\n"
2660 "add c_ptr4, c_ptr3, %[ldc]\n"
2661 "add a_ptr5, a_ptr4, %[lda]\n"
2662 "add c_ptr5, c_ptr4, %[ldc]\n"
// Out-of-bounds row handling: for each unit of oob_rows (at most five are
// handled here), the highest remaining row's A and C pointers are replaced
// with copies of row 0's pointers, working from row 5 down to row 1.
// A redirected row therefore reads row 0's input and writes row 0's output.
2663 "cbz %[oob_rows], 1f\n"
2664 "subs %[oob_rows], %[oob_rows], #0x1\n"
2665 "add c_ptr5, %[c_ptr0], #0x0\n"
2666 "add a_ptr5, %[a_ptr0], #0x0\n"
2667 "b.eq 1f\n"
2668 "subs %[oob_rows], %[oob_rows], #0x1\n"
2669 "add c_ptr4, %[c_ptr0], #0x0\n"
2670 "add a_ptr4, %[a_ptr0], #0x0\n"
2671 "b.eq 1f\n"
2672 "subs %[oob_rows], %[oob_rows], #0x1\n"
2673 "add c_ptr3, %[c_ptr0], #0x0\n"
2674 "add a_ptr3, %[a_ptr0], #0x0\n"
2675 "b.eq 1f\n"
2676 "subs %[oob_rows], %[oob_rows], #0x1\n"
2677 "add c_ptr2, %[c_ptr0], #0x0\n"
2678 "add a_ptr2, %[a_ptr0], #0x0\n"
2679 "b.eq 1f\n"
2680 "subs %[oob_rows], %[oob_rows], #0x1\n"
2681 "add c_ptr1, %[c_ptr0], #0x0\n"
2682 "add a_ptr1, %[a_ptr0], #0x0\n"
2683 "1:\n"
// Load all of A for the six rows once; it stays in registers for every tile.
// Row r occupies v(4r)..v(4r+3): three post-incremented 16-byte loads
// (12 floats) followed by one 8-byte load (2 floats) = 14 floats per row.
// The first two B vectors are preloaded into v24 / v25 along the way.
2684 "ldr q0, [%[a_ptr0]], #0x10\n"
2685 "ldr q4, [a_ptr1], #0x10\n"
2686 "ldr q8, [a_ptr2], #0x10\n"
2687 "ldr q12, [a_ptr3], #0x10\n"
2688 "ldr q16, [a_ptr4], #0x10\n"
2689 "ldr q20, [a_ptr5], #0x10\n"
2690 "ldr q1, [%[a_ptr0]], #0x10\n"
2691 "ldr q5, [a_ptr1], #0x10\n"
2692 "ldr q9, [a_ptr2], #0x10\n"
2693 "ldr q13, [a_ptr3], #0x10\n"
2694 "ldr q17, [a_ptr4], #0x10\n"
2695 "ldr q21, [a_ptr5], #0x10\n"
2696 "ldr q2, [%[a_ptr0]], #0x10\n"
2697 "ldr q6, [a_ptr1], #0x10\n"
2698 "ldr q10, [a_ptr2], #0x10\n"
2699 "ldr q14, [a_ptr3], #0x10\n"
2700 "ldr d3, [%[a_ptr0]]\n"
2701 "ldr q18, [a_ptr4], #0x10\n"
2702 "ldr d7, [a_ptr1]\n"
2703 "ldr q22, [a_ptr5], #0x10\n"
2704 "ldr d11, [a_ptr2]\n"
2705 "ldr q24, [%[b_ptr0]]\n"
2706 "ldr d15, [a_ptr3]\n"
2707 "ldr q25, [%[b_ptr0], #0x10]\n"
2708 "ldr d19, [a_ptr4]\n"
2709 "ldr d23, [a_ptr5]\n"
// Read prefetch hints at 64-byte steps past the last row's A pointer.
2710 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
2711 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
2712 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
2713 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
2714 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
2715 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
2716 "add %[b_ptr0], %[b_ptr0], #0x20\n"
// loops == 0: only one tile to produce, handled at label 2.
2717 "cbz %[loops], 2f\n"
// First tile: load the bias vector into v26 and copy it to v27-v31, then
// accumulate. The flags set by the subs below are consumed by the
// "b.eq 3f" at the end of this tile; nothing in between writes the flags.
2718 "ldr q26, [%[biasptr]]\n"
2719 "add %[biasptr], %[biasptr], %[biasinc]\n"
2720 "subs %[loops], %[loops], #0x1\n"
2721 "mov v27.16b, v26.16b\n"
2722 "mov v28.16b, v26.16b\n"
2723 "mov v29.16b, v26.16b\n"
2724 "mov v30.16b, v26.16b\n"
2725 "mov v31.16b, v26.16b\n"
2726 "fmla v26.4s, v24.4s, v0.s[0]\n"
2727 "fmla v27.4s, v24.4s, v4.s[0]\n"
2728 "fmla v28.4s, v24.4s, v8.s[0]\n"
2729 "fmla v29.4s, v24.4s, v12.s[0]\n"
2730 "fmla v30.4s, v24.4s, v16.s[0]\n"
2731 "fmla v31.4s, v24.4s, v20.s[0]\n"
2732 "ldr q24, [%[b_ptr0]]\n"
2733 "fmla v26.4s, v25.4s, v0.s[1]\n"
2734 "fmla v27.4s, v25.4s, v4.s[1]\n"
2735 "fmla v28.4s, v25.4s, v8.s[1]\n"
2736 "fmla v29.4s, v25.4s, v12.s[1]\n"
2737 "fmla v30.4s, v25.4s, v16.s[1]\n"
2738 "fmla v31.4s, v25.4s, v20.s[1]\n"
2739 "ldr q25, [%[b_ptr0], #0x10]\n"
2740 "fmla v26.4s, v24.4s, v0.s[2]\n"
2741 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2742 "fmla v27.4s, v24.4s, v4.s[2]\n"
2743 "fmla v28.4s, v24.4s, v8.s[2]\n"
2744 "fmla v29.4s, v24.4s, v12.s[2]\n"
2745 "fmla v30.4s, v24.4s, v16.s[2]\n"
2746 "fmla v31.4s, v24.4s, v20.s[2]\n"
2747 "ldr q24, [%[b_ptr0]]\n"
2748 "fmla v26.4s, v25.4s, v0.s[3]\n"
2749 "fmla v27.4s, v25.4s, v4.s[3]\n"
2750 "fmla v28.4s, v25.4s, v8.s[3]\n"
2751 "fmla v29.4s, v25.4s, v12.s[3]\n"
2752 "fmla v30.4s, v25.4s, v16.s[3]\n"
2753 "fmla v31.4s, v25.4s, v20.s[3]\n"
2754 "ldr q25, [%[b_ptr0], #0x10]\n"
2755 "fmla v26.4s, v24.4s, v1.s[0]\n"
2756 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2757 "fmla v27.4s, v24.4s, v5.s[0]\n"
2758 "fmla v28.4s, v24.4s, v9.s[0]\n"
2759 "fmla v29.4s, v24.4s, v13.s[0]\n"
2760 "fmla v30.4s, v24.4s, v17.s[0]\n"
2761 "fmla v31.4s, v24.4s, v21.s[0]\n"
2762 "ldr q24, [%[b_ptr0]]\n"
2763 "fmla v26.4s, v25.4s, v1.s[1]\n"
2764 "fmla v27.4s, v25.4s, v5.s[1]\n"
2765 "fmla v28.4s, v25.4s, v9.s[1]\n"
2766 "fmla v29.4s, v25.4s, v13.s[1]\n"
2767 "fmla v30.4s, v25.4s, v17.s[1]\n"
2768 "fmla v31.4s, v25.4s, v21.s[1]\n"
2769 "ldr q25, [%[b_ptr0], #0x10]\n"
2770 "fmla v26.4s, v24.4s, v1.s[2]\n"
2771 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2772 "fmla v27.4s, v24.4s, v5.s[2]\n"
2773 "fmla v28.4s, v24.4s, v9.s[2]\n"
2774 "fmla v29.4s, v24.4s, v13.s[2]\n"
2775 "fmla v30.4s, v24.4s, v17.s[2]\n"
2776 "fmla v31.4s, v24.4s, v21.s[2]\n"
2777 "ldr q24, [%[b_ptr0]]\n"
2778 "fmla v26.4s, v25.4s, v1.s[3]\n"
2779 "fmla v27.4s, v25.4s, v5.s[3]\n"
2780 "fmla v28.4s, v25.4s, v9.s[3]\n"
2781 "fmla v29.4s, v25.4s, v13.s[3]\n"
2782 "fmla v30.4s, v25.4s, v17.s[3]\n"
2783 "fmla v31.4s, v25.4s, v21.s[3]\n"
2784 "ldr q25, [%[b_ptr0], #0x10]\n"
2785 "fmla v26.4s, v24.4s, v2.s[0]\n"
2786 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2787 "fmla v27.4s, v24.4s, v6.s[0]\n"
2788 "fmla v28.4s, v24.4s, v10.s[0]\n"
2789 "fmla v29.4s, v24.4s, v14.s[0]\n"
2790 "fmla v30.4s, v24.4s, v18.s[0]\n"
2791 "fmla v31.4s, v24.4s, v22.s[0]\n"
2792 "ldr q24, [%[b_ptr0]]\n"
2793 "fmla v26.4s, v25.4s, v2.s[1]\n"
2794 "fmla v27.4s, v25.4s, v6.s[1]\n"
2795 "fmla v28.4s, v25.4s, v10.s[1]\n"
2796 "fmla v29.4s, v25.4s, v14.s[1]\n"
2797 "fmla v30.4s, v25.4s, v18.s[1]\n"
2798 "fmla v31.4s, v25.4s, v22.s[1]\n"
2799 "ldr q25, [%[b_ptr0], #0x10]\n"
2800 "fmla v26.4s, v24.4s, v2.s[2]\n"
2801 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2802 "fmla v27.4s, v24.4s, v6.s[2]\n"
2803 "fmla v28.4s, v24.4s, v10.s[2]\n"
2804 "fmla v29.4s, v24.4s, v14.s[2]\n"
2805 "fmla v30.4s, v24.4s, v18.s[2]\n"
2806 "fmla v31.4s, v24.4s, v22.s[2]\n"
2807 "ldr q24, [%[b_ptr0]]\n"
2808 "fmla v26.4s, v25.4s, v2.s[3]\n"
2809 "fmla v27.4s, v25.4s, v6.s[3]\n"
2810 "fmla v28.4s, v25.4s, v10.s[3]\n"
2811 "fmla v29.4s, v25.4s, v14.s[3]\n"
2812 "fmla v30.4s, v25.4s, v18.s[3]\n"
2813 "fmla v31.4s, v25.4s, v22.s[3]\n"
2814 "ldr q25, [%[b_ptr0], #0x10]\n"
2815 "fmla v26.4s, v24.4s, v3.s[0]\n"
2816 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2817 "fmla v27.4s, v24.4s, v7.s[0]\n"
2818 "fmla v28.4s, v24.4s, v11.s[0]\n"
2819 "fmla v29.4s, v24.4s, v15.s[0]\n"
2820 "fmla v30.4s, v24.4s, v19.s[0]\n"
2821 "fmla v31.4s, v24.4s, v23.s[0]\n"
2822 "fmla v26.4s, v25.4s, v3.s[1]\n"
2823 "fmla v27.4s, v25.4s, v7.s[1]\n"
2824 "fmla v28.4s, v25.4s, v11.s[1]\n"
2825 "fmla v29.4s, v25.4s, v15.s[1]\n"
2826 "fmla v30.4s, v25.4s, v19.s[1]\n"
2827 "fmla v31.4s, v25.4s, v23.s[1]\n"
// Tests the flags from the subs at the top of the first tile: if the loop
// counter reached zero, skip the steady-state loop and go to label 3.
2828 "b.eq 3f\n"
// Steady-state loop: clamp and store the tile just computed while starting
// the next one. v24 / v25 hold the broadcast min / max bounds first and are
// each reloaded with B data only after their last fmax / fmin use.
// The C pointers of rows 1-5 and %[c_ptr0] advance by 16 bytes per tile.
2829 "4:\n"
2830 "ld1r {v24.4s}, [%[minptr]]\n"
2831 "subs %[loops], %[loops], #0x1\n"
2832 "ld1r {v25.4s}, [%[maxptr]]\n"
2833 "fmax v26.4s, v26.4s, v24.4s\n"
2834 "fmax v27.4s, v27.4s, v24.4s\n"
2835 "fmax v28.4s, v28.4s, v24.4s\n"
2836 "fmax v29.4s, v29.4s, v24.4s\n"
2837 "fmin v26.4s, v26.4s, v25.4s\n"
2838 "fmin v27.4s, v27.4s, v25.4s\n"
2839 "fmin v28.4s, v28.4s, v25.4s\n"
2840 "fmin v29.4s, v29.4s, v25.4s\n"
2841 "str q26, [%[c_ptr0]]\n"
2842 "fmax v30.4s, v30.4s, v24.4s\n"
2843 "ldr q26, [%[biasptr]]\n"
2844 "fmax v31.4s, v31.4s, v24.4s\n"
2845 "ldr q24, [%[b_ptr0]]\n"
2846 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2847 "str q27, [c_ptr1]\n"
2848 "add c_ptr1, c_ptr1, #0x10\n"
2849 "fmin v30.4s, v30.4s, v25.4s\n"
2850 "add %[biasptr], %[biasptr], %[biasinc]\n"
2851 "fmin v31.4s, v31.4s, v25.4s\n"
2852 "str q28, [c_ptr2]\n"
2853 "mov v27.16b, v26.16b\n"
2854 "ldr q25, [%[b_ptr0], #0x10]\n"
2855 "mov v28.16b, v26.16b\n"
2856 "add c_ptr2, c_ptr2, #0x10\n"
2857 "str q29, [c_ptr3]\n"
2858 "add c_ptr3, c_ptr3, #0x10\n"
2859 "mov v29.16b, v26.16b\n"
2860 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2861 "fmla v27.4s, v24.4s, v4.s[0]\n"
2862 "str q30, [c_ptr4]\n"
2863 "mov v30.16b, v26.16b\n"
2864 "add c_ptr4, c_ptr4, #0x10\n"
2865 "fmla v28.4s, v24.4s, v8.s[0]\n"
2866 "str q31, [c_ptr5]\n"
2867 "mov v31.16b, v26.16b\n"
2868 "add c_ptr5, c_ptr5, #0x10\n"
2869 "fmla v26.4s, v24.4s, v0.s[0]\n"
2870 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2871 "fmla v29.4s, v24.4s, v12.s[0]\n"
2872 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2873 "fmla v30.4s, v24.4s, v16.s[0]\n"
2874 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2875 "fmla v31.4s, v24.4s, v20.s[0]\n"
2876 "ldr q24, [%[b_ptr0]]\n"
2877 "fmla v26.4s, v25.4s, v0.s[1]\n"
2878 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2879 "fmla v27.4s, v25.4s, v4.s[1]\n"
2880 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2881 "fmla v28.4s, v25.4s, v8.s[1]\n"
2882 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2883 "fmla v29.4s, v25.4s, v12.s[1]\n"
2884 "fmla v30.4s, v25.4s, v16.s[1]\n"
2885 "fmla v31.4s, v25.4s, v20.s[1]\n"
2886 "ldr q25, [%[b_ptr0], #0x10]\n"
2887 "fmla v26.4s, v24.4s, v0.s[2]\n"
2888 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2889 "fmla v27.4s, v24.4s, v4.s[2]\n"
2890 "fmla v28.4s, v24.4s, v8.s[2]\n"
2891 "fmla v29.4s, v24.4s, v12.s[2]\n"
2892 "fmla v30.4s, v24.4s, v16.s[2]\n"
2893 "fmla v31.4s, v24.4s, v20.s[2]\n"
2894 "ldr q24, [%[b_ptr0]]\n"
2895 "fmla v26.4s, v25.4s, v0.s[3]\n"
2896 "fmla v27.4s, v25.4s, v4.s[3]\n"
2897 "fmla v28.4s, v25.4s, v8.s[3]\n"
2898 "fmla v29.4s, v25.4s, v12.s[3]\n"
2899 "fmla v30.4s, v25.4s, v16.s[3]\n"
2900 "fmla v31.4s, v25.4s, v20.s[3]\n"
2901 "ldr q25, [%[b_ptr0], #0x10]\n"
2902 "fmla v26.4s, v24.4s, v1.s[0]\n"
2903 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2904 "fmla v27.4s, v24.4s, v5.s[0]\n"
2905 "fmla v28.4s, v24.4s, v9.s[0]\n"
2906 "fmla v29.4s, v24.4s, v13.s[0]\n"
2907 "fmla v30.4s, v24.4s, v17.s[0]\n"
2908 "fmla v31.4s, v24.4s, v21.s[0]\n"
2909 "ldr q24, [%[b_ptr0]]\n"
2910 "fmla v26.4s, v25.4s, v1.s[1]\n"
2911 "fmla v27.4s, v25.4s, v5.s[1]\n"
2912 "fmla v28.4s, v25.4s, v9.s[1]\n"
2913 "fmla v29.4s, v25.4s, v13.s[1]\n"
2914 "fmla v30.4s, v25.4s, v17.s[1]\n"
2915 "fmla v31.4s, v25.4s, v21.s[1]\n"
2916 "ldr q25, [%[b_ptr0], #0x10]\n"
2917 "fmla v26.4s, v24.4s, v1.s[2]\n"
2918 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2919 "fmla v27.4s, v24.4s, v5.s[2]\n"
2920 "fmla v28.4s, v24.4s, v9.s[2]\n"
2921 "fmla v29.4s, v24.4s, v13.s[2]\n"
2922 "fmla v30.4s, v24.4s, v17.s[2]\n"
2923 "fmla v31.4s, v24.4s, v21.s[2]\n"
2924 "ldr q24, [%[b_ptr0]]\n"
2925 "fmla v26.4s, v25.4s, v1.s[3]\n"
2926 "fmla v27.4s, v25.4s, v5.s[3]\n"
2927 "fmla v28.4s, v25.4s, v9.s[3]\n"
2928 "fmla v29.4s, v25.4s, v13.s[3]\n"
2929 "fmla v30.4s, v25.4s, v17.s[3]\n"
2930 "fmla v31.4s, v25.4s, v21.s[3]\n"
2931 "ldr q25, [%[b_ptr0], #0x10]\n"
2932 "fmla v26.4s, v24.4s, v2.s[0]\n"
2933 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2934 "fmla v27.4s, v24.4s, v6.s[0]\n"
2935 "fmla v28.4s, v24.4s, v10.s[0]\n"
2936 "fmla v29.4s, v24.4s, v14.s[0]\n"
2937 "fmla v30.4s, v24.4s, v18.s[0]\n"
2938 "fmla v31.4s, v24.4s, v22.s[0]\n"
2939 "ldr q24, [%[b_ptr0]]\n"
2940 "fmla v26.4s, v25.4s, v2.s[1]\n"
2941 "fmla v27.4s, v25.4s, v6.s[1]\n"
2942 "fmla v28.4s, v25.4s, v10.s[1]\n"
2943 "fmla v29.4s, v25.4s, v14.s[1]\n"
2944 "fmla v30.4s, v25.4s, v18.s[1]\n"
2945 "fmla v31.4s, v25.4s, v22.s[1]\n"
2946 "ldr q25, [%[b_ptr0], #0x10]\n"
2947 "fmla v26.4s, v24.4s, v2.s[2]\n"
2948 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2949 "fmla v27.4s, v24.4s, v6.s[2]\n"
2950 "fmla v28.4s, v24.4s, v10.s[2]\n"
2951 "fmla v29.4s, v24.4s, v14.s[2]\n"
2952 "fmla v30.4s, v24.4s, v18.s[2]\n"
2953 "fmla v31.4s, v24.4s, v22.s[2]\n"
2954 "ldr q24, [%[b_ptr0]]\n"
2955 "fmla v26.4s, v25.4s, v2.s[3]\n"
2956 "fmla v27.4s, v25.4s, v6.s[3]\n"
2957 "fmla v28.4s, v25.4s, v10.s[3]\n"
2958 "fmla v29.4s, v25.4s, v14.s[3]\n"
2959 "fmla v30.4s, v25.4s, v18.s[3]\n"
2960 "fmla v31.4s, v25.4s, v22.s[3]\n"
2961 "ldr q25, [%[b_ptr0], #0x10]\n"
2962 "fmla v26.4s, v24.4s, v3.s[0]\n"
2963 "add %[b_ptr0], %[b_ptr0], #0x20\n"
2964 "fmla v27.4s, v24.4s, v7.s[0]\n"
2965 "fmla v28.4s, v24.4s, v11.s[0]\n"
2966 "fmla v29.4s, v24.4s, v15.s[0]\n"
2967 "fmla v30.4s, v24.4s, v19.s[0]\n"
2968 "fmla v31.4s, v24.4s, v23.s[0]\n"
2969 "fmla v26.4s, v25.4s, v3.s[1]\n"
2970 "fmla v27.4s, v25.4s, v7.s[1]\n"
2971 "fmla v28.4s, v25.4s, v11.s[1]\n"
2972 "fmla v29.4s, v25.4s, v15.s[1]\n"
2973 "fmla v30.4s, v25.4s, v19.s[1]\n"
2974 "fmla v31.4s, v25.4s, v23.s[1]\n"
// Repeat while the subs at the top of this iteration left loops non-zero.
2975 "b.ne 4b\n"
// Loop exit: store the pending tile and compute the last one. Same
// instruction sequence as the loop body, minus the counter decrement and
// the store prefetch hints; the last tile is written back at label 5.
2976 "3:\n"
2977 "ld1r {v24.4s}, [%[minptr]]\n"
2978 "ld1r {v25.4s}, [%[maxptr]]\n"
2979 "fmax v26.4s, v26.4s, v24.4s\n"
2980 "fmax v27.4s, v27.4s, v24.4s\n"
2981 "fmax v28.4s, v28.4s, v24.4s\n"
2982 "fmax v29.4s, v29.4s, v24.4s\n"
2983 "fmin v26.4s, v26.4s, v25.4s\n"
2984 "fmin v27.4s, v27.4s, v25.4s\n"
2985 "fmin v28.4s, v28.4s, v25.4s\n"
2986 "fmin v29.4s, v29.4s, v25.4s\n"
2987 "str q26, [%[c_ptr0]]\n"
2988 "fmax v30.4s, v30.4s, v24.4s\n"
2989 "ldr q26, [%[biasptr]]\n"
2990 "fmax v31.4s, v31.4s, v24.4s\n"
2991 "ldr q24, [%[b_ptr0]]\n"
2992 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2993 "str q27, [c_ptr1]\n"
2994 "add c_ptr1, c_ptr1, #0x10\n"
2995 "fmin v30.4s, v30.4s, v25.4s\n"
2996 "add %[biasptr], %[biasptr], %[biasinc]\n"
2997 "fmin v31.4s, v31.4s, v25.4s\n"
2998 "str q28, [c_ptr2]\n"
2999 "mov v27.16b, v26.16b\n"
3000 "ldr q25, [%[b_ptr0], #0x10]\n"
3001 "mov v28.16b, v26.16b\n"
3002 "add c_ptr2, c_ptr2, #0x10\n"
3003 "str q29, [c_ptr3]\n"
3004 "add c_ptr3, c_ptr3, #0x10\n"
3005 "mov v29.16b, v26.16b\n"
3006 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3007 "fmla v27.4s, v24.4s, v4.s[0]\n"
3008 "str q30, [c_ptr4]\n"
3009 "mov v30.16b, v26.16b\n"
3010 "add c_ptr4, c_ptr4, #0x10\n"
3011 "fmla v28.4s, v24.4s, v8.s[0]\n"
3012 "str q31, [c_ptr5]\n"
3013 "mov v31.16b, v26.16b\n"
3014 "add c_ptr5, c_ptr5, #0x10\n"
3015 "fmla v26.4s, v24.4s, v0.s[0]\n"
3016 "fmla v29.4s, v24.4s, v12.s[0]\n"
3017 "fmla v30.4s, v24.4s, v16.s[0]\n"
3018 "fmla v31.4s, v24.4s, v20.s[0]\n"
3019 "ldr q24, [%[b_ptr0]]\n"
3020 "fmla v26.4s, v25.4s, v0.s[1]\n"
3021 "fmla v27.4s, v25.4s, v4.s[1]\n"
3022 "fmla v28.4s, v25.4s, v8.s[1]\n"
3023 "fmla v29.4s, v25.4s, v12.s[1]\n"
3024 "fmla v30.4s, v25.4s, v16.s[1]\n"
3025 "fmla v31.4s, v25.4s, v20.s[1]\n"
3026 "ldr q25, [%[b_ptr0], #0x10]\n"
3027 "fmla v26.4s, v24.4s, v0.s[2]\n"
3028 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3029 "fmla v27.4s, v24.4s, v4.s[2]\n"
3030 "fmla v28.4s, v24.4s, v8.s[2]\n"
3031 "fmla v29.4s, v24.4s, v12.s[2]\n"
3032 "fmla v30.4s, v24.4s, v16.s[2]\n"
3033 "fmla v31.4s, v24.4s, v20.s[2]\n"
3034 "ldr q24, [%[b_ptr0]]\n"
3035 "fmla v26.4s, v25.4s, v0.s[3]\n"
3036 "fmla v27.4s, v25.4s, v4.s[3]\n"
3037 "fmla v28.4s, v25.4s, v8.s[3]\n"
3038 "fmla v29.4s, v25.4s, v12.s[3]\n"
3039 "fmla v30.4s, v25.4s, v16.s[3]\n"
3040 "fmla v31.4s, v25.4s, v20.s[3]\n"
3041 "ldr q25, [%[b_ptr0], #0x10]\n"
3042 "fmla v26.4s, v24.4s, v1.s[0]\n"
3043 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3044 "fmla v27.4s, v24.4s, v5.s[0]\n"
3045 "fmla v28.4s, v24.4s, v9.s[0]\n"
3046 "fmla v29.4s, v24.4s, v13.s[0]\n"
3047 "fmla v30.4s, v24.4s, v17.s[0]\n"
3048 "fmla v31.4s, v24.4s, v21.s[0]\n"
3049 "ldr q24, [%[b_ptr0]]\n"
3050 "fmla v26.4s, v25.4s, v1.s[1]\n"
3051 "fmla v27.4s, v25.4s, v5.s[1]\n"
3052 "fmla v28.4s, v25.4s, v9.s[1]\n"
3053 "fmla v29.4s, v25.4s, v13.s[1]\n"
3054 "fmla v30.4s, v25.4s, v17.s[1]\n"
3055 "fmla v31.4s, v25.4s, v21.s[1]\n"
3056 "ldr q25, [%[b_ptr0], #0x10]\n"
3057 "fmla v26.4s, v24.4s, v1.s[2]\n"
3058 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3059 "fmla v27.4s, v24.4s, v5.s[2]\n"
3060 "fmla v28.4s, v24.4s, v9.s[2]\n"
3061 "fmla v29.4s, v24.4s, v13.s[2]\n"
3062 "fmla v30.4s, v24.4s, v17.s[2]\n"
3063 "fmla v31.4s, v24.4s, v21.s[2]\n"
3064 "ldr q24, [%[b_ptr0]]\n"
3065 "fmla v26.4s, v25.4s, v1.s[3]\n"
3066 "fmla v27.4s, v25.4s, v5.s[3]\n"
3067 "fmla v28.4s, v25.4s, v9.s[3]\n"
3068 "fmla v29.4s, v25.4s, v13.s[3]\n"
3069 "fmla v30.4s, v25.4s, v17.s[3]\n"
3070 "fmla v31.4s, v25.4s, v21.s[3]\n"
3071 "ldr q25, [%[b_ptr0], #0x10]\n"
3072 "fmla v26.4s, v24.4s, v2.s[0]\n"
3073 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3074 "fmla v27.4s, v24.4s, v6.s[0]\n"
3075 "fmla v28.4s, v24.4s, v10.s[0]\n"
3076 "fmla v29.4s, v24.4s, v14.s[0]\n"
3077 "fmla v30.4s, v24.4s, v18.s[0]\n"
3078 "fmla v31.4s, v24.4s, v22.s[0]\n"
3079 "ldr q24, [%[b_ptr0]]\n"
3080 "fmla v26.4s, v25.4s, v2.s[1]\n"
3081 "fmla v27.4s, v25.4s, v6.s[1]\n"
3082 "fmla v28.4s, v25.4s, v10.s[1]\n"
3083 "fmla v29.4s, v25.4s, v14.s[1]\n"
3084 "fmla v30.4s, v25.4s, v18.s[1]\n"
3085 "fmla v31.4s, v25.4s, v22.s[1]\n"
3086 "ldr q25, [%[b_ptr0], #0x10]\n"
3087 "fmla v26.4s, v24.4s, v2.s[2]\n"
3088 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3089 "fmla v27.4s, v24.4s, v6.s[2]\n"
3090 "fmla v28.4s, v24.4s, v10.s[2]\n"
3091 "fmla v29.4s, v24.4s, v14.s[2]\n"
3092 "fmla v30.4s, v24.4s, v18.s[2]\n"
3093 "fmla v31.4s, v24.4s, v22.s[2]\n"
3094 "ldr q24, [%[b_ptr0]]\n"
3095 "fmla v26.4s, v25.4s, v2.s[3]\n"
3096 "fmla v27.4s, v25.4s, v6.s[3]\n"
3097 "fmla v28.4s, v25.4s, v10.s[3]\n"
3098 "fmla v29.4s, v25.4s, v14.s[3]\n"
3099 "fmla v30.4s, v25.4s, v18.s[3]\n"
3100 "fmla v31.4s, v25.4s, v22.s[3]\n"
3101 "ldr q25, [%[b_ptr0], #0x10]\n"
3102 "fmla v26.4s, v24.4s, v3.s[0]\n"
3103 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3104 "fmla v27.4s, v24.4s, v7.s[0]\n"
3105 "fmla v28.4s, v24.4s, v11.s[0]\n"
3106 "fmla v29.4s, v24.4s, v15.s[0]\n"
3107 "fmla v30.4s, v24.4s, v19.s[0]\n"
3108 "fmla v31.4s, v24.4s, v23.s[0]\n"
3109 "fmla v26.4s, v25.4s, v3.s[1]\n"
3110 "fmla v27.4s, v25.4s, v7.s[1]\n"
3111 "fmla v28.4s, v25.4s, v11.s[1]\n"
3112 "fmla v29.4s, v25.4s, v15.s[1]\n"
3113 "fmla v30.4s, v25.4s, v19.s[1]\n"
3114 "fmla v31.4s, v25.4s, v23.s[1]\n"
// Skip the single-tile path and go straight to the final writeback.
3115 "b 5f\n"
// Single-tile path, reached only from the cbz when loops == 0: same
// accumulation as the first tile, without a loop counter update, then
// falls through into the writeback at label 5.
3116 "2:\n"
3117 "ldr q26, [%[biasptr]]\n"
3118 "add %[biasptr], %[biasptr], %[biasinc]\n"
3119 "mov v27.16b, v26.16b\n"
3120 "mov v28.16b, v26.16b\n"
3121 "mov v29.16b, v26.16b\n"
3122 "mov v30.16b, v26.16b\n"
3123 "mov v31.16b, v26.16b\n"
3124 "fmla v26.4s, v24.4s, v0.s[0]\n"
3125 "fmla v27.4s, v24.4s, v4.s[0]\n"
3126 "fmla v28.4s, v24.4s, v8.s[0]\n"
3127 "fmla v29.4s, v24.4s, v12.s[0]\n"
3128 "fmla v30.4s, v24.4s, v16.s[0]\n"
3129 "fmla v31.4s, v24.4s, v20.s[0]\n"
3130 "ldr q24, [%[b_ptr0]]\n"
3131 "fmla v26.4s, v25.4s, v0.s[1]\n"
3132 "fmla v27.4s, v25.4s, v4.s[1]\n"
3133 "fmla v28.4s, v25.4s, v8.s[1]\n"
3134 "fmla v29.4s, v25.4s, v12.s[1]\n"
3135 "fmla v30.4s, v25.4s, v16.s[1]\n"
3136 "fmla v31.4s, v25.4s, v20.s[1]\n"
3137 "ldr q25, [%[b_ptr0], #0x10]\n"
3138 "fmla v26.4s, v24.4s, v0.s[2]\n"
3139 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3140 "fmla v27.4s, v24.4s, v4.s[2]\n"
3141 "fmla v28.4s, v24.4s, v8.s[2]\n"
3142 "fmla v29.4s, v24.4s, v12.s[2]\n"
3143 "fmla v30.4s, v24.4s, v16.s[2]\n"
3144 "fmla v31.4s, v24.4s, v20.s[2]\n"
3145 "ldr q24, [%[b_ptr0]]\n"
3146 "fmla v26.4s, v25.4s, v0.s[3]\n"
3147 "fmla v27.4s, v25.4s, v4.s[3]\n"
3148 "fmla v28.4s, v25.4s, v8.s[3]\n"
3149 "fmla v29.4s, v25.4s, v12.s[3]\n"
3150 "fmla v30.4s, v25.4s, v16.s[3]\n"
3151 "fmla v31.4s, v25.4s, v20.s[3]\n"
3152 "ldr q25, [%[b_ptr0], #0x10]\n"
3153 "fmla v26.4s, v24.4s, v1.s[0]\n"
3154 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3155 "fmla v27.4s, v24.4s, v5.s[0]\n"
3156 "fmla v28.4s, v24.4s, v9.s[0]\n"
3157 "fmla v29.4s, v24.4s, v13.s[0]\n"
3158 "fmla v30.4s, v24.4s, v17.s[0]\n"
3159 "fmla v31.4s, v24.4s, v21.s[0]\n"
3160 "ldr q24, [%[b_ptr0]]\n"
3161 "fmla v26.4s, v25.4s, v1.s[1]\n"
3162 "fmla v27.4s, v25.4s, v5.s[1]\n"
3163 "fmla v28.4s, v25.4s, v9.s[1]\n"
3164 "fmla v29.4s, v25.4s, v13.s[1]\n"
3165 "fmla v30.4s, v25.4s, v17.s[1]\n"
3166 "fmla v31.4s, v25.4s, v21.s[1]\n"
3167 "ldr q25, [%[b_ptr0], #0x10]\n"
3168 "fmla v26.4s, v24.4s, v1.s[2]\n"
3169 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3170 "fmla v27.4s, v24.4s, v5.s[2]\n"
3171 "fmla v28.4s, v24.4s, v9.s[2]\n"
3172 "fmla v29.4s, v24.4s, v13.s[2]\n"
3173 "fmla v30.4s, v24.4s, v17.s[2]\n"
3174 "fmla v31.4s, v24.4s, v21.s[2]\n"
3175 "ldr q24, [%[b_ptr0]]\n"
3176 "fmla v26.4s, v25.4s, v1.s[3]\n"
3177 "fmla v27.4s, v25.4s, v5.s[3]\n"
3178 "fmla v28.4s, v25.4s, v9.s[3]\n"
3179 "fmla v29.4s, v25.4s, v13.s[3]\n"
3180 "fmla v30.4s, v25.4s, v17.s[3]\n"
3181 "fmla v31.4s, v25.4s, v21.s[3]\n"
3182 "ldr q25, [%[b_ptr0], #0x10]\n"
3183 "fmla v26.4s, v24.4s, v2.s[0]\n"
3184 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3185 "fmla v27.4s, v24.4s, v6.s[0]\n"
3186 "fmla v28.4s, v24.4s, v10.s[0]\n"
3187 "fmla v29.4s, v24.4s, v14.s[0]\n"
3188 "fmla v30.4s, v24.4s, v18.s[0]\n"
3189 "fmla v31.4s, v24.4s, v22.s[0]\n"
3190 "ldr q24, [%[b_ptr0]]\n"
3191 "fmla v26.4s, v25.4s, v2.s[1]\n"
3192 "fmla v27.4s, v25.4s, v6.s[1]\n"
3193 "fmla v28.4s, v25.4s, v10.s[1]\n"
3194 "fmla v29.4s, v25.4s, v14.s[1]\n"
3195 "fmla v30.4s, v25.4s, v18.s[1]\n"
3196 "fmla v31.4s, v25.4s, v22.s[1]\n"
3197 "ldr q25, [%[b_ptr0], #0x10]\n"
3198 "fmla v26.4s, v24.4s, v2.s[2]\n"
3199 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3200 "fmla v27.4s, v24.4s, v6.s[2]\n"
3201 "fmla v28.4s, v24.4s, v10.s[2]\n"
3202 "fmla v29.4s, v24.4s, v14.s[2]\n"
3203 "fmla v30.4s, v24.4s, v18.s[2]\n"
3204 "fmla v31.4s, v24.4s, v22.s[2]\n"
3205 "ldr q24, [%[b_ptr0]]\n"
3206 "fmla v26.4s, v25.4s, v2.s[3]\n"
3207 "fmla v27.4s, v25.4s, v6.s[3]\n"
3208 "fmla v28.4s, v25.4s, v10.s[3]\n"
3209 "fmla v29.4s, v25.4s, v14.s[3]\n"
3210 "fmla v30.4s, v25.4s, v18.s[3]\n"
3211 "fmla v31.4s, v25.4s, v22.s[3]\n"
3212 "ldr q25, [%[b_ptr0], #0x10]\n"
3213 "fmla v26.4s, v24.4s, v3.s[0]\n"
3214 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3215 "fmla v27.4s, v24.4s, v7.s[0]\n"
3216 "fmla v28.4s, v24.4s, v11.s[0]\n"
3217 "fmla v29.4s, v24.4s, v15.s[0]\n"
3218 "fmla v30.4s, v24.4s, v19.s[0]\n"
3219 "fmla v31.4s, v24.4s, v23.s[0]\n"
3220 "fmla v26.4s, v25.4s, v3.s[1]\n"
3221 "fmla v27.4s, v25.4s, v7.s[1]\n"
3222 "fmla v28.4s, v25.4s, v11.s[1]\n"
3223 "fmla v29.4s, v25.4s, v15.s[1]\n"
3224 "fmla v30.4s, v25.4s, v19.s[1]\n"
3225 "fmla v31.4s, v25.4s, v23.s[1]\n"
// Final writeback: clamp the last tile to [*minptr, *maxptr] and store it.
// Only %[c_ptr0] is advanced here; the row 1-5 pointers are scratch
// registers that are not used again after this statement.
3226 "5:\n"
3227 "ld1r {v24.4s}, [%[minptr]]\n"
3228 "ld1r {v25.4s}, [%[maxptr]]\n"
3229 "fmax v26.4s, v26.4s, v24.4s\n"
3230 "fmax v27.4s, v27.4s, v24.4s\n"
3231 "fmax v28.4s, v28.4s, v24.4s\n"
3232 "fmax v29.4s, v29.4s, v24.4s\n"
3233 "fmin v26.4s, v26.4s, v25.4s\n"
3234 "fmin v27.4s, v27.4s, v25.4s\n"
3235 "fmin v28.4s, v28.4s, v25.4s\n"
3236 "fmin v29.4s, v29.4s, v25.4s\n"
3237 "str q26, [%[c_ptr0]]\n"
3238 "fmax v30.4s, v30.4s, v24.4s\n"
3239 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3240 "fmax v31.4s, v31.4s, v24.4s\n"
3241 "str q27, [c_ptr1]\n"
3242 "fmin v30.4s, v30.4s, v25.4s\n"
3243 "fmin v31.4s, v31.4s, v25.4s\n"
3244 "str q28, [c_ptr2]\n"
3245 "str q29, [c_ptr3]\n"
3246 "str q30, [c_ptr4]\n"
3247 "str q31, [c_ptr5]\n"
// Drop the register aliases so the same names can be defined again by
// another asm statement.
3248 ".unreq a_ptr1\n"
3249 ".unreq a_ptr2\n"
3250 ".unreq a_ptr3\n"
3251 ".unreq a_ptr4\n"
3252 ".unreq a_ptr5\n"
3253 ".unreq c_ptr1\n"
3254 ".unreq c_ptr2\n"
3255 ".unreq c_ptr3\n"
3256 ".unreq c_ptr4\n"
3257 ".unreq c_ptr5\n"
// Operands: read-write pointers and counters that the asm updates in place,
// then read-only strides and clamp pointers, then the clobber list covering
// the aliased x0-x9, every vector register, the flags and memory.
3258 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
3259 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
3260 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
3261 );
3262 break;
3263 case 15:
3264 __asm __volatile (
3265 "a_ptr1 .req X0\n"
3266 "a_ptr2 .req X1\n"
3267 "a_ptr3 .req X2\n"
3268 "a_ptr4 .req X3\n"
3269 "a_ptr5 .req X4\n"
3270 "c_ptr1 .req X5\n"
3271 "c_ptr2 .req X6\n"
3272 "c_ptr3 .req X7\n"
3273 "c_ptr4 .req X8\n"
3274 "c_ptr5 .req X9\n"
3275 "add a_ptr1, %[a_ptr0], %[lda]\n"
3276 "add c_ptr1, %[c_ptr0], %[ldc]\n"
3277 "add a_ptr2, a_ptr1, %[lda]\n"
3278 "add c_ptr2, c_ptr1, %[ldc]\n"
3279 "add a_ptr3, a_ptr2, %[lda]\n"
3280 "add c_ptr3, c_ptr2, %[ldc]\n"
3281 "add a_ptr4, a_ptr3, %[lda]\n"
3282 "add c_ptr4, c_ptr3, %[ldc]\n"
3283 "add a_ptr5, a_ptr4, %[lda]\n"
3284 "add c_ptr5, c_ptr4, %[ldc]\n"
3285 "cbz %[oob_rows], 1f\n"
3286 "subs %[oob_rows], %[oob_rows], #0x1\n"
3287 "add c_ptr5, %[c_ptr0], #0x0\n"
3288 "add a_ptr5, %[a_ptr0], #0x0\n"
3289 "b.eq 1f\n"
3290 "subs %[oob_rows], %[oob_rows], #0x1\n"
3291 "add c_ptr4, %[c_ptr0], #0x0\n"
3292 "add a_ptr4, %[a_ptr0], #0x0\n"
3293 "b.eq 1f\n"
3294 "subs %[oob_rows], %[oob_rows], #0x1\n"
3295 "add c_ptr3, %[c_ptr0], #0x0\n"
3296 "add a_ptr3, %[a_ptr0], #0x0\n"
3297 "b.eq 1f\n"
3298 "subs %[oob_rows], %[oob_rows], #0x1\n"
3299 "add c_ptr2, %[c_ptr0], #0x0\n"
3300 "add a_ptr2, %[a_ptr0], #0x0\n"
3301 "b.eq 1f\n"
3302 "subs %[oob_rows], %[oob_rows], #0x1\n"
3303 "add c_ptr1, %[c_ptr0], #0x0\n"
3304 "add a_ptr1, %[a_ptr0], #0x0\n"
3305 "1:\n"
3306 "ldr q0, [%[a_ptr0]], #0x10\n"
3307 "ldr q4, [a_ptr1], #0x10\n"
3308 "ldr q8, [a_ptr2], #0x10\n"
3309 "ldr q12, [a_ptr3], #0x10\n"
3310 "ldr q16, [a_ptr4], #0x10\n"
3311 "ldr q20, [a_ptr5], #0x10\n"
3312 "ldr q1, [%[a_ptr0]], #0x10\n"
3313 "ldr q5, [a_ptr1], #0x10\n"
3314 "ldr q9, [a_ptr2], #0x10\n"
3315 "ldr q13, [a_ptr3], #0x10\n"
3316 "ldr q17, [a_ptr4], #0x10\n"
3317 "ldr q21, [a_ptr5], #0x10\n"
3318 "ldr q2, [%[a_ptr0]], #0x10\n"
3319 "ldr q6, [a_ptr1], #0x10\n"
3320 "ldr q10, [a_ptr2], #0x10\n"
3321 "ldr q14, [a_ptr3], #0x10\n"
3322 "ldr d3, [%[a_ptr0]], #0x8\n"
3323 "ldr q18, [a_ptr4], #0x10\n"
3324 "ldr d7, [a_ptr1], #0x8\n"
3325 "ldr q22, [a_ptr5], #0x10\n"
3326 "ldr d11, [a_ptr2], #0x8\n"
3327 "ldr q24, [%[b_ptr0]]\n"
3328 "ldr d15, [a_ptr3], #0x8\n"
3329 "ldr q25, [%[b_ptr0], #0x10]\n"
3330 "ldr d19, [a_ptr4], #0x8\n"
3331 "ldr d23, [a_ptr5], #0x8\n"
3332 "ld1 {v3.s}[2], [%[a_ptr0]]\n"
3333 "ld1 {v7.s}[2], [a_ptr1]\n"
3334 "ld1 {v11.s}[2], [a_ptr2]\n"
3335 "ld1 {v15.s}[2], [a_ptr3]\n"
3336 "ld1 {v19.s}[2], [a_ptr4]\n"
3337 "ld1 {v23.s}[2], [a_ptr5]\n"
3338 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
3339 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
3340 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
3341 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
3342 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
3343 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
3344 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3345 "cbz %[loops], 2f\n"
3346 "ldr q26, [%[biasptr]]\n"
3347 "add %[biasptr], %[biasptr], %[biasinc]\n"
3348 "subs %[loops], %[loops], #0x1\n"
3349 "mov v27.16b, v26.16b\n"
3350 "mov v28.16b, v26.16b\n"
3351 "mov v29.16b, v26.16b\n"
3352 "mov v30.16b, v26.16b\n"
3353 "mov v31.16b, v26.16b\n"
3354 "fmla v26.4s, v24.4s, v0.s[0]\n"
3355 "fmla v27.4s, v24.4s, v4.s[0]\n"
3356 "fmla v28.4s, v24.4s, v8.s[0]\n"
3357 "fmla v29.4s, v24.4s, v12.s[0]\n"
3358 "fmla v30.4s, v24.4s, v16.s[0]\n"
3359 "fmla v31.4s, v24.4s, v20.s[0]\n"
3360 "ldr q24, [%[b_ptr0]]\n"
3361 "fmla v26.4s, v25.4s, v0.s[1]\n"
3362 "fmla v27.4s, v25.4s, v4.s[1]\n"
3363 "fmla v28.4s, v25.4s, v8.s[1]\n"
3364 "fmla v29.4s, v25.4s, v12.s[1]\n"
3365 "fmla v30.4s, v25.4s, v16.s[1]\n"
3366 "fmla v31.4s, v25.4s, v20.s[1]\n"
3367 "ldr q25, [%[b_ptr0], #0x10]\n"
3368 "fmla v26.4s, v24.4s, v0.s[2]\n"
3369 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3370 "fmla v27.4s, v24.4s, v4.s[2]\n"
3371 "fmla v28.4s, v24.4s, v8.s[2]\n"
3372 "fmla v29.4s, v24.4s, v12.s[2]\n"
3373 "fmla v30.4s, v24.4s, v16.s[2]\n"
3374 "fmla v31.4s, v24.4s, v20.s[2]\n"
3375 "ldr q24, [%[b_ptr0]]\n"
3376 "fmla v26.4s, v25.4s, v0.s[3]\n"
3377 "fmla v27.4s, v25.4s, v4.s[3]\n"
3378 "fmla v28.4s, v25.4s, v8.s[3]\n"
3379 "fmla v29.4s, v25.4s, v12.s[3]\n"
3380 "fmla v30.4s, v25.4s, v16.s[3]\n"
3381 "fmla v31.4s, v25.4s, v20.s[3]\n"
3382 "ldr q25, [%[b_ptr0], #0x10]\n"
3383 "fmla v26.4s, v24.4s, v1.s[0]\n"
3384 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3385 "fmla v27.4s, v24.4s, v5.s[0]\n"
3386 "fmla v28.4s, v24.4s, v9.s[0]\n"
3387 "fmla v29.4s, v24.4s, v13.s[0]\n"
3388 "fmla v30.4s, v24.4s, v17.s[0]\n"
3389 "fmla v31.4s, v24.4s, v21.s[0]\n"
3390 "ldr q24, [%[b_ptr0]]\n"
3391 "fmla v26.4s, v25.4s, v1.s[1]\n"
3392 "fmla v27.4s, v25.4s, v5.s[1]\n"
3393 "fmla v28.4s, v25.4s, v9.s[1]\n"
3394 "fmla v29.4s, v25.4s, v13.s[1]\n"
3395 "fmla v30.4s, v25.4s, v17.s[1]\n"
3396 "fmla v31.4s, v25.4s, v21.s[1]\n"
3397 "ldr q25, [%[b_ptr0], #0x10]\n"
3398 "fmla v26.4s, v24.4s, v1.s[2]\n"
3399 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3400 "fmla v27.4s, v24.4s, v5.s[2]\n"
3401 "fmla v28.4s, v24.4s, v9.s[2]\n"
3402 "fmla v29.4s, v24.4s, v13.s[2]\n"
3403 "fmla v30.4s, v24.4s, v17.s[2]\n"
3404 "fmla v31.4s, v24.4s, v21.s[2]\n"
3405 "ldr q24, [%[b_ptr0]]\n"
3406 "fmla v26.4s, v25.4s, v1.s[3]\n"
3407 "fmla v27.4s, v25.4s, v5.s[3]\n"
3408 "fmla v28.4s, v25.4s, v9.s[3]\n"
3409 "fmla v29.4s, v25.4s, v13.s[3]\n"
3410 "fmla v30.4s, v25.4s, v17.s[3]\n"
3411 "fmla v31.4s, v25.4s, v21.s[3]\n"
3412 "ldr q25, [%[b_ptr0], #0x10]\n"
3413 "fmla v26.4s, v24.4s, v2.s[0]\n"
3414 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3415 "fmla v27.4s, v24.4s, v6.s[0]\n"
3416 "fmla v28.4s, v24.4s, v10.s[0]\n"
3417 "fmla v29.4s, v24.4s, v14.s[0]\n"
3418 "fmla v30.4s, v24.4s, v18.s[0]\n"
3419 "fmla v31.4s, v24.4s, v22.s[0]\n"
3420 "ldr q24, [%[b_ptr0]]\n"
3421 "fmla v26.4s, v25.4s, v2.s[1]\n"
3422 "fmla v27.4s, v25.4s, v6.s[1]\n"
3423 "fmla v28.4s, v25.4s, v10.s[1]\n"
3424 "fmla v29.4s, v25.4s, v14.s[1]\n"
3425 "fmla v30.4s, v25.4s, v18.s[1]\n"
3426 "fmla v31.4s, v25.4s, v22.s[1]\n"
3427 "ldr q25, [%[b_ptr0], #0x10]\n"
3428 "fmla v26.4s, v24.4s, v2.s[2]\n"
3429 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3430 "fmla v27.4s, v24.4s, v6.s[2]\n"
3431 "fmla v28.4s, v24.4s, v10.s[2]\n"
3432 "fmla v29.4s, v24.4s, v14.s[2]\n"
3433 "fmla v30.4s, v24.4s, v18.s[2]\n"
3434 "fmla v31.4s, v24.4s, v22.s[2]\n"
3435 "ldr q24, [%[b_ptr0]]\n"
3436 "fmla v26.4s, v25.4s, v2.s[3]\n"
3437 "fmla v27.4s, v25.4s, v6.s[3]\n"
3438 "fmla v28.4s, v25.4s, v10.s[3]\n"
3439 "fmla v29.4s, v25.4s, v14.s[3]\n"
3440 "fmla v30.4s, v25.4s, v18.s[3]\n"
3441 "fmla v31.4s, v25.4s, v22.s[3]\n"
3442 "ldr q25, [%[b_ptr0], #0x10]\n"
3443 "fmla v26.4s, v24.4s, v3.s[0]\n"
3444 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3445 "fmla v27.4s, v24.4s, v7.s[0]\n"
3446 "fmla v28.4s, v24.4s, v11.s[0]\n"
3447 "fmla v29.4s, v24.4s, v15.s[0]\n"
3448 "fmla v30.4s, v24.4s, v19.s[0]\n"
3449 "fmla v31.4s, v24.4s, v23.s[0]\n"
3450 "ldr q24, [%[b_ptr0]]\n"
3451 "fmla v26.4s, v25.4s, v3.s[1]\n"
3452 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3453 "fmla v27.4s, v25.4s, v7.s[1]\n"
3454 "fmla v28.4s, v25.4s, v11.s[1]\n"
3455 "fmla v29.4s, v25.4s, v15.s[1]\n"
3456 "fmla v30.4s, v25.4s, v19.s[1]\n"
3457 "fmla v31.4s, v25.4s, v23.s[1]\n"
3458 "fmla v26.4s, v24.4s, v3.s[2]\n"
3459 "fmla v27.4s, v24.4s, v7.s[2]\n"
3460 "fmla v28.4s, v24.4s, v11.s[2]\n"
3461 "fmla v29.4s, v24.4s, v15.s[2]\n"
3462 "fmla v30.4s, v24.4s, v19.s[2]\n"
3463 "fmla v31.4s, v24.4s, v23.s[2]\n"
3464 "b.eq 3f\n"
3465 "4:\n"
3466 "ld1r {v24.4s}, [%[minptr]]\n"
3467 "subs %[loops], %[loops], #0x1\n"
3468 "ld1r {v25.4s}, [%[maxptr]]\n"
3469 "fmax v26.4s, v26.4s, v24.4s\n"
3470 "fmax v27.4s, v27.4s, v24.4s\n"
3471 "fmax v28.4s, v28.4s, v24.4s\n"
3472 "fmax v29.4s, v29.4s, v24.4s\n"
3473 "fmin v26.4s, v26.4s, v25.4s\n"
3474 "fmin v27.4s, v27.4s, v25.4s\n"
3475 "fmin v28.4s, v28.4s, v25.4s\n"
3476 "fmin v29.4s, v29.4s, v25.4s\n"
3477 "str q26, [%[c_ptr0]]\n"
3478 "fmax v30.4s, v30.4s, v24.4s\n"
3479 "ldr q26, [%[biasptr]]\n"
3480 "fmax v31.4s, v31.4s, v24.4s\n"
3481 "ldr q24, [%[b_ptr0]]\n"
3482 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3483 "str q27, [c_ptr1]\n"
3484 "add c_ptr1, c_ptr1, #0x10\n"
3485 "fmin v30.4s, v30.4s, v25.4s\n"
3486 "add %[biasptr], %[biasptr], %[biasinc]\n"
3487 "fmin v31.4s, v31.4s, v25.4s\n"
3488 "str q28, [c_ptr2]\n"
3489 "mov v27.16b, v26.16b\n"
3490 "ldr q25, [%[b_ptr0], #0x10]\n"
3491 "mov v28.16b, v26.16b\n"
3492 "add c_ptr2, c_ptr2, #0x10\n"
3493 "str q29, [c_ptr3]\n"
3494 "add c_ptr3, c_ptr3, #0x10\n"
3495 "mov v29.16b, v26.16b\n"
3496 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3497 "fmla v27.4s, v24.4s, v4.s[0]\n"
3498 "str q30, [c_ptr4]\n"
3499 "mov v30.16b, v26.16b\n"
3500 "add c_ptr4, c_ptr4, #0x10\n"
3501 "fmla v28.4s, v24.4s, v8.s[0]\n"
3502 "str q31, [c_ptr5]\n"
3503 "mov v31.16b, v26.16b\n"
3504 "add c_ptr5, c_ptr5, #0x10\n"
3505 "fmla v26.4s, v24.4s, v0.s[0]\n"
3506 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
3507 "fmla v29.4s, v24.4s, v12.s[0]\n"
3508 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
3509 "fmla v30.4s, v24.4s, v16.s[0]\n"
3510 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
3511 "fmla v31.4s, v24.4s, v20.s[0]\n"
3512 "ldr q24, [%[b_ptr0]]\n"
3513 "fmla v26.4s, v25.4s, v0.s[1]\n"
3514 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
3515 "fmla v27.4s, v25.4s, v4.s[1]\n"
3516 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
3517 "fmla v28.4s, v25.4s, v8.s[1]\n"
3518 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
3519 "fmla v29.4s, v25.4s, v12.s[1]\n"
3520 "fmla v30.4s, v25.4s, v16.s[1]\n"
3521 "fmla v31.4s, v25.4s, v20.s[1]\n"
3522 "ldr q25, [%[b_ptr0], #0x10]\n"
3523 "fmla v26.4s, v24.4s, v0.s[2]\n"
3524 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3525 "fmla v27.4s, v24.4s, v4.s[2]\n"
3526 "fmla v28.4s, v24.4s, v8.s[2]\n"
3527 "fmla v29.4s, v24.4s, v12.s[2]\n"
3528 "fmla v30.4s, v24.4s, v16.s[2]\n"
3529 "fmla v31.4s, v24.4s, v20.s[2]\n"
3530 "ldr q24, [%[b_ptr0]]\n"
3531 "fmla v26.4s, v25.4s, v0.s[3]\n"
3532 "fmla v27.4s, v25.4s, v4.s[3]\n"
3533 "fmla v28.4s, v25.4s, v8.s[3]\n"
3534 "fmla v29.4s, v25.4s, v12.s[3]\n"
3535 "fmla v30.4s, v25.4s, v16.s[3]\n"
3536 "fmla v31.4s, v25.4s, v20.s[3]\n"
3537 "ldr q25, [%[b_ptr0], #0x10]\n"
3538 "fmla v26.4s, v24.4s, v1.s[0]\n"
3539 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3540 "fmla v27.4s, v24.4s, v5.s[0]\n"
3541 "fmla v28.4s, v24.4s, v9.s[0]\n"
3542 "fmla v29.4s, v24.4s, v13.s[0]\n"
3543 "fmla v30.4s, v24.4s, v17.s[0]\n"
3544 "fmla v31.4s, v24.4s, v21.s[0]\n"
3545 "ldr q24, [%[b_ptr0]]\n"
3546 "fmla v26.4s, v25.4s, v1.s[1]\n"
3547 "fmla v27.4s, v25.4s, v5.s[1]\n"
3548 "fmla v28.4s, v25.4s, v9.s[1]\n"
3549 "fmla v29.4s, v25.4s, v13.s[1]\n"
3550 "fmla v30.4s, v25.4s, v17.s[1]\n"
3551 "fmla v31.4s, v25.4s, v21.s[1]\n"
3552 "ldr q25, [%[b_ptr0], #0x10]\n"
3553 "fmla v26.4s, v24.4s, v1.s[2]\n"
3554 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3555 "fmla v27.4s, v24.4s, v5.s[2]\n"
3556 "fmla v28.4s, v24.4s, v9.s[2]\n"
3557 "fmla v29.4s, v24.4s, v13.s[2]\n"
3558 "fmla v30.4s, v24.4s, v17.s[2]\n"
3559 "fmla v31.4s, v24.4s, v21.s[2]\n"
3560 "ldr q24, [%[b_ptr0]]\n"
3561 "fmla v26.4s, v25.4s, v1.s[3]\n"
3562 "fmla v27.4s, v25.4s, v5.s[3]\n"
3563 "fmla v28.4s, v25.4s, v9.s[3]\n"
3564 "fmla v29.4s, v25.4s, v13.s[3]\n"
3565 "fmla v30.4s, v25.4s, v17.s[3]\n"
3566 "fmla v31.4s, v25.4s, v21.s[3]\n"
3567 "ldr q25, [%[b_ptr0], #0x10]\n"
3568 "fmla v26.4s, v24.4s, v2.s[0]\n"
3569 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3570 "fmla v27.4s, v24.4s, v6.s[0]\n"
3571 "fmla v28.4s, v24.4s, v10.s[0]\n"
3572 "fmla v29.4s, v24.4s, v14.s[0]\n"
3573 "fmla v30.4s, v24.4s, v18.s[0]\n"
3574 "fmla v31.4s, v24.4s, v22.s[0]\n"
3575 "ldr q24, [%[b_ptr0]]\n"
3576 "fmla v26.4s, v25.4s, v2.s[1]\n"
3577 "fmla v27.4s, v25.4s, v6.s[1]\n"
3578 "fmla v28.4s, v25.4s, v10.s[1]\n"
3579 "fmla v29.4s, v25.4s, v14.s[1]\n"
3580 "fmla v30.4s, v25.4s, v18.s[1]\n"
3581 "fmla v31.4s, v25.4s, v22.s[1]\n"
3582 "ldr q25, [%[b_ptr0], #0x10]\n"
3583 "fmla v26.4s, v24.4s, v2.s[2]\n"
3584 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3585 "fmla v27.4s, v24.4s, v6.s[2]\n"
3586 "fmla v28.4s, v24.4s, v10.s[2]\n"
3587 "fmla v29.4s, v24.4s, v14.s[2]\n"
3588 "fmla v30.4s, v24.4s, v18.s[2]\n"
3589 "fmla v31.4s, v24.4s, v22.s[2]\n"
3590 "ldr q24, [%[b_ptr0]]\n"
3591 "fmla v26.4s, v25.4s, v2.s[3]\n"
3592 "fmla v27.4s, v25.4s, v6.s[3]\n"
3593 "fmla v28.4s, v25.4s, v10.s[3]\n"
3594 "fmla v29.4s, v25.4s, v14.s[3]\n"
3595 "fmla v30.4s, v25.4s, v18.s[3]\n"
3596 "fmla v31.4s, v25.4s, v22.s[3]\n"
3597 "ldr q25, [%[b_ptr0], #0x10]\n"
3598 "fmla v26.4s, v24.4s, v3.s[0]\n"
3599 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3600 "fmla v27.4s, v24.4s, v7.s[0]\n"
3601 "fmla v28.4s, v24.4s, v11.s[0]\n"
3602 "fmla v29.4s, v24.4s, v15.s[0]\n"
3603 "fmla v30.4s, v24.4s, v19.s[0]\n"
3604 "fmla v31.4s, v24.4s, v23.s[0]\n"
3605 "ldr q24, [%[b_ptr0]]\n"
3606 "fmla v26.4s, v25.4s, v3.s[1]\n"
3607 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3608 "fmla v27.4s, v25.4s, v7.s[1]\n"
3609 "fmla v28.4s, v25.4s, v11.s[1]\n"
3610 "fmla v29.4s, v25.4s, v15.s[1]\n"
3611 "fmla v30.4s, v25.4s, v19.s[1]\n"
3612 "fmla v31.4s, v25.4s, v23.s[1]\n"
3613 "fmla v26.4s, v24.4s, v3.s[2]\n"
3614 "fmla v27.4s, v24.4s, v7.s[2]\n"
3615 "fmla v28.4s, v24.4s, v11.s[2]\n"
3616 "fmla v29.4s, v24.4s, v15.s[2]\n"
3617 "fmla v30.4s, v24.4s, v19.s[2]\n"
3618 "fmla v31.4s, v24.4s, v23.s[2]\n"
3619 "b.ne 4b\n"
3620 "3:\n"
3621 "ld1r {v24.4s}, [%[minptr]]\n"
3622 "ld1r {v25.4s}, [%[maxptr]]\n"
3623 "fmax v26.4s, v26.4s, v24.4s\n"
3624 "fmax v27.4s, v27.4s, v24.4s\n"
3625 "fmax v28.4s, v28.4s, v24.4s\n"
3626 "fmax v29.4s, v29.4s, v24.4s\n"
3627 "fmin v26.4s, v26.4s, v25.4s\n"
3628 "fmin v27.4s, v27.4s, v25.4s\n"
3629 "fmin v28.4s, v28.4s, v25.4s\n"
3630 "fmin v29.4s, v29.4s, v25.4s\n"
3631 "str q26, [%[c_ptr0]]\n"
3632 "fmax v30.4s, v30.4s, v24.4s\n"
3633 "ldr q26, [%[biasptr]]\n"
3634 "fmax v31.4s, v31.4s, v24.4s\n"
3635 "ldr q24, [%[b_ptr0]]\n"
3636 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3637 "str q27, [c_ptr1]\n"
3638 "add c_ptr1, c_ptr1, #0x10\n"
3639 "fmin v30.4s, v30.4s, v25.4s\n"
3640 "add %[biasptr], %[biasptr], %[biasinc]\n"
3641 "fmin v31.4s, v31.4s, v25.4s\n"
3642 "str q28, [c_ptr2]\n"
3643 "mov v27.16b, v26.16b\n"
3644 "ldr q25, [%[b_ptr0], #0x10]\n"
3645 "mov v28.16b, v26.16b\n"
3646 "add c_ptr2, c_ptr2, #0x10\n"
3647 "str q29, [c_ptr3]\n"
3648 "add c_ptr3, c_ptr3, #0x10\n"
3649 "mov v29.16b, v26.16b\n"
3650 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3651 "fmla v27.4s, v24.4s, v4.s[0]\n"
3652 "str q30, [c_ptr4]\n"
3653 "mov v30.16b, v26.16b\n"
3654 "add c_ptr4, c_ptr4, #0x10\n"
3655 "fmla v28.4s, v24.4s, v8.s[0]\n"
3656 "str q31, [c_ptr5]\n"
3657 "mov v31.16b, v26.16b\n"
3658 "add c_ptr5, c_ptr5, #0x10\n"
3659 "fmla v26.4s, v24.4s, v0.s[0]\n"
3660 "fmla v29.4s, v24.4s, v12.s[0]\n"
3661 "fmla v30.4s, v24.4s, v16.s[0]\n"
3662 "fmla v31.4s, v24.4s, v20.s[0]\n"
3663 "ldr q24, [%[b_ptr0]]\n"
3664 "fmla v26.4s, v25.4s, v0.s[1]\n"
3665 "fmla v27.4s, v25.4s, v4.s[1]\n"
3666 "fmla v28.4s, v25.4s, v8.s[1]\n"
3667 "fmla v29.4s, v25.4s, v12.s[1]\n"
3668 "fmla v30.4s, v25.4s, v16.s[1]\n"
3669 "fmla v31.4s, v25.4s, v20.s[1]\n"
3670 "ldr q25, [%[b_ptr0], #0x10]\n"
3671 "fmla v26.4s, v24.4s, v0.s[2]\n"
3672 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3673 "fmla v27.4s, v24.4s, v4.s[2]\n"
3674 "fmla v28.4s, v24.4s, v8.s[2]\n"
3675 "fmla v29.4s, v24.4s, v12.s[2]\n"
3676 "fmla v30.4s, v24.4s, v16.s[2]\n"
3677 "fmla v31.4s, v24.4s, v20.s[2]\n"
3678 "ldr q24, [%[b_ptr0]]\n"
3679 "fmla v26.4s, v25.4s, v0.s[3]\n"
3680 "fmla v27.4s, v25.4s, v4.s[3]\n"
3681 "fmla v28.4s, v25.4s, v8.s[3]\n"
3682 "fmla v29.4s, v25.4s, v12.s[3]\n"
3683 "fmla v30.4s, v25.4s, v16.s[3]\n"
3684 "fmla v31.4s, v25.4s, v20.s[3]\n"
3685 "ldr q25, [%[b_ptr0], #0x10]\n"
3686 "fmla v26.4s, v24.4s, v1.s[0]\n"
3687 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3688 "fmla v27.4s, v24.4s, v5.s[0]\n"
3689 "fmla v28.4s, v24.4s, v9.s[0]\n"
3690 "fmla v29.4s, v24.4s, v13.s[0]\n"
3691 "fmla v30.4s, v24.4s, v17.s[0]\n"
3692 "fmla v31.4s, v24.4s, v21.s[0]\n"
3693 "ldr q24, [%[b_ptr0]]\n"
3694 "fmla v26.4s, v25.4s, v1.s[1]\n"
3695 "fmla v27.4s, v25.4s, v5.s[1]\n"
3696 "fmla v28.4s, v25.4s, v9.s[1]\n"
3697 "fmla v29.4s, v25.4s, v13.s[1]\n"
3698 "fmla v30.4s, v25.4s, v17.s[1]\n"
3699 "fmla v31.4s, v25.4s, v21.s[1]\n"
3700 "ldr q25, [%[b_ptr0], #0x10]\n"
3701 "fmla v26.4s, v24.4s, v1.s[2]\n"
3702 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3703 "fmla v27.4s, v24.4s, v5.s[2]\n"
3704 "fmla v28.4s, v24.4s, v9.s[2]\n"
3705 "fmla v29.4s, v24.4s, v13.s[2]\n"
3706 "fmla v30.4s, v24.4s, v17.s[2]\n"
3707 "fmla v31.4s, v24.4s, v21.s[2]\n"
3708 "ldr q24, [%[b_ptr0]]\n"
3709 "fmla v26.4s, v25.4s, v1.s[3]\n"
3710 "fmla v27.4s, v25.4s, v5.s[3]\n"
3711 "fmla v28.4s, v25.4s, v9.s[3]\n"
3712 "fmla v29.4s, v25.4s, v13.s[3]\n"
3713 "fmla v30.4s, v25.4s, v17.s[3]\n"
3714 "fmla v31.4s, v25.4s, v21.s[3]\n"
3715 "ldr q25, [%[b_ptr0], #0x10]\n"
3716 "fmla v26.4s, v24.4s, v2.s[0]\n"
3717 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3718 "fmla v27.4s, v24.4s, v6.s[0]\n"
3719 "fmla v28.4s, v24.4s, v10.s[0]\n"
3720 "fmla v29.4s, v24.4s, v14.s[0]\n"
3721 "fmla v30.4s, v24.4s, v18.s[0]\n"
3722 "fmla v31.4s, v24.4s, v22.s[0]\n"
3723 "ldr q24, [%[b_ptr0]]\n"
3724 "fmla v26.4s, v25.4s, v2.s[1]\n"
3725 "fmla v27.4s, v25.4s, v6.s[1]\n"
3726 "fmla v28.4s, v25.4s, v10.s[1]\n"
3727 "fmla v29.4s, v25.4s, v14.s[1]\n"
3728 "fmla v30.4s, v25.4s, v18.s[1]\n"
3729 "fmla v31.4s, v25.4s, v22.s[1]\n"
3730 "ldr q25, [%[b_ptr0], #0x10]\n"
3731 "fmla v26.4s, v24.4s, v2.s[2]\n"
3732 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3733 "fmla v27.4s, v24.4s, v6.s[2]\n"
3734 "fmla v28.4s, v24.4s, v10.s[2]\n"
3735 "fmla v29.4s, v24.4s, v14.s[2]\n"
3736 "fmla v30.4s, v24.4s, v18.s[2]\n"
3737 "fmla v31.4s, v24.4s, v22.s[2]\n"
3738 "ldr q24, [%[b_ptr0]]\n"
3739 "fmla v26.4s, v25.4s, v2.s[3]\n"
3740 "fmla v27.4s, v25.4s, v6.s[3]\n"
3741 "fmla v28.4s, v25.4s, v10.s[3]\n"
3742 "fmla v29.4s, v25.4s, v14.s[3]\n"
3743 "fmla v30.4s, v25.4s, v18.s[3]\n"
3744 "fmla v31.4s, v25.4s, v22.s[3]\n"
3745 "ldr q25, [%[b_ptr0], #0x10]\n"
3746 "fmla v26.4s, v24.4s, v3.s[0]\n"
3747 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3748 "fmla v27.4s, v24.4s, v7.s[0]\n"
3749 "fmla v28.4s, v24.4s, v11.s[0]\n"
3750 "fmla v29.4s, v24.4s, v15.s[0]\n"
3751 "fmla v30.4s, v24.4s, v19.s[0]\n"
3752 "fmla v31.4s, v24.4s, v23.s[0]\n"
3753 "ldr q24, [%[b_ptr0]]\n"
3754 "fmla v26.4s, v25.4s, v3.s[1]\n"
3755 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3756 "fmla v27.4s, v25.4s, v7.s[1]\n"
3757 "fmla v28.4s, v25.4s, v11.s[1]\n"
3758 "fmla v29.4s, v25.4s, v15.s[1]\n"
3759 "fmla v30.4s, v25.4s, v19.s[1]\n"
3760 "fmla v31.4s, v25.4s, v23.s[1]\n"
3761 "fmla v26.4s, v24.4s, v3.s[2]\n"
3762 "fmla v27.4s, v24.4s, v7.s[2]\n"
3763 "fmla v28.4s, v24.4s, v11.s[2]\n"
3764 "fmla v29.4s, v24.4s, v15.s[2]\n"
3765 "fmla v30.4s, v24.4s, v19.s[2]\n"
3766 "fmla v31.4s, v24.4s, v23.s[2]\n"
3767 "b 5f\n"
3768 "2:\n"
3769 "ldr q26, [%[biasptr]]\n"
3770 "add %[biasptr], %[biasptr], %[biasinc]\n"
3771 "mov v27.16b, v26.16b\n"
3772 "mov v28.16b, v26.16b\n"
3773 "mov v29.16b, v26.16b\n"
3774 "mov v30.16b, v26.16b\n"
3775 "mov v31.16b, v26.16b\n"
3776 "fmla v26.4s, v24.4s, v0.s[0]\n"
3777 "fmla v27.4s, v24.4s, v4.s[0]\n"
3778 "fmla v28.4s, v24.4s, v8.s[0]\n"
3779 "fmla v29.4s, v24.4s, v12.s[0]\n"
3780 "fmla v30.4s, v24.4s, v16.s[0]\n"
3781 "fmla v31.4s, v24.4s, v20.s[0]\n"
3782 "ldr q24, [%[b_ptr0]]\n"
3783 "fmla v26.4s, v25.4s, v0.s[1]\n"
3784 "fmla v27.4s, v25.4s, v4.s[1]\n"
3785 "fmla v28.4s, v25.4s, v8.s[1]\n"
3786 "fmla v29.4s, v25.4s, v12.s[1]\n"
3787 "fmla v30.4s, v25.4s, v16.s[1]\n"
3788 "fmla v31.4s, v25.4s, v20.s[1]\n"
3789 "ldr q25, [%[b_ptr0], #0x10]\n"
3790 "fmla v26.4s, v24.4s, v0.s[2]\n"
3791 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3792 "fmla v27.4s, v24.4s, v4.s[2]\n"
3793 "fmla v28.4s, v24.4s, v8.s[2]\n"
3794 "fmla v29.4s, v24.4s, v12.s[2]\n"
3795 "fmla v30.4s, v24.4s, v16.s[2]\n"
3796 "fmla v31.4s, v24.4s, v20.s[2]\n"
3797 "ldr q24, [%[b_ptr0]]\n"
3798 "fmla v26.4s, v25.4s, v0.s[3]\n"
3799 "fmla v27.4s, v25.4s, v4.s[3]\n"
3800 "fmla v28.4s, v25.4s, v8.s[3]\n"
3801 "fmla v29.4s, v25.4s, v12.s[3]\n"
3802 "fmla v30.4s, v25.4s, v16.s[3]\n"
3803 "fmla v31.4s, v25.4s, v20.s[3]\n"
3804 "ldr q25, [%[b_ptr0], #0x10]\n"
3805 "fmla v26.4s, v24.4s, v1.s[0]\n"
3806 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3807 "fmla v27.4s, v24.4s, v5.s[0]\n"
3808 "fmla v28.4s, v24.4s, v9.s[0]\n"
3809 "fmla v29.4s, v24.4s, v13.s[0]\n"
3810 "fmla v30.4s, v24.4s, v17.s[0]\n"
3811 "fmla v31.4s, v24.4s, v21.s[0]\n"
3812 "ldr q24, [%[b_ptr0]]\n"
3813 "fmla v26.4s, v25.4s, v1.s[1]\n"
3814 "fmla v27.4s, v25.4s, v5.s[1]\n"
3815 "fmla v28.4s, v25.4s, v9.s[1]\n"
3816 "fmla v29.4s, v25.4s, v13.s[1]\n"
3817 "fmla v30.4s, v25.4s, v17.s[1]\n"
3818 "fmla v31.4s, v25.4s, v21.s[1]\n"
3819 "ldr q25, [%[b_ptr0], #0x10]\n"
3820 "fmla v26.4s, v24.4s, v1.s[2]\n"
3821 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3822 "fmla v27.4s, v24.4s, v5.s[2]\n"
3823 "fmla v28.4s, v24.4s, v9.s[2]\n"
3824 "fmla v29.4s, v24.4s, v13.s[2]\n"
3825 "fmla v30.4s, v24.4s, v17.s[2]\n"
3826 "fmla v31.4s, v24.4s, v21.s[2]\n"
3827 "ldr q24, [%[b_ptr0]]\n"
3828 "fmla v26.4s, v25.4s, v1.s[3]\n"
3829 "fmla v27.4s, v25.4s, v5.s[3]\n"
3830 "fmla v28.4s, v25.4s, v9.s[3]\n"
3831 "fmla v29.4s, v25.4s, v13.s[3]\n"
3832 "fmla v30.4s, v25.4s, v17.s[3]\n"
3833 "fmla v31.4s, v25.4s, v21.s[3]\n"
3834 "ldr q25, [%[b_ptr0], #0x10]\n"
3835 "fmla v26.4s, v24.4s, v2.s[0]\n"
3836 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3837 "fmla v27.4s, v24.4s, v6.s[0]\n"
3838 "fmla v28.4s, v24.4s, v10.s[0]\n"
3839 "fmla v29.4s, v24.4s, v14.s[0]\n"
3840 "fmla v30.4s, v24.4s, v18.s[0]\n"
3841 "fmla v31.4s, v24.4s, v22.s[0]\n"
3842 "ldr q24, [%[b_ptr0]]\n"
3843 "fmla v26.4s, v25.4s, v2.s[1]\n"
3844 "fmla v27.4s, v25.4s, v6.s[1]\n"
3845 "fmla v28.4s, v25.4s, v10.s[1]\n"
3846 "fmla v29.4s, v25.4s, v14.s[1]\n"
3847 "fmla v30.4s, v25.4s, v18.s[1]\n"
3848 "fmla v31.4s, v25.4s, v22.s[1]\n"
3849 "ldr q25, [%[b_ptr0], #0x10]\n"
3850 "fmla v26.4s, v24.4s, v2.s[2]\n"
3851 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3852 "fmla v27.4s, v24.4s, v6.s[2]\n"
3853 "fmla v28.4s, v24.4s, v10.s[2]\n"
3854 "fmla v29.4s, v24.4s, v14.s[2]\n"
3855 "fmla v30.4s, v24.4s, v18.s[2]\n"
3856 "fmla v31.4s, v24.4s, v22.s[2]\n"
3857 "ldr q24, [%[b_ptr0]]\n"
3858 "fmla v26.4s, v25.4s, v2.s[3]\n"
3859 "fmla v27.4s, v25.4s, v6.s[3]\n"
3860 "fmla v28.4s, v25.4s, v10.s[3]\n"
3861 "fmla v29.4s, v25.4s, v14.s[3]\n"
3862 "fmla v30.4s, v25.4s, v18.s[3]\n"
3863 "fmla v31.4s, v25.4s, v22.s[3]\n"
3864 "ldr q25, [%[b_ptr0], #0x10]\n"
3865 "fmla v26.4s, v24.4s, v3.s[0]\n"
3866 "add %[b_ptr0], %[b_ptr0], #0x20\n"
3867 "fmla v27.4s, v24.4s, v7.s[0]\n"
3868 "fmla v28.4s, v24.4s, v11.s[0]\n"
3869 "fmla v29.4s, v24.4s, v15.s[0]\n"
3870 "fmla v30.4s, v24.4s, v19.s[0]\n"
3871 "fmla v31.4s, v24.4s, v23.s[0]\n"
3872 "ldr q24, [%[b_ptr0]]\n"
3873 "fmla v26.4s, v25.4s, v3.s[1]\n"
3874 "add %[b_ptr0], %[b_ptr0], #0x10\n"
3875 "fmla v27.4s, v25.4s, v7.s[1]\n"
3876 "fmla v28.4s, v25.4s, v11.s[1]\n"
3877 "fmla v29.4s, v25.4s, v15.s[1]\n"
3878 "fmla v30.4s, v25.4s, v19.s[1]\n"
3879 "fmla v31.4s, v25.4s, v23.s[1]\n"
3880 "fmla v26.4s, v24.4s, v3.s[2]\n"
3881 "fmla v27.4s, v24.4s, v7.s[2]\n"
3882 "fmla v28.4s, v24.4s, v11.s[2]\n"
3883 "fmla v29.4s, v24.4s, v15.s[2]\n"
3884 "fmla v30.4s, v24.4s, v19.s[2]\n"
3885 "fmla v31.4s, v24.4s, v23.s[2]\n"
3886 "5:\n"
3887 "ld1r {v24.4s}, [%[minptr]]\n"
3888 "ld1r {v25.4s}, [%[maxptr]]\n"
3889 "fmax v26.4s, v26.4s, v24.4s\n"
3890 "fmax v27.4s, v27.4s, v24.4s\n"
3891 "fmax v28.4s, v28.4s, v24.4s\n"
3892 "fmax v29.4s, v29.4s, v24.4s\n"
3893 "fmin v26.4s, v26.4s, v25.4s\n"
3894 "fmin v27.4s, v27.4s, v25.4s\n"
3895 "fmin v28.4s, v28.4s, v25.4s\n"
3896 "fmin v29.4s, v29.4s, v25.4s\n"
3897 "str q26, [%[c_ptr0]]\n"
3898 "fmax v30.4s, v30.4s, v24.4s\n"
3899 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3900 "fmax v31.4s, v31.4s, v24.4s\n"
3901 "str q27, [c_ptr1]\n"
3902 "fmin v30.4s, v30.4s, v25.4s\n"
3903 "fmin v31.4s, v31.4s, v25.4s\n"
3904 "str q28, [c_ptr2]\n"
3905 "str q29, [c_ptr3]\n"
3906 "str q30, [c_ptr4]\n"
3907 "str q31, [c_ptr5]\n"
3908 ".unreq a_ptr1\n"
3909 ".unreq a_ptr2\n"
3910 ".unreq a_ptr3\n"
3911 ".unreq a_ptr4\n"
3912 ".unreq a_ptr5\n"
3913 ".unreq c_ptr1\n"
3914 ".unreq c_ptr2\n"
3915 ".unreq c_ptr3\n"
3916 ".unreq c_ptr4\n"
3917 ".unreq c_ptr5\n"
3918 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
3919 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
3920 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
3921 );
3922 break;
3923 default:
3924 case 16:
3925 __asm __volatile (
3926 "a_ptr1 .req X0\n"
3927 "a_ptr2 .req X1\n"
3928 "a_ptr3 .req X2\n"
3929 "a_ptr4 .req X3\n"
3930 "a_ptr5 .req X4\n"
3931 "c_ptr1 .req X5\n"
3932 "c_ptr2 .req X6\n"
3933 "c_ptr3 .req X7\n"
3934 "c_ptr4 .req X8\n"
3935 "c_ptr5 .req X9\n"
3936 "add a_ptr1, %[a_ptr0], %[lda]\n"
3937 "add c_ptr1, %[c_ptr0], %[ldc]\n"
3938 "add a_ptr2, a_ptr1, %[lda]\n"
3939 "add c_ptr2, c_ptr1, %[ldc]\n"
3940 "add a_ptr3, a_ptr2, %[lda]\n"
3941 "add c_ptr3, c_ptr2, %[ldc]\n"
3942 "add a_ptr4, a_ptr3, %[lda]\n"
3943 "add c_ptr4, c_ptr3, %[ldc]\n"
3944 "add a_ptr5, a_ptr4, %[lda]\n"
3945 "add c_ptr5, c_ptr4, %[ldc]\n"
3946 "cbz %[oob_rows], 1f\n"
3947 "subs %[oob_rows], %[oob_rows], #0x1\n"
3948 "add c_ptr5, %[c_ptr0], #0x0\n"
3949 "add a_ptr5, %[a_ptr0], #0x0\n"
3950 "b.eq 1f\n"
3951 "subs %[oob_rows], %[oob_rows], #0x1\n"
3952 "add c_ptr4, %[c_ptr0], #0x0\n"
3953 "add a_ptr4, %[a_ptr0], #0x0\n"
3954 "b.eq 1f\n"
3955 "subs %[oob_rows], %[oob_rows], #0x1\n"
3956 "add c_ptr3, %[c_ptr0], #0x0\n"
3957 "add a_ptr3, %[a_ptr0], #0x0\n"
3958 "b.eq 1f\n"
3959 "subs %[oob_rows], %[oob_rows], #0x1\n"
3960 "add c_ptr2, %[c_ptr0], #0x0\n"
3961 "add a_ptr2, %[a_ptr0], #0x0\n"
3962 "b.eq 1f\n"
3963 "subs %[oob_rows], %[oob_rows], #0x1\n"
3964 "add c_ptr1, %[c_ptr0], #0x0\n"
3965 "add a_ptr1, %[a_ptr0], #0x0\n"
3966 "1:\n"
3967 "ldr q0, [%[a_ptr0]], #0x10\n"
3968 "ldr q4, [a_ptr1], #0x10\n"
3969 "ldr q8, [a_ptr2], #0x10\n"
3970 "ldr q12, [a_ptr3], #0x10\n"
3971 "ldr q16, [a_ptr4], #0x10\n"
3972 "ldr q20, [a_ptr5], #0x10\n"
3973 "ldr q1, [%[a_ptr0]], #0x10\n"
3974 "ldr q5, [a_ptr1], #0x10\n"
3975 "ldr q9, [a_ptr2], #0x10\n"
3976 "ldr q13, [a_ptr3], #0x10\n"
3977 "ldr q17, [a_ptr4], #0x10\n"
3978 "ldr q21, [a_ptr5], #0x10\n"
3979 "ldr q2, [%[a_ptr0]], #0x10\n"
3980 "ldr q6, [a_ptr1], #0x10\n"
3981 "ldr q10, [a_ptr2], #0x10\n"
3982 "ldr q14, [a_ptr3], #0x10\n"
3983 "ldr q18, [a_ptr4], #0x10\n"
3984 "ldr q22, [a_ptr5], #0x10\n"
3985 "ldr q3, [%[a_ptr0]]\n"
3986 "ldr q7, [a_ptr1]\n"
3987 "ldr q11, [a_ptr2]\n"
3988 "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
3989 "ldr q15, [a_ptr3]\n"
3990 "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
3991 "ldr q19, [a_ptr4]\n"
3992 "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
3993 "ldr q23, [a_ptr5]\n"
3994 "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
3995 "ldr q24, [%[b_ptr0]]\n"
3996 "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
3997 "ldr q25, [%[b_ptr0], #0x10]\n"
3998 "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
3999 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4000 "cbz %[loops], 2f\n"
4001 "ldr q26, [%[biasptr]]\n"
4002 "add %[biasptr], %[biasptr], %[biasinc]\n"
4003 "subs %[loops], %[loops], #0x1\n"
4004 "mov v27.16b, v26.16b\n"
4005 "mov v28.16b, v26.16b\n"
4006 "mov v29.16b, v26.16b\n"
4007 "mov v30.16b, v26.16b\n"
4008 "mov v31.16b, v26.16b\n"
4009 "fmla v26.4s, v24.4s, v0.s[0]\n"
4010 "fmla v27.4s, v24.4s, v4.s[0]\n"
4011 "fmla v28.4s, v24.4s, v8.s[0]\n"
4012 "fmla v29.4s, v24.4s, v12.s[0]\n"
4013 "fmla v30.4s, v24.4s, v16.s[0]\n"
4014 "fmla v31.4s, v24.4s, v20.s[0]\n"
4015 "ldr q24, [%[b_ptr0]]\n"
4016 "fmla v26.4s, v25.4s, v0.s[1]\n"
4017 "fmla v27.4s, v25.4s, v4.s[1]\n"
4018 "fmla v28.4s, v25.4s, v8.s[1]\n"
4019 "fmla v29.4s, v25.4s, v12.s[1]\n"
4020 "fmla v30.4s, v25.4s, v16.s[1]\n"
4021 "fmla v31.4s, v25.4s, v20.s[1]\n"
4022 "ldr q25, [%[b_ptr0], #0x10]\n"
4023 "fmla v26.4s, v24.4s, v0.s[2]\n"
4024 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4025 "fmla v27.4s, v24.4s, v4.s[2]\n"
4026 "fmla v28.4s, v24.4s, v8.s[2]\n"
4027 "fmla v29.4s, v24.4s, v12.s[2]\n"
4028 "fmla v30.4s, v24.4s, v16.s[2]\n"
4029 "fmla v31.4s, v24.4s, v20.s[2]\n"
4030 "ldr q24, [%[b_ptr0]]\n"
4031 "fmla v26.4s, v25.4s, v0.s[3]\n"
4032 "fmla v27.4s, v25.4s, v4.s[3]\n"
4033 "fmla v28.4s, v25.4s, v8.s[3]\n"
4034 "fmla v29.4s, v25.4s, v12.s[3]\n"
4035 "fmla v30.4s, v25.4s, v16.s[3]\n"
4036 "fmla v31.4s, v25.4s, v20.s[3]\n"
4037 "ldr q25, [%[b_ptr0], #0x10]\n"
4038 "fmla v26.4s, v24.4s, v1.s[0]\n"
4039 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4040 "fmla v27.4s, v24.4s, v5.s[0]\n"
4041 "fmla v28.4s, v24.4s, v9.s[0]\n"
4042 "fmla v29.4s, v24.4s, v13.s[0]\n"
4043 "fmla v30.4s, v24.4s, v17.s[0]\n"
4044 "fmla v31.4s, v24.4s, v21.s[0]\n"
4045 "ldr q24, [%[b_ptr0]]\n"
4046 "fmla v26.4s, v25.4s, v1.s[1]\n"
4047 "fmla v27.4s, v25.4s, v5.s[1]\n"
4048 "fmla v28.4s, v25.4s, v9.s[1]\n"
4049 "fmla v29.4s, v25.4s, v13.s[1]\n"
4050 "fmla v30.4s, v25.4s, v17.s[1]\n"
4051 "fmla v31.4s, v25.4s, v21.s[1]\n"
4052 "ldr q25, [%[b_ptr0], #0x10]\n"
4053 "fmla v26.4s, v24.4s, v1.s[2]\n"
4054 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4055 "fmla v27.4s, v24.4s, v5.s[2]\n"
4056 "fmla v28.4s, v24.4s, v9.s[2]\n"
4057 "fmla v29.4s, v24.4s, v13.s[2]\n"
4058 "fmla v30.4s, v24.4s, v17.s[2]\n"
4059 "fmla v31.4s, v24.4s, v21.s[2]\n"
4060 "ldr q24, [%[b_ptr0]]\n"
4061 "fmla v26.4s, v25.4s, v1.s[3]\n"
4062 "fmla v27.4s, v25.4s, v5.s[3]\n"
4063 "fmla v28.4s, v25.4s, v9.s[3]\n"
4064 "fmla v29.4s, v25.4s, v13.s[3]\n"
4065 "fmla v30.4s, v25.4s, v17.s[3]\n"
4066 "fmla v31.4s, v25.4s, v21.s[3]\n"
4067 "ldr q25, [%[b_ptr0], #0x10]\n"
4068 "fmla v26.4s, v24.4s, v2.s[0]\n"
4069 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4070 "fmla v27.4s, v24.4s, v6.s[0]\n"
4071 "fmla v28.4s, v24.4s, v10.s[0]\n"
4072 "fmla v29.4s, v24.4s, v14.s[0]\n"
4073 "fmla v30.4s, v24.4s, v18.s[0]\n"
4074 "fmla v31.4s, v24.4s, v22.s[0]\n"
4075 "ldr q24, [%[b_ptr0]]\n"
4076 "fmla v26.4s, v25.4s, v2.s[1]\n"
4077 "fmla v27.4s, v25.4s, v6.s[1]\n"
4078 "fmla v28.4s, v25.4s, v10.s[1]\n"
4079 "fmla v29.4s, v25.4s, v14.s[1]\n"
4080 "fmla v30.4s, v25.4s, v18.s[1]\n"
4081 "fmla v31.4s, v25.4s, v22.s[1]\n"
4082 "ldr q25, [%[b_ptr0], #0x10]\n"
4083 "fmla v26.4s, v24.4s, v2.s[2]\n"
4084 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4085 "fmla v27.4s, v24.4s, v6.s[2]\n"
4086 "fmla v28.4s, v24.4s, v10.s[2]\n"
4087 "fmla v29.4s, v24.4s, v14.s[2]\n"
4088 "fmla v30.4s, v24.4s, v18.s[2]\n"
4089 "fmla v31.4s, v24.4s, v22.s[2]\n"
4090 "ldr q24, [%[b_ptr0]]\n"
4091 "fmla v26.4s, v25.4s, v2.s[3]\n"
4092 "fmla v27.4s, v25.4s, v6.s[3]\n"
4093 "fmla v28.4s, v25.4s, v10.s[3]\n"
4094 "fmla v29.4s, v25.4s, v14.s[3]\n"
4095 "fmla v30.4s, v25.4s, v18.s[3]\n"
4096 "fmla v31.4s, v25.4s, v22.s[3]\n"
4097 "ldr q25, [%[b_ptr0], #0x10]\n"
4098 "fmla v26.4s, v24.4s, v3.s[0]\n"
4099 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4100 "fmla v27.4s, v24.4s, v7.s[0]\n"
4101 "fmla v28.4s, v24.4s, v11.s[0]\n"
4102 "fmla v29.4s, v24.4s, v15.s[0]\n"
4103 "fmla v30.4s, v24.4s, v19.s[0]\n"
4104 "fmla v31.4s, v24.4s, v23.s[0]\n"
4105 "ldr q24, [%[b_ptr0]]\n"
4106 "fmla v26.4s, v25.4s, v3.s[1]\n"
4107 "fmla v27.4s, v25.4s, v7.s[1]\n"
4108 "fmla v28.4s, v25.4s, v11.s[1]\n"
4109 "fmla v29.4s, v25.4s, v15.s[1]\n"
4110 "fmla v30.4s, v25.4s, v19.s[1]\n"
4111 "fmla v31.4s, v25.4s, v23.s[1]\n"
4112 "ldr q25, [%[b_ptr0], #0x10]\n"
4113 "fmla v26.4s, v24.4s, v3.s[2]\n"
4114 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4115 "fmla v27.4s, v24.4s, v7.s[2]\n"
4116 "fmla v28.4s, v24.4s, v11.s[2]\n"
4117 "fmla v29.4s, v24.4s, v15.s[2]\n"
4118 "fmla v30.4s, v24.4s, v19.s[2]\n"
4119 "fmla v31.4s, v24.4s, v23.s[2]\n"
4120 "fmla v26.4s, v25.4s, v3.s[3]\n"
4121 "fmla v27.4s, v25.4s, v7.s[3]\n"
4122 "fmla v28.4s, v25.4s, v11.s[3]\n"
4123 "fmla v29.4s, v25.4s, v15.s[3]\n"
4124 "fmla v30.4s, v25.4s, v19.s[3]\n"
4125 "fmla v31.4s, v25.4s, v23.s[3]\n"
4126 "b.eq 3f\n"
4127 "4:\n"
4128 "ld1r {v24.4s}, [%[minptr]]\n"
4129 "subs %[loops], %[loops], #0x1\n"
4130 "ld1r {v25.4s}, [%[maxptr]]\n"
4131 "fmax v26.4s, v26.4s, v24.4s\n"
4132 "fmax v27.4s, v27.4s, v24.4s\n"
4133 "fmax v28.4s, v28.4s, v24.4s\n"
4134 "fmax v29.4s, v29.4s, v24.4s\n"
4135 "fmin v26.4s, v26.4s, v25.4s\n"
4136 "fmin v27.4s, v27.4s, v25.4s\n"
4137 "fmin v28.4s, v28.4s, v25.4s\n"
4138 "fmin v29.4s, v29.4s, v25.4s\n"
4139 "str q26, [%[c_ptr0]]\n"
4140 "fmax v30.4s, v30.4s, v24.4s\n"
4141 "ldr q26, [%[biasptr]]\n"
4142 "fmax v31.4s, v31.4s, v24.4s\n"
4143 "ldr q24, [%[b_ptr0]]\n"
4144 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4145 "str q27, [c_ptr1]\n"
4146 "add c_ptr1, c_ptr1, #0x10\n"
4147 "fmin v30.4s, v30.4s, v25.4s\n"
4148 "add %[biasptr], %[biasptr], %[biasinc]\n"
4149 "fmin v31.4s, v31.4s, v25.4s\n"
4150 "str q28, [c_ptr2]\n"
4151 "mov v27.16b, v26.16b\n"
4152 "ldr q25, [%[b_ptr0], #0x10]\n"
4153 "mov v28.16b, v26.16b\n"
4154 "add c_ptr2, c_ptr2, #0x10\n"
4155 "str q29, [c_ptr3]\n"
4156 "add c_ptr3, c_ptr3, #0x10\n"
4157 "mov v29.16b, v26.16b\n"
4158 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4159 "fmla v27.4s, v24.4s, v4.s[0]\n"
4160 "str q30, [c_ptr4]\n"
4161 "mov v30.16b, v26.16b\n"
4162 "add c_ptr4, c_ptr4, #0x10\n"
4163 "fmla v28.4s, v24.4s, v8.s[0]\n"
4164 "str q31, [c_ptr5]\n"
4165 "mov v31.16b, v26.16b\n"
4166 "add c_ptr5, c_ptr5, #0x10\n"
4167 "fmla v26.4s, v24.4s, v0.s[0]\n"
4168 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
4169 "fmla v29.4s, v24.4s, v12.s[0]\n"
4170 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
4171 "fmla v30.4s, v24.4s, v16.s[0]\n"
4172 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
4173 "fmla v31.4s, v24.4s, v20.s[0]\n"
4174 "ldr q24, [%[b_ptr0]]\n"
4175 "fmla v26.4s, v25.4s, v0.s[1]\n"
4176 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
4177 "fmla v27.4s, v25.4s, v4.s[1]\n"
4178 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
4179 "fmla v28.4s, v25.4s, v8.s[1]\n"
4180 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
4181 "fmla v29.4s, v25.4s, v12.s[1]\n"
4182 "fmla v30.4s, v25.4s, v16.s[1]\n"
4183 "fmla v31.4s, v25.4s, v20.s[1]\n"
4184 "ldr q25, [%[b_ptr0], #0x10]\n"
4185 "fmla v26.4s, v24.4s, v0.s[2]\n"
4186 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4187 "fmla v27.4s, v24.4s, v4.s[2]\n"
4188 "fmla v28.4s, v24.4s, v8.s[2]\n"
4189 "fmla v29.4s, v24.4s, v12.s[2]\n"
4190 "fmla v30.4s, v24.4s, v16.s[2]\n"
4191 "fmla v31.4s, v24.4s, v20.s[2]\n"
4192 "ldr q24, [%[b_ptr0]]\n"
4193 "fmla v26.4s, v25.4s, v0.s[3]\n"
4194 "fmla v27.4s, v25.4s, v4.s[3]\n"
4195 "fmla v28.4s, v25.4s, v8.s[3]\n"
4196 "fmla v29.4s, v25.4s, v12.s[3]\n"
4197 "fmla v30.4s, v25.4s, v16.s[3]\n"
4198 "fmla v31.4s, v25.4s, v20.s[3]\n"
4199 "ldr q25, [%[b_ptr0], #0x10]\n"
4200 "fmla v26.4s, v24.4s, v1.s[0]\n"
4201 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4202 "fmla v27.4s, v24.4s, v5.s[0]\n"
4203 "fmla v28.4s, v24.4s, v9.s[0]\n"
4204 "fmla v29.4s, v24.4s, v13.s[0]\n"
4205 "fmla v30.4s, v24.4s, v17.s[0]\n"
4206 "fmla v31.4s, v24.4s, v21.s[0]\n"
4207 "ldr q24, [%[b_ptr0]]\n"
4208 "fmla v26.4s, v25.4s, v1.s[1]\n"
4209 "fmla v27.4s, v25.4s, v5.s[1]\n"
4210 "fmla v28.4s, v25.4s, v9.s[1]\n"
4211 "fmla v29.4s, v25.4s, v13.s[1]\n"
4212 "fmla v30.4s, v25.4s, v17.s[1]\n"
4213 "fmla v31.4s, v25.4s, v21.s[1]\n"
4214 "ldr q25, [%[b_ptr0], #0x10]\n"
4215 "fmla v26.4s, v24.4s, v1.s[2]\n"
4216 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4217 "fmla v27.4s, v24.4s, v5.s[2]\n"
4218 "fmla v28.4s, v24.4s, v9.s[2]\n"
4219 "fmla v29.4s, v24.4s, v13.s[2]\n"
4220 "fmla v30.4s, v24.4s, v17.s[2]\n"
4221 "fmla v31.4s, v24.4s, v21.s[2]\n"
4222 "ldr q24, [%[b_ptr0]]\n"
4223 "fmla v26.4s, v25.4s, v1.s[3]\n"
4224 "fmla v27.4s, v25.4s, v5.s[3]\n"
4225 "fmla v28.4s, v25.4s, v9.s[3]\n"
4226 "fmla v29.4s, v25.4s, v13.s[3]\n"
4227 "fmla v30.4s, v25.4s, v17.s[3]\n"
4228 "fmla v31.4s, v25.4s, v21.s[3]\n"
4229 "ldr q25, [%[b_ptr0], #0x10]\n"
4230 "fmla v26.4s, v24.4s, v2.s[0]\n"
4231 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4232 "fmla v27.4s, v24.4s, v6.s[0]\n"
4233 "fmla v28.4s, v24.4s, v10.s[0]\n"
4234 "fmla v29.4s, v24.4s, v14.s[0]\n"
4235 "fmla v30.4s, v24.4s, v18.s[0]\n"
4236 "fmla v31.4s, v24.4s, v22.s[0]\n"
4237 "ldr q24, [%[b_ptr0]]\n"
4238 "fmla v26.4s, v25.4s, v2.s[1]\n"
4239 "fmla v27.4s, v25.4s, v6.s[1]\n"
4240 "fmla v28.4s, v25.4s, v10.s[1]\n"
4241 "fmla v29.4s, v25.4s, v14.s[1]\n"
4242 "fmla v30.4s, v25.4s, v18.s[1]\n"
4243 "fmla v31.4s, v25.4s, v22.s[1]\n"
4244 "ldr q25, [%[b_ptr0], #0x10]\n"
4245 "fmla v26.4s, v24.4s, v2.s[2]\n"
4246 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4247 "fmla v27.4s, v24.4s, v6.s[2]\n"
4248 "fmla v28.4s, v24.4s, v10.s[2]\n"
4249 "fmla v29.4s, v24.4s, v14.s[2]\n"
4250 "fmla v30.4s, v24.4s, v18.s[2]\n"
4251 "fmla v31.4s, v24.4s, v22.s[2]\n"
4252 "ldr q24, [%[b_ptr0]]\n"
4253 "fmla v26.4s, v25.4s, v2.s[3]\n"
4254 "fmla v27.4s, v25.4s, v6.s[3]\n"
4255 "fmla v28.4s, v25.4s, v10.s[3]\n"
4256 "fmla v29.4s, v25.4s, v14.s[3]\n"
4257 "fmla v30.4s, v25.4s, v18.s[3]\n"
4258 "fmla v31.4s, v25.4s, v22.s[3]\n"
4259 "ldr q25, [%[b_ptr0], #0x10]\n"
4260 "fmla v26.4s, v24.4s, v3.s[0]\n"
4261 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4262 "fmla v27.4s, v24.4s, v7.s[0]\n"
4263 "fmla v28.4s, v24.4s, v11.s[0]\n"
4264 "fmla v29.4s, v24.4s, v15.s[0]\n"
4265 "fmla v30.4s, v24.4s, v19.s[0]\n"
4266 "fmla v31.4s, v24.4s, v23.s[0]\n"
4267 "ldr q24, [%[b_ptr0]]\n"
4268 "fmla v26.4s, v25.4s, v3.s[1]\n"
4269 "fmla v27.4s, v25.4s, v7.s[1]\n"
4270 "fmla v28.4s, v25.4s, v11.s[1]\n"
4271 "fmla v29.4s, v25.4s, v15.s[1]\n"
4272 "fmla v30.4s, v25.4s, v19.s[1]\n"
4273 "fmla v31.4s, v25.4s, v23.s[1]\n"
4274 "ldr q25, [%[b_ptr0], #0x10]\n"
4275 "fmla v26.4s, v24.4s, v3.s[2]\n"
4276 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4277 "fmla v27.4s, v24.4s, v7.s[2]\n"
4278 "fmla v28.4s, v24.4s, v11.s[2]\n"
4279 "fmla v29.4s, v24.4s, v15.s[2]\n"
4280 "fmla v30.4s, v24.4s, v19.s[2]\n"
4281 "fmla v31.4s, v24.4s, v23.s[2]\n"
4282 "fmla v26.4s, v25.4s, v3.s[3]\n"
4283 "fmla v27.4s, v25.4s, v7.s[3]\n"
4284 "fmla v28.4s, v25.4s, v11.s[3]\n"
4285 "fmla v29.4s, v25.4s, v15.s[3]\n"
4286 "fmla v30.4s, v25.4s, v19.s[3]\n"
4287 "fmla v31.4s, v25.4s, v23.s[3]\n"
4288 "b.ne 4b\n"
4289 "3:\n"
4290 "ld1r {v24.4s}, [%[minptr]]\n"
4291 "ld1r {v25.4s}, [%[maxptr]]\n"
4292 "fmax v26.4s, v26.4s, v24.4s\n"
4293 "fmax v27.4s, v27.4s, v24.4s\n"
4294 "fmax v28.4s, v28.4s, v24.4s\n"
4295 "fmax v29.4s, v29.4s, v24.4s\n"
4296 "fmin v26.4s, v26.4s, v25.4s\n"
4297 "fmin v27.4s, v27.4s, v25.4s\n"
4298 "fmin v28.4s, v28.4s, v25.4s\n"
4299 "fmin v29.4s, v29.4s, v25.4s\n"
4300 "str q26, [%[c_ptr0]]\n"
4301 "fmax v30.4s, v30.4s, v24.4s\n"
4302 "ldr q26, [%[biasptr]]\n"
4303 "fmax v31.4s, v31.4s, v24.4s\n"
4304 "ldr q24, [%[b_ptr0]]\n"
4305 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4306 "str q27, [c_ptr1]\n"
4307 "add c_ptr1, c_ptr1, #0x10\n"
4308 "fmin v30.4s, v30.4s, v25.4s\n"
4309 "add %[biasptr], %[biasptr], %[biasinc]\n"
4310 "fmin v31.4s, v31.4s, v25.4s\n"
4311 "str q28, [c_ptr2]\n"
4312 "mov v27.16b, v26.16b\n"
4313 "ldr q25, [%[b_ptr0], #0x10]\n"
4314 "mov v28.16b, v26.16b\n"
4315 "add c_ptr2, c_ptr2, #0x10\n"
4316 "str q29, [c_ptr3]\n"
4317 "add c_ptr3, c_ptr3, #0x10\n"
4318 "mov v29.16b, v26.16b\n"
4319 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4320 "fmla v27.4s, v24.4s, v4.s[0]\n"
4321 "str q30, [c_ptr4]\n"
4322 "mov v30.16b, v26.16b\n"
4323 "add c_ptr4, c_ptr4, #0x10\n"
4324 "fmla v28.4s, v24.4s, v8.s[0]\n"
4325 "str q31, [c_ptr5]\n"
4326 "mov v31.16b, v26.16b\n"
4327 "add c_ptr5, c_ptr5, #0x10\n"
4328 "fmla v26.4s, v24.4s, v0.s[0]\n"
4329 "fmla v29.4s, v24.4s, v12.s[0]\n"
4330 "fmla v30.4s, v24.4s, v16.s[0]\n"
4331 "fmla v31.4s, v24.4s, v20.s[0]\n"
4332 "ldr q24, [%[b_ptr0]]\n"
4333 "fmla v26.4s, v25.4s, v0.s[1]\n"
4334 "fmla v27.4s, v25.4s, v4.s[1]\n"
4335 "fmla v28.4s, v25.4s, v8.s[1]\n"
4336 "fmla v29.4s, v25.4s, v12.s[1]\n"
4337 "fmla v30.4s, v25.4s, v16.s[1]\n"
4338 "fmla v31.4s, v25.4s, v20.s[1]\n"
4339 "ldr q25, [%[b_ptr0], #0x10]\n"
4340 "fmla v26.4s, v24.4s, v0.s[2]\n"
4341 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4342 "fmla v27.4s, v24.4s, v4.s[2]\n"
4343 "fmla v28.4s, v24.4s, v8.s[2]\n"
4344 "fmla v29.4s, v24.4s, v12.s[2]\n"
4345 "fmla v30.4s, v24.4s, v16.s[2]\n"
4346 "fmla v31.4s, v24.4s, v20.s[2]\n"
4347 "ldr q24, [%[b_ptr0]]\n"
4348 "fmla v26.4s, v25.4s, v0.s[3]\n"
4349 "fmla v27.4s, v25.4s, v4.s[3]\n"
4350 "fmla v28.4s, v25.4s, v8.s[3]\n"
4351 "fmla v29.4s, v25.4s, v12.s[3]\n"
4352 "fmla v30.4s, v25.4s, v16.s[3]\n"
4353 "fmla v31.4s, v25.4s, v20.s[3]\n"
4354 "ldr q25, [%[b_ptr0], #0x10]\n"
4355 "fmla v26.4s, v24.4s, v1.s[0]\n"
4356 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4357 "fmla v27.4s, v24.4s, v5.s[0]\n"
4358 "fmla v28.4s, v24.4s, v9.s[0]\n"
4359 "fmla v29.4s, v24.4s, v13.s[0]\n"
4360 "fmla v30.4s, v24.4s, v17.s[0]\n"
4361 "fmla v31.4s, v24.4s, v21.s[0]\n"
4362 "ldr q24, [%[b_ptr0]]\n"
4363 "fmla v26.4s, v25.4s, v1.s[1]\n"
4364 "fmla v27.4s, v25.4s, v5.s[1]\n"
4365 "fmla v28.4s, v25.4s, v9.s[1]\n"
4366 "fmla v29.4s, v25.4s, v13.s[1]\n"
4367 "fmla v30.4s, v25.4s, v17.s[1]\n"
4368 "fmla v31.4s, v25.4s, v21.s[1]\n"
4369 "ldr q25, [%[b_ptr0], #0x10]\n"
4370 "fmla v26.4s, v24.4s, v1.s[2]\n"
4371 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4372 "fmla v27.4s, v24.4s, v5.s[2]\n"
4373 "fmla v28.4s, v24.4s, v9.s[2]\n"
4374 "fmla v29.4s, v24.4s, v13.s[2]\n"
4375 "fmla v30.4s, v24.4s, v17.s[2]\n"
4376 "fmla v31.4s, v24.4s, v21.s[2]\n"
4377 "ldr q24, [%[b_ptr0]]\n"
4378 "fmla v26.4s, v25.4s, v1.s[3]\n"
4379 "fmla v27.4s, v25.4s, v5.s[3]\n"
4380 "fmla v28.4s, v25.4s, v9.s[3]\n"
4381 "fmla v29.4s, v25.4s, v13.s[3]\n"
4382 "fmla v30.4s, v25.4s, v17.s[3]\n"
4383 "fmla v31.4s, v25.4s, v21.s[3]\n"
4384 "ldr q25, [%[b_ptr0], #0x10]\n"
4385 "fmla v26.4s, v24.4s, v2.s[0]\n"
4386 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4387 "fmla v27.4s, v24.4s, v6.s[0]\n"
4388 "fmla v28.4s, v24.4s, v10.s[0]\n"
4389 "fmla v29.4s, v24.4s, v14.s[0]\n"
4390 "fmla v30.4s, v24.4s, v18.s[0]\n"
4391 "fmla v31.4s, v24.4s, v22.s[0]\n"
4392 "ldr q24, [%[b_ptr0]]\n"
4393 "fmla v26.4s, v25.4s, v2.s[1]\n"
4394 "fmla v27.4s, v25.4s, v6.s[1]\n"
4395 "fmla v28.4s, v25.4s, v10.s[1]\n"
4396 "fmla v29.4s, v25.4s, v14.s[1]\n"
4397 "fmla v30.4s, v25.4s, v18.s[1]\n"
4398 "fmla v31.4s, v25.4s, v22.s[1]\n"
4399 "ldr q25, [%[b_ptr0], #0x10]\n"
4400 "fmla v26.4s, v24.4s, v2.s[2]\n"
4401 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4402 "fmla v27.4s, v24.4s, v6.s[2]\n"
4403 "fmla v28.4s, v24.4s, v10.s[2]\n"
4404 "fmla v29.4s, v24.4s, v14.s[2]\n"
4405 "fmla v30.4s, v24.4s, v18.s[2]\n"
4406 "fmla v31.4s, v24.4s, v22.s[2]\n"
4407 "ldr q24, [%[b_ptr0]]\n"
4408 "fmla v26.4s, v25.4s, v2.s[3]\n"
4409 "fmla v27.4s, v25.4s, v6.s[3]\n"
4410 "fmla v28.4s, v25.4s, v10.s[3]\n"
4411 "fmla v29.4s, v25.4s, v14.s[3]\n"
4412 "fmla v30.4s, v25.4s, v18.s[3]\n"
4413 "fmla v31.4s, v25.4s, v22.s[3]\n"
4414 "ldr q25, [%[b_ptr0], #0x10]\n"
4415 "fmla v26.4s, v24.4s, v3.s[0]\n"
4416 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4417 "fmla v27.4s, v24.4s, v7.s[0]\n"
4418 "fmla v28.4s, v24.4s, v11.s[0]\n"
4419 "fmla v29.4s, v24.4s, v15.s[0]\n"
4420 "fmla v30.4s, v24.4s, v19.s[0]\n"
4421 "fmla v31.4s, v24.4s, v23.s[0]\n"
4422 "ldr q24, [%[b_ptr0]]\n"
4423 "fmla v26.4s, v25.4s, v3.s[1]\n"
4424 "fmla v27.4s, v25.4s, v7.s[1]\n"
4425 "fmla v28.4s, v25.4s, v11.s[1]\n"
4426 "fmla v29.4s, v25.4s, v15.s[1]\n"
4427 "fmla v30.4s, v25.4s, v19.s[1]\n"
4428 "fmla v31.4s, v25.4s, v23.s[1]\n"
4429 "ldr q25, [%[b_ptr0], #0x10]\n"
4430 "fmla v26.4s, v24.4s, v3.s[2]\n"
4431 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4432 "fmla v27.4s, v24.4s, v7.s[2]\n"
4433 "fmla v28.4s, v24.4s, v11.s[2]\n"
4434 "fmla v29.4s, v24.4s, v15.s[2]\n"
4435 "fmla v30.4s, v24.4s, v19.s[2]\n"
4436 "fmla v31.4s, v24.4s, v23.s[2]\n"
4437 "fmla v26.4s, v25.4s, v3.s[3]\n"
4438 "fmla v27.4s, v25.4s, v7.s[3]\n"
4439 "fmla v28.4s, v25.4s, v11.s[3]\n"
4440 "fmla v29.4s, v25.4s, v15.s[3]\n"
4441 "fmla v30.4s, v25.4s, v19.s[3]\n"
4442 "fmla v31.4s, v25.4s, v23.s[3]\n"
4443 "b 5f\n"
4444 "2:\n"
4445 "ldr q26, [%[biasptr]]\n"
4446 "add %[biasptr], %[biasptr], %[biasinc]\n"
4447 "mov v27.16b, v26.16b\n"
4448 "mov v28.16b, v26.16b\n"
4449 "mov v29.16b, v26.16b\n"
4450 "mov v30.16b, v26.16b\n"
4451 "mov v31.16b, v26.16b\n"
4452 "fmla v26.4s, v24.4s, v0.s[0]\n"
4453 "fmla v27.4s, v24.4s, v4.s[0]\n"
4454 "fmla v28.4s, v24.4s, v8.s[0]\n"
4455 "fmla v29.4s, v24.4s, v12.s[0]\n"
4456 "fmla v30.4s, v24.4s, v16.s[0]\n"
4457 "fmla v31.4s, v24.4s, v20.s[0]\n"
4458 "ldr q24, [%[b_ptr0]]\n"
4459 "fmla v26.4s, v25.4s, v0.s[1]\n"
4460 "fmla v27.4s, v25.4s, v4.s[1]\n"
4461 "fmla v28.4s, v25.4s, v8.s[1]\n"
4462 "fmla v29.4s, v25.4s, v12.s[1]\n"
4463 "fmla v30.4s, v25.4s, v16.s[1]\n"
4464 "fmla v31.4s, v25.4s, v20.s[1]\n"
4465 "ldr q25, [%[b_ptr0], #0x10]\n"
4466 "fmla v26.4s, v24.4s, v0.s[2]\n"
4467 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4468 "fmla v27.4s, v24.4s, v4.s[2]\n"
4469 "fmla v28.4s, v24.4s, v8.s[2]\n"
4470 "fmla v29.4s, v24.4s, v12.s[2]\n"
4471 "fmla v30.4s, v24.4s, v16.s[2]\n"
4472 "fmla v31.4s, v24.4s, v20.s[2]\n"
4473 "ldr q24, [%[b_ptr0]]\n"
4474 "fmla v26.4s, v25.4s, v0.s[3]\n"
4475 "fmla v27.4s, v25.4s, v4.s[3]\n"
4476 "fmla v28.4s, v25.4s, v8.s[3]\n"
4477 "fmla v29.4s, v25.4s, v12.s[3]\n"
4478 "fmla v30.4s, v25.4s, v16.s[3]\n"
4479 "fmla v31.4s, v25.4s, v20.s[3]\n"
4480 "ldr q25, [%[b_ptr0], #0x10]\n"
4481 "fmla v26.4s, v24.4s, v1.s[0]\n"
4482 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4483 "fmla v27.4s, v24.4s, v5.s[0]\n"
4484 "fmla v28.4s, v24.4s, v9.s[0]\n"
4485 "fmla v29.4s, v24.4s, v13.s[0]\n"
4486 "fmla v30.4s, v24.4s, v17.s[0]\n"
4487 "fmla v31.4s, v24.4s, v21.s[0]\n"
4488 "ldr q24, [%[b_ptr0]]\n"
4489 "fmla v26.4s, v25.4s, v1.s[1]\n"
4490 "fmla v27.4s, v25.4s, v5.s[1]\n"
4491 "fmla v28.4s, v25.4s, v9.s[1]\n"
4492 "fmla v29.4s, v25.4s, v13.s[1]\n"
4493 "fmla v30.4s, v25.4s, v17.s[1]\n"
4494 "fmla v31.4s, v25.4s, v21.s[1]\n"
4495 "ldr q25, [%[b_ptr0], #0x10]\n"
4496 "fmla v26.4s, v24.4s, v1.s[2]\n"
4497 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4498 "fmla v27.4s, v24.4s, v5.s[2]\n"
4499 "fmla v28.4s, v24.4s, v9.s[2]\n"
4500 "fmla v29.4s, v24.4s, v13.s[2]\n"
4501 "fmla v30.4s, v24.4s, v17.s[2]\n"
4502 "fmla v31.4s, v24.4s, v21.s[2]\n"
4503 "ldr q24, [%[b_ptr0]]\n"
4504 "fmla v26.4s, v25.4s, v1.s[3]\n"
4505 "fmla v27.4s, v25.4s, v5.s[3]\n"
4506 "fmla v28.4s, v25.4s, v9.s[3]\n"
4507 "fmla v29.4s, v25.4s, v13.s[3]\n"
4508 "fmla v30.4s, v25.4s, v17.s[3]\n"
4509 "fmla v31.4s, v25.4s, v21.s[3]\n"
4510 "ldr q25, [%[b_ptr0], #0x10]\n"
4511 "fmla v26.4s, v24.4s, v2.s[0]\n"
4512 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4513 "fmla v27.4s, v24.4s, v6.s[0]\n"
4514 "fmla v28.4s, v24.4s, v10.s[0]\n"
4515 "fmla v29.4s, v24.4s, v14.s[0]\n"
4516 "fmla v30.4s, v24.4s, v18.s[0]\n"
4517 "fmla v31.4s, v24.4s, v22.s[0]\n"
4518 "ldr q24, [%[b_ptr0]]\n"
4519 "fmla v26.4s, v25.4s, v2.s[1]\n"
4520 "fmla v27.4s, v25.4s, v6.s[1]\n"
4521 "fmla v28.4s, v25.4s, v10.s[1]\n"
4522 "fmla v29.4s, v25.4s, v14.s[1]\n"
4523 "fmla v30.4s, v25.4s, v18.s[1]\n"
4524 "fmla v31.4s, v25.4s, v22.s[1]\n"
4525 "ldr q25, [%[b_ptr0], #0x10]\n"
4526 "fmla v26.4s, v24.4s, v2.s[2]\n"
4527 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4528 "fmla v27.4s, v24.4s, v6.s[2]\n"
4529 "fmla v28.4s, v24.4s, v10.s[2]\n"
4530 "fmla v29.4s, v24.4s, v14.s[2]\n"
4531 "fmla v30.4s, v24.4s, v18.s[2]\n"
4532 "fmla v31.4s, v24.4s, v22.s[2]\n"
4533 "ldr q24, [%[b_ptr0]]\n"
4534 "fmla v26.4s, v25.4s, v2.s[3]\n"
4535 "fmla v27.4s, v25.4s, v6.s[3]\n"
4536 "fmla v28.4s, v25.4s, v10.s[3]\n"
4537 "fmla v29.4s, v25.4s, v14.s[3]\n"
4538 "fmla v30.4s, v25.4s, v18.s[3]\n"
4539 "fmla v31.4s, v25.4s, v22.s[3]\n"
4540 "ldr q25, [%[b_ptr0], #0x10]\n"
4541 "fmla v26.4s, v24.4s, v3.s[0]\n"
4542 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4543 "fmla v27.4s, v24.4s, v7.s[0]\n"
4544 "fmla v28.4s, v24.4s, v11.s[0]\n"
4545 "fmla v29.4s, v24.4s, v15.s[0]\n"
4546 "fmla v30.4s, v24.4s, v19.s[0]\n"
4547 "fmla v31.4s, v24.4s, v23.s[0]\n"
4548 "ldr q24, [%[b_ptr0]]\n"
4549 "fmla v26.4s, v25.4s, v3.s[1]\n"
4550 "fmla v27.4s, v25.4s, v7.s[1]\n"
4551 "fmla v28.4s, v25.4s, v11.s[1]\n"
4552 "fmla v29.4s, v25.4s, v15.s[1]\n"
4553 "fmla v30.4s, v25.4s, v19.s[1]\n"
4554 "fmla v31.4s, v25.4s, v23.s[1]\n"
4555 "ldr q25, [%[b_ptr0], #0x10]\n"
4556 "fmla v26.4s, v24.4s, v3.s[2]\n"
4557 "add %[b_ptr0], %[b_ptr0], #0x20\n"
4558 "fmla v27.4s, v24.4s, v7.s[2]\n"
4559 "fmla v28.4s, v24.4s, v11.s[2]\n"
4560 "fmla v29.4s, v24.4s, v15.s[2]\n"
4561 "fmla v30.4s, v24.4s, v19.s[2]\n"
4562 "fmla v31.4s, v24.4s, v23.s[2]\n"
4563 "fmla v26.4s, v25.4s, v3.s[3]\n"
4564 "fmla v27.4s, v25.4s, v7.s[3]\n"
4565 "fmla v28.4s, v25.4s, v11.s[3]\n"
4566 "fmla v29.4s, v25.4s, v15.s[3]\n"
4567 "fmla v30.4s, v25.4s, v19.s[3]\n"
4568 "fmla v31.4s, v25.4s, v23.s[3]\n"
4569 "5:\n"
4570 "ld1r {v24.4s}, [%[minptr]]\n"
4571 "ld1r {v25.4s}, [%[maxptr]]\n"
4572 "fmax v26.4s, v26.4s, v24.4s\n"
4573 "fmax v27.4s, v27.4s, v24.4s\n"
4574 "fmax v28.4s, v28.4s, v24.4s\n"
4575 "fmax v29.4s, v29.4s, v24.4s\n"
4576 "fmin v26.4s, v26.4s, v25.4s\n"
4577 "fmin v27.4s, v27.4s, v25.4s\n"
4578 "fmin v28.4s, v28.4s, v25.4s\n"
4579 "fmin v29.4s, v29.4s, v25.4s\n"
4580 "str q26, [%[c_ptr0]]\n"
4581 "fmax v30.4s, v30.4s, v24.4s\n"
4582 "add %[c_ptr0], %[c_ptr0], #0x10\n"
4583 "fmax v31.4s, v31.4s, v24.4s\n"
4584 "str q27, [c_ptr1]\n"
4585 "fmin v30.4s, v30.4s, v25.4s\n"
4586 "fmin v31.4s, v31.4s, v25.4s\n"
4587 "str q28, [c_ptr2]\n"
4588 "str q29, [c_ptr3]\n"
4589 "str q30, [c_ptr4]\n"
4590 "str q31, [c_ptr5]\n"
4591 ".unreq a_ptr1\n"
4592 ".unreq a_ptr2\n"
4593 ".unreq a_ptr3\n"
4594 ".unreq a_ptr4\n"
4595 ".unreq a_ptr5\n"
4596 ".unreq c_ptr1\n"
4597 ".unreq c_ptr2\n"
4598 ".unreq c_ptr3\n"
4599 ".unreq c_ptr4\n"
4600 ".unreq c_ptr5\n"
4601 : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
4602 : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
4603 : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
4604 );
4605 break;
4606 }
4607 }
4608 }
4609
4610 } // namespace arm_gemm
4611
4612 #endif // __aarch64__
4613