xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #pragma once
25 
26 #ifdef __aarch64__
27 
28 template<>
MergeResults(float * out,const float * in,const int ldout,const int y0,const int ymax,const int x0,const int xmax,const float * bias,Activation act,bool append)29 void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float *bias, Activation act, bool append)
30 {
31     const float *inptr = in;
32     float nullbias[12];
33     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
34     float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
35 
36     switch(act.type)
37     {
38         default:
39         case Activation::Type::None:
40             break;
41         case Activation::Type::BoundedReLU:
42             maxval = static_cast<float>(act.param1);
43             /* fall through */
44         case Activation::Type::ReLU:
45             minval = 0.0f;
46             break;
47     }
48 
49     if (!append && !bias)
50     {
51         memset(nullbias, 0, (12 * sizeof(float)));
52     }
53 
54     for (int y=y0; y<ymax; y+=8)
55     {
56         float *outptr0 = out + (y * ldout) + x0;
57         float *outptr1 = outptr0 + ldout;
58         float *outptr2 = outptr1 + ldout;
59         float *outptr3 = outptr2 + ldout;
60         float *outptr4 = outptr3 + ldout;
61         float *outptr5 = outptr4 + ldout;
62         float *outptr6 = outptr5 + ldout;
63         float *outptr7 = outptr6 + ldout;
64 
65         const int height = ymax - y;
66 
67         for (int i=x0; i<xmax; i+=12)
68         {
69             if (append)
70             {
71                 switch(height)
72                 {
73                 case 1:
74                     {
75                         if ((i+11) >= xmax)
76                         {
77                             for (int xi=0; xi<11; xi++)
78                             {
79                                 if ((i+xi) < xmax)
80                                 {
81                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
82                                     outptr0++;
83                                 }
84                             }
85                             inptr += 96;
86                         } else {
87                             /* Optimized routine to copy an entire block */
88                             __asm __volatile (
89                                 "dup v0.4s, %[maxval].s[0]\n"
90                                 "ldr q2, [%[outptr0]]\n"
91                                 "dup v1.4s, %[minval].s[0]\n"
92                                 "ldr q10, [%[inptr]]\n"
93                                 "ldr q3, [%[outptr0], #0x10]\n"
94                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
95                                 "ldr q11, [%[inptr], #0x10]\n"
96                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
97                                 "fadd v10.4s, v10.4s, v2.4s\n"
98                                 "ldr q4, [%[outptr0], #0x20]\n"
99                                 "ldr q12, [%[inptr], #0x20]\n"
100                                 "add %[inptr], %[inptr], #0x180\n"
101                                 "fadd v11.4s, v11.4s, v3.4s\n"
102                                 "fmin v10.4s, v10.4s, v0.4s\n"
103                                 "fadd v12.4s, v12.4s, v4.4s\n"
104                                 "fmin v11.4s, v11.4s, v0.4s\n"
105                                 "fmax v10.4s, v10.4s, v1.4s\n"
106                                 "fmin v12.4s, v12.4s, v0.4s\n"
107                                 "fmax v11.4s, v11.4s, v1.4s\n"
108                                 "str q10, [%[outptr0]]\n"
109                                 "fmax v12.4s, v12.4s, v1.4s\n"
110                                 "str q11, [%[outptr0], #0x10]\n"
111                                 "str q12, [%[outptr0], #0x20]\n"
112                                 "add %[outptr0], %[outptr0], #0x30\n"
113                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
114                               [inptr] "+r" (inptr)
115                             : [minval] "w" (minval), [maxval] "w" (maxval)
116                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
117                             );
118                         }
119                     }
120                     break;
121 
122                 case 2:
123                     {
124                         if ((i+11) >= xmax)
125                         {
126                             for (int xi=0; xi<11; xi++)
127                             {
128                                 if ((i+xi) < xmax)
129                                 {
130                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
131                                     outptr0++;
132                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
133                                     outptr1++;
134                                 }
135                             }
136                             inptr += 96;
137                         } else {
138                             /* Optimized routine to copy an entire block */
139                             __asm __volatile (
140                                 "dup v0.4s, %[maxval].s[0]\n"
141                                 "ldr q2, [%[outptr0]]\n"
142                                 "dup v1.4s, %[minval].s[0]\n"
143                                 "ldr q10, [%[inptr]]\n"
144                                 "ldr q3, [%[outptr0], #0x10]\n"
145                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
146                                 "ldr q11, [%[inptr], #0x10]\n"
147                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
148                                 "fadd v10.4s, v10.4s, v2.4s\n"
149                                 "ldr q4, [%[outptr0], #0x20]\n"
150                                 "ldr q12, [%[inptr], #0x20]\n"
151                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
152                                 "fadd v11.4s, v11.4s, v3.4s\n"
153                                 "ldr q5, [%[outptr1]]\n"
154                                 "fmin v10.4s, v10.4s, v0.4s\n"
155                                 "ldr q13, [%[inptr], #0x30]\n"
156                                 "fadd v12.4s, v12.4s, v4.4s\n"
157                                 "ldr q6, [%[outptr1], #0x10]\n"
158                                 "ldr q14, [%[inptr], #0x40]\n"
159                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
160                                 "fmax v10.4s, v10.4s, v1.4s\n"
161                                 "ldr q7, [%[outptr1], #0x20]\n"
162                                 "fmin v11.4s, v11.4s, v0.4s\n"
163                                 "ldr q15, [%[inptr], #0x50]\n"
164                                 "fmin v12.4s, v12.4s, v0.4s\n"
165                                 "add %[inptr], %[inptr], #0x180\n"
166                                 "fadd v13.4s, v13.4s, v5.4s\n"
167                                 "str q10, [%[outptr0]]\n"
168                                 "fmax v11.4s, v11.4s, v1.4s\n"
169                                 "fmax v12.4s, v12.4s, v1.4s\n"
170                                 "fadd v14.4s, v14.4s, v6.4s\n"
171                                 "fmin v13.4s, v13.4s, v0.4s\n"
172                                 "str q11, [%[outptr0], #0x10]\n"
173                                 "fadd v15.4s, v15.4s, v7.4s\n"
174                                 "fmin v14.4s, v14.4s, v0.4s\n"
175                                 "str q12, [%[outptr0], #0x20]\n"
176                                 "fmax v13.4s, v13.4s, v1.4s\n"
177                                 "add %[outptr0], %[outptr0], #0x30\n"
178                                 "fmin v15.4s, v15.4s, v0.4s\n"
179                                 "fmax v14.4s, v14.4s, v1.4s\n"
180                                 "str q13, [%[outptr1]]\n"
181                                 "fmax v15.4s, v15.4s, v1.4s\n"
182                                 "str q14, [%[outptr1], #0x10]\n"
183                                 "str q15, [%[outptr1], #0x20]\n"
184                                 "add %[outptr1], %[outptr1], #0x30\n"
185                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
186                               [inptr] "+r" (inptr)
187                             : [minval] "w" (minval), [maxval] "w" (maxval)
188                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
189                             );
190                         }
191                     }
192                     break;
193 
194                 case 3:
195                     {
196                         if ((i+11) >= xmax)
197                         {
198                             for (int xi=0; xi<11; xi++)
199                             {
200                                 if ((i+xi) < xmax)
201                                 {
202                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
203                                     outptr0++;
204                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
205                                     outptr1++;
206                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
207                                     outptr2++;
208                                 }
209                             }
210                             inptr += 96;
211                         } else {
212                             /* Optimized routine to copy an entire block */
213                             __asm __volatile (
214                                 "dup v0.4s, %[maxval].s[0]\n"
215                                 "ldr q2, [%[outptr0]]\n"
216                                 "dup v1.4s, %[minval].s[0]\n"
217                                 "ldr q10, [%[inptr]]\n"
218                                 "ldr q3, [%[outptr0], #0x10]\n"
219                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
220                                 "ldr q11, [%[inptr], #0x10]\n"
221                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
222                                 "fadd v10.4s, v10.4s, v2.4s\n"
223                                 "ldr q4, [%[outptr0], #0x20]\n"
224                                 "ldr q12, [%[inptr], #0x20]\n"
225                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
226                                 "fadd v11.4s, v11.4s, v3.4s\n"
227                                 "ldr q5, [%[outptr1]]\n"
228                                 "fmin v10.4s, v10.4s, v0.4s\n"
229                                 "ldr q13, [%[inptr], #0x30]\n"
230                                 "fadd v12.4s, v12.4s, v4.4s\n"
231                                 "ldr q6, [%[outptr1], #0x10]\n"
232                                 "ldr q14, [%[inptr], #0x40]\n"
233                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
234                                 "fmax v10.4s, v10.4s, v1.4s\n"
235                                 "ldr q7, [%[outptr1], #0x20]\n"
236                                 "fmin v11.4s, v11.4s, v0.4s\n"
237                                 "ldr q15, [%[inptr], #0x50]\n"
238                                 "fmin v12.4s, v12.4s, v0.4s\n"
239                                 "ldr q8, [%[outptr2]]\n"
240                                 "fadd v13.4s, v13.4s, v5.4s\n"
241                                 "str q10, [%[outptr0]]\n"
242                                 "fadd v14.4s, v14.4s, v6.4s\n"
243                                 "ldr q16, [%[inptr], #0x60]\n"
244                                 "fmax v11.4s, v11.4s, v1.4s\n"
245                                 "ldr q9, [%[outptr2], #0x10]\n"
246                                 "fmax v12.4s, v12.4s, v1.4s\n"
247                                 "ldr q17, [%[inptr], #0x70]\n"
248                                 "fmin v13.4s, v13.4s, v0.4s\n"
249                                 "ldr q2, [%[outptr2], #0x20]\n"
250                                 "fmin v14.4s, v14.4s, v0.4s\n"
251                                 "str q11, [%[outptr0], #0x10]\n"
252                                 "fadd v15.4s, v15.4s, v7.4s\n"
253                                 "ldr q10, [%[inptr], #0x80]\n"
254                                 "fadd v16.4s, v16.4s, v8.4s\n"
255                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
256                                 "fmax v13.4s, v13.4s, v1.4s\n"
257                                 "str q12, [%[outptr0], #0x20]\n"
258                                 "fmax v14.4s, v14.4s, v1.4s\n"
259                                 "add %[outptr0], %[outptr0], #0x30\n"
260                                 "fmin v15.4s, v15.4s, v0.4s\n"
261                                 "str q13, [%[outptr1]]\n"
262                                 "fmin v16.4s, v16.4s, v0.4s\n"
263                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
264                                 "fadd v17.4s, v17.4s, v9.4s\n"
265                                 "str q14, [%[outptr1], #0x10]\n"
266                                 "fmax v15.4s, v15.4s, v1.4s\n"
267                                 "add %[inptr], %[inptr], #0x180\n"
268                                 "fmax v16.4s, v16.4s, v1.4s\n"
269                                 "fmin v17.4s, v17.4s, v0.4s\n"
270                                 "str q15, [%[outptr1], #0x20]\n"
271                                 "fadd v10.4s, v10.4s, v2.4s\n"
272                                 "add %[outptr1], %[outptr1], #0x30\n"
273                                 "fmax v17.4s, v17.4s, v1.4s\n"
274                                 "str q16, [%[outptr2]]\n"
275                                 "fmin v10.4s, v10.4s, v0.4s\n"
276                                 "str q17, [%[outptr2], #0x10]\n"
277                                 "fmax v10.4s, v10.4s, v1.4s\n"
278                                 "str q10, [%[outptr2], #0x20]\n"
279                                 "add %[outptr2], %[outptr2], #0x30\n"
280                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
281                               [inptr] "+r" (inptr)
282                             : [minval] "w" (minval), [maxval] "w" (maxval)
283                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
284                             );
285                         }
286                     }
287                     break;
288 
289                 case 4:
290                     {
291                         if ((i+11) >= xmax)
292                         {
293                             for (int xi=0; xi<11; xi++)
294                             {
295                                 if ((i+xi) < xmax)
296                                 {
297                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
298                                     outptr0++;
299                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
300                                     outptr1++;
301                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
302                                     outptr2++;
303                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
304                                     outptr3++;
305                                 }
306                             }
307                             inptr += 96;
308                         } else {
309                             /* Optimized routine to copy an entire block */
310                             __asm __volatile (
311                                 "dup v0.4s, %[maxval].s[0]\n"
312                                 "ldr q2, [%[outptr0]]\n"
313                                 "dup v1.4s, %[minval].s[0]\n"
314                                 "ldr q10, [%[inptr]]\n"
315                                 "ldr q3, [%[outptr0], #0x10]\n"
316                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
317                                 "ldr q11, [%[inptr], #0x10]\n"
318                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
319                                 "fadd v10.4s, v10.4s, v2.4s\n"
320                                 "ldr q4, [%[outptr0], #0x20]\n"
321                                 "ldr q12, [%[inptr], #0x20]\n"
322                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
323                                 "fadd v11.4s, v11.4s, v3.4s\n"
324                                 "ldr q5, [%[outptr1]]\n"
325                                 "fmin v10.4s, v10.4s, v0.4s\n"
326                                 "ldr q13, [%[inptr], #0x30]\n"
327                                 "fadd v12.4s, v12.4s, v4.4s\n"
328                                 "ldr q6, [%[outptr1], #0x10]\n"
329                                 "ldr q14, [%[inptr], #0x40]\n"
330                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
331                                 "fmax v10.4s, v10.4s, v1.4s\n"
332                                 "ldr q7, [%[outptr1], #0x20]\n"
333                                 "fmin v11.4s, v11.4s, v0.4s\n"
334                                 "ldr q15, [%[inptr], #0x50]\n"
335                                 "fmin v12.4s, v12.4s, v0.4s\n"
336                                 "ldr q8, [%[outptr2]]\n"
337                                 "fadd v13.4s, v13.4s, v5.4s\n"
338                                 "str q10, [%[outptr0]]\n"
339                                 "fadd v14.4s, v14.4s, v6.4s\n"
340                                 "ldr q16, [%[inptr], #0x60]\n"
341                                 "fmax v11.4s, v11.4s, v1.4s\n"
342                                 "ldr q9, [%[outptr2], #0x10]\n"
343                                 "fmax v12.4s, v12.4s, v1.4s\n"
344                                 "ldr q17, [%[inptr], #0x70]\n"
345                                 "fmin v13.4s, v13.4s, v0.4s\n"
346                                 "ldr q2, [%[outptr2], #0x20]\n"
347                                 "fmin v14.4s, v14.4s, v0.4s\n"
348                                 "str q11, [%[outptr0], #0x10]\n"
349                                 "fadd v15.4s, v15.4s, v7.4s\n"
350                                 "ldr q10, [%[inptr], #0x80]\n"
351                                 "fadd v16.4s, v16.4s, v8.4s\n"
352                                 "ldr q3, [%[outptr3]]\n"
353                                 "fmax v13.4s, v13.4s, v1.4s\n"
354                                 "str q12, [%[outptr0], #0x20]\n"
355                                 "fmax v14.4s, v14.4s, v1.4s\n"
356                                 "ldr q11, [%[inptr], #0x90]\n"
357                                 "fmin v15.4s, v15.4s, v0.4s\n"
358                                 "ldr q4, [%[outptr3], #0x10]\n"
359                                 "fmin v16.4s, v16.4s, v0.4s\n"
360                                 "str q13, [%[outptr1]]\n"
361                                 "fadd v17.4s, v17.4s, v9.4s\n"
362                                 "ldr q12, [%[inptr], #0xa0]\n"
363                                 "fadd v10.4s, v10.4s, v2.4s\n"
364                                 "ldr q5, [%[outptr3], #0x20]\n"
365                                 "fmax v15.4s, v15.4s, v1.4s\n"
366                                 "str q14, [%[outptr1], #0x10]\n"
367                                 "fmax v16.4s, v16.4s, v1.4s\n"
368                                 "ldr q13, [%[inptr], #0xb0]\n"
369                                 "fmin v17.4s, v17.4s, v0.4s\n"
370                                 "add %[outptr0], %[outptr0], #0x30\n"
371                                 "fmin v10.4s, v10.4s, v0.4s\n"
372                                 "str q15, [%[outptr1], #0x20]\n"
373                                 "fadd v11.4s, v11.4s, v3.4s\n"
374                                 "add %[outptr1], %[outptr1], #0x30\n"
375                                 "fmax v17.4s, v17.4s, v1.4s\n"
376                                 "str q16, [%[outptr2]]\n"
377                                 "fmax v10.4s, v10.4s, v1.4s\n"
378                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
379                                 "fmin v11.4s, v11.4s, v0.4s\n"
380                                 "str q17, [%[outptr2], #0x10]\n"
381                                 "fadd v12.4s, v12.4s, v4.4s\n"
382                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
383                                 "fadd v13.4s, v13.4s, v5.4s\n"
384                                 "str q10, [%[outptr2], #0x20]\n"
385                                 "fmax v11.4s, v11.4s, v1.4s\n"
386                                 "add %[outptr2], %[outptr2], #0x30\n"
387                                 "fmin v12.4s, v12.4s, v0.4s\n"
388                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
389                                 "fmin v13.4s, v13.4s, v0.4s\n"
390                                 "str q11, [%[outptr3]]\n"
391                                 "add %[inptr], %[inptr], #0x180\n"
392                                 "fmax v12.4s, v12.4s, v1.4s\n"
393                                 "fmax v13.4s, v13.4s, v1.4s\n"
394                                 "str q12, [%[outptr3], #0x10]\n"
395                                 "str q13, [%[outptr3], #0x20]\n"
396                                 "add %[outptr3], %[outptr3], #0x30\n"
397                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
398                               [inptr] "+r" (inptr)
399                             : [minval] "w" (minval), [maxval] "w" (maxval)
400                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
401                             );
402                         }
403                     }
404                     break;
405 
406                 case 5:
407                     {
408                         if ((i+11) >= xmax)
409                         {
410                             for (int xi=0; xi<11; xi++)
411                             {
412                                 if ((i+xi) < xmax)
413                                 {
414                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
415                                     outptr0++;
416                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
417                                     outptr1++;
418                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
419                                     outptr2++;
420                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
421                                     outptr3++;
422                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
423                                     outptr4++;
424                                 }
425                             }
426                             inptr += 96;
427                         } else {
428                             /* Optimized routine to copy an entire block */
429                             __asm __volatile (
430                                 "dup v0.4s, %[maxval].s[0]\n"
431                                 "ldr q2, [%[outptr0]]\n"
432                                 "dup v1.4s, %[minval].s[0]\n"
433                                 "ldr q10, [%[inptr]]\n"
434                                 "ldr q3, [%[outptr0], #0x10]\n"
435                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
436                                 "ldr q11, [%[inptr], #0x10]\n"
437                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
438                                 "fadd v10.4s, v10.4s, v2.4s\n"
439                                 "ldr q4, [%[outptr0], #0x20]\n"
440                                 "ldr q12, [%[inptr], #0x20]\n"
441                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
442                                 "fadd v11.4s, v11.4s, v3.4s\n"
443                                 "ldr q5, [%[outptr1]]\n"
444                                 "fmin v10.4s, v10.4s, v0.4s\n"
445                                 "ldr q13, [%[inptr], #0x30]\n"
446                                 "fadd v12.4s, v12.4s, v4.4s\n"
447                                 "ldr q6, [%[outptr1], #0x10]\n"
448                                 "ldr q14, [%[inptr], #0x40]\n"
449                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
450                                 "fmax v10.4s, v10.4s, v1.4s\n"
451                                 "ldr q7, [%[outptr1], #0x20]\n"
452                                 "fmin v11.4s, v11.4s, v0.4s\n"
453                                 "ldr q15, [%[inptr], #0x50]\n"
454                                 "fmin v12.4s, v12.4s, v0.4s\n"
455                                 "ldr q8, [%[outptr2]]\n"
456                                 "fadd v13.4s, v13.4s, v5.4s\n"
457                                 "str q10, [%[outptr0]]\n"
458                                 "fadd v14.4s, v14.4s, v6.4s\n"
459                                 "ldr q16, [%[inptr], #0x60]\n"
460                                 "fmax v11.4s, v11.4s, v1.4s\n"
461                                 "ldr q9, [%[outptr2], #0x10]\n"
462                                 "fmax v12.4s, v12.4s, v1.4s\n"
463                                 "ldr q17, [%[inptr], #0x70]\n"
464                                 "fmin v13.4s, v13.4s, v0.4s\n"
465                                 "ldr q2, [%[outptr2], #0x20]\n"
466                                 "fmin v14.4s, v14.4s, v0.4s\n"
467                                 "str q11, [%[outptr0], #0x10]\n"
468                                 "fadd v15.4s, v15.4s, v7.4s\n"
469                                 "ldr q10, [%[inptr], #0x80]\n"
470                                 "fadd v16.4s, v16.4s, v8.4s\n"
471                                 "ldr q3, [%[outptr3]]\n"
472                                 "fmax v13.4s, v13.4s, v1.4s\n"
473                                 "str q12, [%[outptr0], #0x20]\n"
474                                 "fmax v14.4s, v14.4s, v1.4s\n"
475                                 "ldr q11, [%[inptr], #0x90]\n"
476                                 "fmin v15.4s, v15.4s, v0.4s\n"
477                                 "ldr q4, [%[outptr3], #0x10]\n"
478                                 "fmin v16.4s, v16.4s, v0.4s\n"
479                                 "str q13, [%[outptr1]]\n"
480                                 "fadd v17.4s, v17.4s, v9.4s\n"
481                                 "ldr q12, [%[inptr], #0xa0]\n"
482                                 "fadd v10.4s, v10.4s, v2.4s\n"
483                                 "ldr q5, [%[outptr3], #0x20]\n"
484                                 "fmax v15.4s, v15.4s, v1.4s\n"
485                                 "str q14, [%[outptr1], #0x10]\n"
486                                 "fmax v16.4s, v16.4s, v1.4s\n"
487                                 "ldr q13, [%[inptr], #0xb0]\n"
488                                 "fmin v17.4s, v17.4s, v0.4s\n"
489                                 "ldr q6, [%[outptr4]]\n"
490                                 "fmin v10.4s, v10.4s, v0.4s\n"
491                                 "str q15, [%[outptr1], #0x20]\n"
492                                 "fadd v11.4s, v11.4s, v3.4s\n"
493                                 "ldr q14, [%[inptr], #0xc0]\n"
494                                 "fadd v12.4s, v12.4s, v4.4s\n"
495                                 "ldr q7, [%[outptr4], #0x10]\n"
496                                 "fmax v17.4s, v17.4s, v1.4s\n"
497                                 "str q16, [%[outptr2]]\n"
498                                 "fmax v10.4s, v10.4s, v1.4s\n"
499                                 "ldr q15, [%[inptr], #0xd0]\n"
500                                 "fmin v11.4s, v11.4s, v0.4s\n"
501                                 "ldr q8, [%[outptr4], #0x20]\n"
502                                 "fmin v12.4s, v12.4s, v0.4s\n"
503                                 "str q17, [%[outptr2], #0x10]\n"
504                                 "fadd v13.4s, v13.4s, v5.4s\n"
505                                 "ldr q16, [%[inptr], #0xe0]\n"
506                                 "fadd v14.4s, v14.4s, v6.4s\n"
507                                 "add %[outptr0], %[outptr0], #0x30\n"
508                                 "fmax v11.4s, v11.4s, v1.4s\n"
509                                 "str q10, [%[outptr2], #0x20]\n"
510                                 "fmax v12.4s, v12.4s, v1.4s\n"
511                                 "add %[outptr1], %[outptr1], #0x30\n"
512                                 "fmin v13.4s, v13.4s, v0.4s\n"
513                                 "str q11, [%[outptr3]]\n"
514                                 "fmin v14.4s, v14.4s, v0.4s\n"
515                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
516                                 "fadd v15.4s, v15.4s, v7.4s\n"
517                                 "str q12, [%[outptr3], #0x10]\n"
518                                 "fmax v13.4s, v13.4s, v1.4s\n"
519                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
520                                 "fmax v14.4s, v14.4s, v1.4s\n"
521                                 "add %[outptr2], %[outptr2], #0x30\n"
522                                 "fmin v15.4s, v15.4s, v0.4s\n"
523                                 "str q13, [%[outptr3], #0x20]\n"
524                                 "fadd v16.4s, v16.4s, v8.4s\n"
525                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
526                                 "add %[outptr3], %[outptr3], #0x30\n"
527                                 "fmax v15.4s, v15.4s, v1.4s\n"
528                                 "str q14, [%[outptr4]]\n"
529                                 "fmin v16.4s, v16.4s, v0.4s\n"
530                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
531                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
532                                 "str q15, [%[outptr4], #0x10]\n"
533                                 "add %[inptr], %[inptr], #0x180\n"
534                                 "fmax v16.4s, v16.4s, v1.4s\n"
535                                 "str q16, [%[outptr4], #0x20]\n"
536                                 "add %[outptr4], %[outptr4], #0x30\n"
537                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
538                               [inptr] "+r" (inptr)
539                             : [minval] "w" (minval), [maxval] "w" (maxval)
540                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
541                             );
542                         }
543                     }
544                     break;
545 
546                 case 6:
547                     {
548                         if ((i+11) >= xmax)
549                         {
550                             for (int xi=0; xi<11; xi++)
551                             {
552                                 if ((i+xi) < xmax)
553                                 {
554                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
555                                     outptr0++;
556                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
557                                     outptr1++;
558                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
559                                     outptr2++;
560                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
561                                     outptr3++;
562                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
563                                     outptr4++;
564                                     *outptr5 = std::min(std::max(minval, inptr[xi + 60] + *outptr5), maxval);
565                                     outptr5++;
566                                 }
567                             }
568                             inptr += 96;
569                         } else {
570                             /* Optimized routine to accumulate an entire block into the output rows, clamping each result to [minval, maxval] */
571                             __asm __volatile (
572                                 "dup v0.4s, %[maxval].s[0]\n"
573                                 "ldr q2, [%[outptr0]]\n"
574                                 "dup v1.4s, %[minval].s[0]\n"
575                                 "ldr q10, [%[inptr]]\n"
576                                 "ldr q3, [%[outptr0], #0x10]\n"
577                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
578                                 "ldr q11, [%[inptr], #0x10]\n"
579                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
580                                 "fadd v10.4s, v10.4s, v2.4s\n"
581                                 "ldr q4, [%[outptr0], #0x20]\n"
582                                 "ldr q12, [%[inptr], #0x20]\n"
583                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
584                                 "fadd v11.4s, v11.4s, v3.4s\n"
585                                 "ldr q5, [%[outptr1]]\n"
586                                 "fmin v10.4s, v10.4s, v0.4s\n"
587                                 "ldr q13, [%[inptr], #0x30]\n"
588                                 "fadd v12.4s, v12.4s, v4.4s\n"
589                                 "ldr q6, [%[outptr1], #0x10]\n"
590                                 "ldr q14, [%[inptr], #0x40]\n"
591                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
592                                 "fmax v10.4s, v10.4s, v1.4s\n"
593                                 "ldr q7, [%[outptr1], #0x20]\n"
594                                 "fmin v11.4s, v11.4s, v0.4s\n"
595                                 "ldr q15, [%[inptr], #0x50]\n"
596                                 "fmin v12.4s, v12.4s, v0.4s\n"
597                                 "ldr q8, [%[outptr2]]\n"
598                                 "fadd v13.4s, v13.4s, v5.4s\n"
599                                 "str q10, [%[outptr0]]\n"
600                                 "fadd v14.4s, v14.4s, v6.4s\n"
601                                 "ldr q16, [%[inptr], #0x60]\n"
602                                 "fmax v11.4s, v11.4s, v1.4s\n"
603                                 "ldr q9, [%[outptr2], #0x10]\n"
604                                 "fmax v12.4s, v12.4s, v1.4s\n"
605                                 "ldr q17, [%[inptr], #0x70]\n"
606                                 "fmin v13.4s, v13.4s, v0.4s\n"
607                                 "ldr q2, [%[outptr2], #0x20]\n"
608                                 "fmin v14.4s, v14.4s, v0.4s\n"
609                                 "str q11, [%[outptr0], #0x10]\n"
610                                 "fadd v15.4s, v15.4s, v7.4s\n"
611                                 "ldr q10, [%[inptr], #0x80]\n"
612                                 "fadd v16.4s, v16.4s, v8.4s\n"
613                                 "ldr q3, [%[outptr3]]\n"
614                                 "fmax v13.4s, v13.4s, v1.4s\n"
615                                 "str q12, [%[outptr0], #0x20]\n"
616                                 "fmax v14.4s, v14.4s, v1.4s\n"
617                                 "ldr q11, [%[inptr], #0x90]\n"
618                                 "fmin v15.4s, v15.4s, v0.4s\n"
619                                 "ldr q4, [%[outptr3], #0x10]\n"
620                                 "fmin v16.4s, v16.4s, v0.4s\n"
621                                 "str q13, [%[outptr1]]\n"
622                                 "fadd v17.4s, v17.4s, v9.4s\n"
623                                 "ldr q12, [%[inptr], #0xa0]\n"
624                                 "fadd v10.4s, v10.4s, v2.4s\n"
625                                 "ldr q5, [%[outptr3], #0x20]\n"
626                                 "fmax v15.4s, v15.4s, v1.4s\n"
627                                 "str q14, [%[outptr1], #0x10]\n"
628                                 "fmax v16.4s, v16.4s, v1.4s\n"
629                                 "ldr q13, [%[inptr], #0xb0]\n"
630                                 "fmin v17.4s, v17.4s, v0.4s\n"
631                                 "ldr q6, [%[outptr4]]\n"
632                                 "fmin v10.4s, v10.4s, v0.4s\n"
633                                 "str q15, [%[outptr1], #0x20]\n"
634                                 "fadd v11.4s, v11.4s, v3.4s\n"
635                                 "ldr q14, [%[inptr], #0xc0]\n"
636                                 "fadd v12.4s, v12.4s, v4.4s\n"
637                                 "ldr q7, [%[outptr4], #0x10]\n"
638                                 "fmax v17.4s, v17.4s, v1.4s\n"
639                                 "str q16, [%[outptr2]]\n"
640                                 "fmax v10.4s, v10.4s, v1.4s\n"
641                                 "ldr q15, [%[inptr], #0xd0]\n"
642                                 "fmin v11.4s, v11.4s, v0.4s\n"
643                                 "ldr q8, [%[outptr4], #0x20]\n"
644                                 "fmin v12.4s, v12.4s, v0.4s\n"
645                                 "str q17, [%[outptr2], #0x10]\n"
646                                 "fadd v13.4s, v13.4s, v5.4s\n"
647                                 "ldr q16, [%[inptr], #0xe0]\n"
648                                 "fadd v14.4s, v14.4s, v6.4s\n"
649                                 "ldr q9, [%[outptr5]]\n"
650                                 "fmax v11.4s, v11.4s, v1.4s\n"
651                                 "str q10, [%[outptr2], #0x20]\n"
652                                 "fmax v12.4s, v12.4s, v1.4s\n"
653                                 "ldr q17, [%[inptr], #0xf0]\n"
654                                 "fmin v13.4s, v13.4s, v0.4s\n"
655                                 "ldr q2, [%[outptr5], #0x10]\n"
656                                 "fmin v14.4s, v14.4s, v0.4s\n"
657                                 "str q11, [%[outptr3]]\n"
658                                 "fadd v15.4s, v15.4s, v7.4s\n"
659                                 "ldr q10, [%[inptr], #0x100]\n"
660                                 "fadd v16.4s, v16.4s, v8.4s\n"
661                                 "ldr q3, [%[outptr5], #0x20]\n"
662                                 "fmax v13.4s, v13.4s, v1.4s\n"
663                                 "str q12, [%[outptr3], #0x10]\n"
664                                 "fmax v14.4s, v14.4s, v1.4s\n"
665                                 "ldr q11, [%[inptr], #0x110]\n"
666                                 "fmin v15.4s, v15.4s, v0.4s\n"
667                                 "add %[outptr0], %[outptr0], #0x30\n"
668                                 "fmin v16.4s, v16.4s, v0.4s\n"
669                                 "str q13, [%[outptr3], #0x20]\n"
670                                 "fadd v17.4s, v17.4s, v9.4s\n"
671                                 "add %[outptr1], %[outptr1], #0x30\n"
672                                 "fmax v15.4s, v15.4s, v1.4s\n"
673                                 "str q14, [%[outptr4]]\n"
674                                 "fmax v16.4s, v16.4s, v1.4s\n"
675                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
676                                 "fmin v17.4s, v17.4s, v0.4s\n"
677                                 "str q15, [%[outptr4], #0x10]\n"
678                                 "fadd v10.4s, v10.4s, v2.4s\n"
679                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
680                                 "fadd v11.4s, v11.4s, v3.4s\n"
681                                 "str q16, [%[outptr4], #0x20]\n"
682                                 "fmax v17.4s, v17.4s, v1.4s\n"
683                                 "add %[outptr2], %[outptr2], #0x30\n"
684                                 "fmin v10.4s, v10.4s, v0.4s\n"
685                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
686                                 "fmin v11.4s, v11.4s, v0.4s\n"
687                                 "str q17, [%[outptr5]]\n"
688                                 "add %[outptr3], %[outptr3], #0x30\n"
689                                 "fmax v10.4s, v10.4s, v1.4s\n"
690                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
691                                 "fmax v11.4s, v11.4s, v1.4s\n"
692                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
693                                 "str q10, [%[outptr5], #0x10]\n"
694                                 "add %[outptr4], %[outptr4], #0x30\n"
695                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
696                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
697                                 "str q11, [%[outptr5], #0x20]\n"
698                                 "add %[outptr5], %[outptr5], #0x30\n"
699                                 "add %[inptr], %[inptr], #0x180\n"
700                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
701                               [inptr] "+r" (inptr)
702                             : [minval] "w" (minval), [maxval] "w" (maxval)
703                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
704                             );
705                         }
706                     }
707                     break;
708 
709                 case 7:
710                     {
711                         if ((i+11) >= xmax)
712                         {
713                             for (int xi=0; xi<11; xi++)
714                             {
715                                 if ((i+xi) < xmax)
716                                 {
717                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
718                                     outptr0++;
719                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
720                                     outptr1++;
721                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
722                                     outptr2++;
723                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
724                                     outptr3++;
725                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
726                                     outptr4++;
727                                     *outptr5 = std::min(std::max(minval, inptr[xi + 60] + *outptr5), maxval);
728                                     outptr5++;
729                                     *outptr6 = std::min(std::max(minval, inptr[xi + 72] + *outptr6), maxval);
730                                     outptr6++;
731                                 }
732                             }
733                             inptr += 96;
734                         } else {
735                             /* Optimized routine to accumulate an entire block into the output rows, clamping each result to [minval, maxval] */
736                             __asm __volatile (
737                                 "dup v0.4s, %[maxval].s[0]\n"
738                                 "ldr q2, [%[outptr0]]\n"
739                                 "dup v1.4s, %[minval].s[0]\n"
740                                 "ldr q10, [%[inptr]]\n"
741                                 "ldr q3, [%[outptr0], #0x10]\n"
742                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
743                                 "ldr q11, [%[inptr], #0x10]\n"
744                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
745                                 "fadd v10.4s, v10.4s, v2.4s\n"
746                                 "ldr q4, [%[outptr0], #0x20]\n"
747                                 "ldr q12, [%[inptr], #0x20]\n"
748                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
749                                 "fadd v11.4s, v11.4s, v3.4s\n"
750                                 "ldr q5, [%[outptr1]]\n"
751                                 "fmin v10.4s, v10.4s, v0.4s\n"
752                                 "ldr q13, [%[inptr], #0x30]\n"
753                                 "fadd v12.4s, v12.4s, v4.4s\n"
754                                 "ldr q6, [%[outptr1], #0x10]\n"
755                                 "ldr q14, [%[inptr], #0x40]\n"
756                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
757                                 "fmax v10.4s, v10.4s, v1.4s\n"
758                                 "ldr q7, [%[outptr1], #0x20]\n"
759                                 "fmin v11.4s, v11.4s, v0.4s\n"
760                                 "ldr q15, [%[inptr], #0x50]\n"
761                                 "fmin v12.4s, v12.4s, v0.4s\n"
762                                 "ldr q8, [%[outptr2]]\n"
763                                 "fadd v13.4s, v13.4s, v5.4s\n"
764                                 "str q10, [%[outptr0]]\n"
765                                 "fadd v14.4s, v14.4s, v6.4s\n"
766                                 "ldr q16, [%[inptr], #0x60]\n"
767                                 "fmax v11.4s, v11.4s, v1.4s\n"
768                                 "ldr q9, [%[outptr2], #0x10]\n"
769                                 "fmax v12.4s, v12.4s, v1.4s\n"
770                                 "ldr q17, [%[inptr], #0x70]\n"
771                                 "fmin v13.4s, v13.4s, v0.4s\n"
772                                 "ldr q2, [%[outptr2], #0x20]\n"
773                                 "fmin v14.4s, v14.4s, v0.4s\n"
774                                 "str q11, [%[outptr0], #0x10]\n"
775                                 "fadd v15.4s, v15.4s, v7.4s\n"
776                                 "ldr q10, [%[inptr], #0x80]\n"
777                                 "fadd v16.4s, v16.4s, v8.4s\n"
778                                 "ldr q3, [%[outptr3]]\n"
779                                 "fmax v13.4s, v13.4s, v1.4s\n"
780                                 "str q12, [%[outptr0], #0x20]\n"
781                                 "fmax v14.4s, v14.4s, v1.4s\n"
782                                 "ldr q11, [%[inptr], #0x90]\n"
783                                 "fmin v15.4s, v15.4s, v0.4s\n"
784                                 "ldr q4, [%[outptr3], #0x10]\n"
785                                 "fmin v16.4s, v16.4s, v0.4s\n"
786                                 "str q13, [%[outptr1]]\n"
787                                 "fadd v17.4s, v17.4s, v9.4s\n"
788                                 "ldr q12, [%[inptr], #0xa0]\n"
789                                 "fadd v10.4s, v10.4s, v2.4s\n"
790                                 "ldr q5, [%[outptr3], #0x20]\n"
791                                 "fmax v15.4s, v15.4s, v1.4s\n"
792                                 "str q14, [%[outptr1], #0x10]\n"
793                                 "fmax v16.4s, v16.4s, v1.4s\n"
794                                 "ldr q13, [%[inptr], #0xb0]\n"
795                                 "fmin v17.4s, v17.4s, v0.4s\n"
796                                 "ldr q6, [%[outptr4]]\n"
797                                 "fmin v10.4s, v10.4s, v0.4s\n"
798                                 "str q15, [%[outptr1], #0x20]\n"
799                                 "fadd v11.4s, v11.4s, v3.4s\n"
800                                 "ldr q14, [%[inptr], #0xc0]\n"
801                                 "fadd v12.4s, v12.4s, v4.4s\n"
802                                 "ldr q7, [%[outptr4], #0x10]\n"
803                                 "fmax v17.4s, v17.4s, v1.4s\n"
804                                 "str q16, [%[outptr2]]\n"
805                                 "fmax v10.4s, v10.4s, v1.4s\n"
806                                 "ldr q15, [%[inptr], #0xd0]\n"
807                                 "fmin v11.4s, v11.4s, v0.4s\n"
808                                 "ldr q8, [%[outptr4], #0x20]\n"
809                                 "fmin v12.4s, v12.4s, v0.4s\n"
810                                 "str q17, [%[outptr2], #0x10]\n"
811                                 "fadd v13.4s, v13.4s, v5.4s\n"
812                                 "ldr q16, [%[inptr], #0xe0]\n"
813                                 "fadd v14.4s, v14.4s, v6.4s\n"
814                                 "ldr q9, [%[outptr5]]\n"
815                                 "fmax v11.4s, v11.4s, v1.4s\n"
816                                 "str q10, [%[outptr2], #0x20]\n"
817                                 "fmax v12.4s, v12.4s, v1.4s\n"
818                                 "ldr q17, [%[inptr], #0xf0]\n"
819                                 "fmin v13.4s, v13.4s, v0.4s\n"
820                                 "ldr q2, [%[outptr5], #0x10]\n"
821                                 "fmin v14.4s, v14.4s, v0.4s\n"
822                                 "str q11, [%[outptr3]]\n"
823                                 "fadd v15.4s, v15.4s, v7.4s\n"
824                                 "ldr q10, [%[inptr], #0x100]\n"
825                                 "fadd v16.4s, v16.4s, v8.4s\n"
826                                 "ldr q3, [%[outptr5], #0x20]\n"
827                                 "fmax v13.4s, v13.4s, v1.4s\n"
828                                 "str q12, [%[outptr3], #0x10]\n"
829                                 "fmax v14.4s, v14.4s, v1.4s\n"
830                                 "ldr q11, [%[inptr], #0x110]\n"
831                                 "fmin v15.4s, v15.4s, v0.4s\n"
832                                 "ldr q4, [%[outptr6]]\n"
833                                 "fmin v16.4s, v16.4s, v0.4s\n"
834                                 "str q13, [%[outptr3], #0x20]\n"
835                                 "fadd v17.4s, v17.4s, v9.4s\n"
836                                 "ldr q12, [%[inptr], #0x120]\n"
837                                 "fadd v10.4s, v10.4s, v2.4s\n"
838                                 "ldr q5, [%[outptr6], #0x10]\n"
839                                 "fmax v15.4s, v15.4s, v1.4s\n"
840                                 "str q14, [%[outptr4]]\n"
841                                 "fmax v16.4s, v16.4s, v1.4s\n"
842                                 "ldr q13, [%[inptr], #0x130]\n"
843                                 "fmin v17.4s, v17.4s, v0.4s\n"
844                                 "ldr q6, [%[outptr6], #0x20]\n"
845                                 "fmin v10.4s, v10.4s, v0.4s\n"
846                                 "str q15, [%[outptr4], #0x10]\n"
847                                 "fadd v11.4s, v11.4s, v3.4s\n"
848                                 "ldr q14, [%[inptr], #0x140]\n"
849                                 "fadd v12.4s, v12.4s, v4.4s\n"
850                                 "add %[outptr0], %[outptr0], #0x30\n"
851                                 "fmax v17.4s, v17.4s, v1.4s\n"
852                                 "str q16, [%[outptr4], #0x20]\n"
853                                 "fmax v10.4s, v10.4s, v1.4s\n"
854                                 "add %[outptr1], %[outptr1], #0x30\n"
855                                 "fmin v11.4s, v11.4s, v0.4s\n"
856                                 "str q17, [%[outptr5]]\n"
857                                 "fmin v12.4s, v12.4s, v0.4s\n"
858                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
859                                 "fadd v13.4s, v13.4s, v5.4s\n"
860                                 "str q10, [%[outptr5], #0x10]\n"
861                                 "fmax v11.4s, v11.4s, v1.4s\n"
862                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
863                                 "fmax v12.4s, v12.4s, v1.4s\n"
864                                 "add %[outptr2], %[outptr2], #0x30\n"
865                                 "fmin v13.4s, v13.4s, v0.4s\n"
866                                 "str q11, [%[outptr5], #0x20]\n"
867                                 "fadd v14.4s, v14.4s, v6.4s\n"
868                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
869                                 "add %[outptr3], %[outptr3], #0x30\n"
870                                 "fmax v13.4s, v13.4s, v1.4s\n"
871                                 "str q12, [%[outptr6]]\n"
872                                 "fmin v14.4s, v14.4s, v0.4s\n"
873                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
874                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
875                                 "str q13, [%[outptr6], #0x10]\n"
876                                 "add %[outptr4], %[outptr4], #0x30\n"
877                                 "fmax v14.4s, v14.4s, v1.4s\n"
878                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
879                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
880                                 "add %[outptr5], %[outptr5], #0x30\n"
881                                 "str q14, [%[outptr6], #0x20]\n"
882                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
883                                 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
884                                 "add %[outptr6], %[outptr6], #0x30\n"
885                                 "add %[inptr], %[inptr], #0x180\n"
886                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
887                               [inptr] "+r" (inptr)
888                             : [minval] "w" (minval), [maxval] "w" (maxval)
889                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
890                             );
891                         }
892                     }
893                     break;
894 
895                 default:
896                 case 8:
897                     {
898                         if ((i+11) >= xmax)
899                         {
900                             for (int xi=0; xi<11; xi++)
901                             {
902                                 if ((i+xi) < xmax)
903                                 {
904                                     *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
905                                     outptr0++;
906                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
907                                     outptr1++;
908                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
909                                     outptr2++;
910                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
911                                     outptr3++;
912                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
913                                     outptr4++;
914                                     *outptr5 = std::min(std::max(minval, inptr[xi + 60] + *outptr5), maxval);
915                                     outptr5++;
916                                     *outptr6 = std::min(std::max(minval, inptr[xi + 72] + *outptr6), maxval);
917                                     outptr6++;
918                                     *outptr7 = std::min(std::max(minval, inptr[xi + 84] + *outptr7), maxval);
919                                     outptr7++;
920                                 }
921                             }
922                             inptr += 96;
923                         } else {
924                             /* Optimized routine to accumulate an entire block into the output rows, clamping each result to [minval, maxval] */
925                             __asm __volatile (
926                                 "dup v0.4s, %[maxval].s[0]\n"
927                                 "ldr q2, [%[outptr0]]\n"
928                                 "dup v1.4s, %[minval].s[0]\n"
929                                 "ldr q10, [%[inptr]]\n"
930                                 "ldr q3, [%[outptr0], #0x10]\n"
931                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
932                                 "ldr q11, [%[inptr], #0x10]\n"
933                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
934                                 "fadd v10.4s, v10.4s, v2.4s\n"
935                                 "ldr q4, [%[outptr0], #0x20]\n"
936                                 "ldr q12, [%[inptr], #0x20]\n"
937                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
938                                 "fadd v11.4s, v11.4s, v3.4s\n"
939                                 "ldr q5, [%[outptr1]]\n"
940                                 "fmin v10.4s, v10.4s, v0.4s\n"
941                                 "ldr q13, [%[inptr], #0x30]\n"
942                                 "fadd v12.4s, v12.4s, v4.4s\n"
943                                 "ldr q6, [%[outptr1], #0x10]\n"
944                                 "ldr q14, [%[inptr], #0x40]\n"
945                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
946                                 "fmax v10.4s, v10.4s, v1.4s\n"
947                                 "ldr q7, [%[outptr1], #0x20]\n"
948                                 "fmin v11.4s, v11.4s, v0.4s\n"
949                                 "ldr q15, [%[inptr], #0x50]\n"
950                                 "fmin v12.4s, v12.4s, v0.4s\n"
951                                 "ldr q8, [%[outptr2]]\n"
952                                 "fadd v13.4s, v13.4s, v5.4s\n"
953                                 "str q10, [%[outptr0]]\n"
954                                 "fadd v14.4s, v14.4s, v6.4s\n"
955                                 "ldr q16, [%[inptr], #0x60]\n"
956                                 "fmax v11.4s, v11.4s, v1.4s\n"
957                                 "ldr q9, [%[outptr2], #0x10]\n"
958                                 "fmax v12.4s, v12.4s, v1.4s\n"
959                                 "ldr q17, [%[inptr], #0x70]\n"
960                                 "fmin v13.4s, v13.4s, v0.4s\n"
961                                 "ldr q2, [%[outptr2], #0x20]\n"
962                                 "fmin v14.4s, v14.4s, v0.4s\n"
963                                 "str q11, [%[outptr0], #0x10]\n"
964                                 "fadd v15.4s, v15.4s, v7.4s\n"
965                                 "ldr q10, [%[inptr], #0x80]\n"
966                                 "fadd v16.4s, v16.4s, v8.4s\n"
967                                 "ldr q3, [%[outptr3]]\n"
968                                 "fmax v13.4s, v13.4s, v1.4s\n"
969                                 "str q12, [%[outptr0], #0x20]\n"
970                                 "fmax v14.4s, v14.4s, v1.4s\n"
971                                 "ldr q11, [%[inptr], #0x90]\n"
972                                 "fmin v15.4s, v15.4s, v0.4s\n"
973                                 "ldr q4, [%[outptr3], #0x10]\n"
974                                 "fmin v16.4s, v16.4s, v0.4s\n"
975                                 "str q13, [%[outptr1]]\n"
976                                 "fadd v17.4s, v17.4s, v9.4s\n"
977                                 "ldr q12, [%[inptr], #0xa0]\n"
978                                 "fadd v10.4s, v10.4s, v2.4s\n"
979                                 "ldr q5, [%[outptr3], #0x20]\n"
980                                 "fmax v15.4s, v15.4s, v1.4s\n"
981                                 "str q14, [%[outptr1], #0x10]\n"
982                                 "fmax v16.4s, v16.4s, v1.4s\n"
983                                 "ldr q13, [%[inptr], #0xb0]\n"
984                                 "fmin v17.4s, v17.4s, v0.4s\n"
985                                 "ldr q6, [%[outptr4]]\n"
986                                 "fmin v10.4s, v10.4s, v0.4s\n"
987                                 "str q15, [%[outptr1], #0x20]\n"
988                                 "fadd v11.4s, v11.4s, v3.4s\n"
989                                 "ldr q14, [%[inptr], #0xc0]\n"
990                                 "fadd v12.4s, v12.4s, v4.4s\n"
991                                 "ldr q7, [%[outptr4], #0x10]\n"
992                                 "fmax v17.4s, v17.4s, v1.4s\n"
993                                 "str q16, [%[outptr2]]\n"
994                                 "fmax v10.4s, v10.4s, v1.4s\n"
995                                 "ldr q15, [%[inptr], #0xd0]\n"
996                                 "fmin v11.4s, v11.4s, v0.4s\n"
997                                 "ldr q8, [%[outptr4], #0x20]\n"
998                                 "fmin v12.4s, v12.4s, v0.4s\n"
999                                 "str q17, [%[outptr2], #0x10]\n"
1000                                 "fadd v13.4s, v13.4s, v5.4s\n"
1001                                 "ldr q16, [%[inptr], #0xe0]\n"
1002                                 "fadd v14.4s, v14.4s, v6.4s\n"
1003                                 "ldr q9, [%[outptr5]]\n"
1004                                 "fmax v11.4s, v11.4s, v1.4s\n"
1005                                 "str q10, [%[outptr2], #0x20]\n"
1006                                 "fmax v12.4s, v12.4s, v1.4s\n"
1007                                 "ldr q17, [%[inptr], #0xf0]\n"
1008                                 "fmin v13.4s, v13.4s, v0.4s\n"
1009                                 "ldr q2, [%[outptr5], #0x10]\n"
1010                                 "fmin v14.4s, v14.4s, v0.4s\n"
1011                                 "str q11, [%[outptr3]]\n"
1012                                 "fadd v15.4s, v15.4s, v7.4s\n"
1013                                 "ldr q10, [%[inptr], #0x100]\n"
1014                                 "fadd v16.4s, v16.4s, v8.4s\n"
1015                                 "ldr q3, [%[outptr5], #0x20]\n"
1016                                 "fmax v13.4s, v13.4s, v1.4s\n"
1017                                 "str q12, [%[outptr3], #0x10]\n"
1018                                 "fmax v14.4s, v14.4s, v1.4s\n"
1019                                 "ldr q11, [%[inptr], #0x110]\n"
1020                                 "fmin v15.4s, v15.4s, v0.4s\n"
1021                                 "ldr q4, [%[outptr6]]\n"
1022                                 "fmin v16.4s, v16.4s, v0.4s\n"
1023                                 "str q13, [%[outptr3], #0x20]\n"
1024                                 "fadd v17.4s, v17.4s, v9.4s\n"
1025                                 "ldr q12, [%[inptr], #0x120]\n"
1026                                 "fadd v10.4s, v10.4s, v2.4s\n"
1027                                 "ldr q5, [%[outptr6], #0x10]\n"
1028                                 "fmax v15.4s, v15.4s, v1.4s\n"
1029                                 "str q14, [%[outptr4]]\n"
1030                                 "fmax v16.4s, v16.4s, v1.4s\n"
1031                                 "ldr q13, [%[inptr], #0x130]\n"
1032                                 "fmin v17.4s, v17.4s, v0.4s\n"
1033                                 "ldr q6, [%[outptr6], #0x20]\n"
1034                                 "fmin v10.4s, v10.4s, v0.4s\n"
1035                                 "str q15, [%[outptr4], #0x10]\n"
1036                                 "fadd v11.4s, v11.4s, v3.4s\n"
1037                                 "ldr q14, [%[inptr], #0x140]\n"
1038                                 "fadd v12.4s, v12.4s, v4.4s\n"
1039                                 "ldr q7, [%[outptr7]]\n"
1040                                 "fmax v17.4s, v17.4s, v1.4s\n"
1041                                 "str q16, [%[outptr4], #0x20]\n"
1042                                 "fmax v10.4s, v10.4s, v1.4s\n"
1043                                 "ldr q15, [%[inptr], #0x150]\n"
1044                                 "fmin v11.4s, v11.4s, v0.4s\n"
1045                                 "ldr q8, [%[outptr7], #0x10]\n"
1046                                 "fmin v12.4s, v12.4s, v0.4s\n"
1047                                 "str q17, [%[outptr5]]\n"
1048                                 "fadd v13.4s, v13.4s, v5.4s\n"
1049                                 "ldr q16, [%[inptr], #0x160]\n"
1050                                 "fadd v14.4s, v14.4s, v6.4s\n"
1051                                 "ldr q9, [%[outptr7], #0x20]\n"
1052                                 "fmax v11.4s, v11.4s, v1.4s\n"
1053                                 "str q10, [%[outptr5], #0x10]\n"
1054                                 "fmax v12.4s, v12.4s, v1.4s\n"
1055                                 "ldr q17, [%[inptr], #0x170]\n"
1056                                 "fmin v13.4s, v13.4s, v0.4s\n"
1057                                 "add %[outptr0], %[outptr0], #0x30\n"
1058                                 "fmin v14.4s, v14.4s, v0.4s\n"
1059                                 "str q11, [%[outptr5], #0x20]\n"
1060                                 "fadd v15.4s, v15.4s, v7.4s\n"
1061                                 "add %[outptr1], %[outptr1], #0x30\n"
1062                                 "fmax v13.4s, v13.4s, v1.4s\n"
1063                                 "str q12, [%[outptr6]]\n"
1064                                 "fmax v14.4s, v14.4s, v1.4s\n"
1065                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1066                                 "fmin v15.4s, v15.4s, v0.4s\n"
1067                                 "str q13, [%[outptr6], #0x10]\n"
1068                                 "fadd v16.4s, v16.4s, v8.4s\n"
1069                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1070                                 "fadd v17.4s, v17.4s, v9.4s\n"
1071                                 "str q14, [%[outptr6], #0x20]\n"
1072                                 "fmax v15.4s, v15.4s, v1.4s\n"
1073                                 "add %[outptr2], %[outptr2], #0x30\n"
1074                                 "fmin v16.4s, v16.4s, v0.4s\n"
1075                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1076                                 "fmin v17.4s, v17.4s, v0.4s\n"
1077                                 "str q15, [%[outptr7]]\n"
1078                                 "add %[outptr3], %[outptr3], #0x30\n"
1079                                 "fmax v16.4s, v16.4s, v1.4s\n"
1080                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1081                                 "fmax v17.4s, v17.4s, v1.4s\n"
1082                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1083                                 "str q16, [%[outptr7], #0x10]\n"
1084                                 "add %[outptr4], %[outptr4], #0x30\n"
1085                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1086                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1087                                 "str q17, [%[outptr7], #0x20]\n"
1088                                 "add %[outptr5], %[outptr5], #0x30\n"
1089                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1090                                 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1091                                 "add %[outptr6], %[outptr6], #0x30\n"
1092                                 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
1093                                 "add %[outptr7], %[outptr7], #0x30\n"
1094                                 "add %[inptr], %[inptr], #0x180\n"
1095                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1096                               [inptr] "+r" (inptr)
1097                             : [minval] "w" (minval), [maxval] "w" (maxval)
1098                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1099                             );
1100                         }
1101                     }
1102                     break;
1103 
1104 
1105                 }
1106             }
1107             else
1108             {
1109                 const float *biasptr = bias ? bias + i : nullbias;
1110 
1111                 switch(height)
1112                 {
1113                 case 1:
                     /*
                      * height == 1: only outptr0 is written. Each output is
                      * (input + bias) clamped to [minval, maxval].
                      */
1114                     {
1115                         if ((i+11) >= xmax)
1116                         {
                                 /*
                                  * Partial block: (i+11) >= xmax means at most 11 columns
                                  * remain, so xi only needs to cover 0..10. The inner test
                                  * stops the writes at xmax.
                                  */
1117                             for (int xi=0; xi<11; xi++)
1118                             {
1119                                 if ((i+xi) < xmax)
1120                                 {
1121                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1122                                     outptr0++;
1123                                 }
1124                             }
                                 /* Skip the whole input block: 96 floats (0x180 bytes), the same step the asm path takes. */
1125                             inptr += 96;
1126                         } else {
1127                             /* Optimized routine to copy an entire block */
                                 /*
                                  * Full 12-column block. Register use:
                                  *   v0 = maxval broadcast, v1 = minval broadcast,
                                  *   v2-v4 = 12 bias values, v13-v15 = 12 input values.
                                  * The clamp is fmin against v0 followed by fmax against v1.
                                  * The operand and clobber lists name more registers than
                                  * this case actually touches.
                                  */
1128                             __asm __volatile (
1129                                 "dup v0.4s, %[maxval].s[0]\n"
1130                                 "ldr q2, [%[biasptr]]\n"
1131                                 "dup v1.4s, %[minval].s[0]\n"
1132                                 "ldr q3, [%[biasptr], #0x10]\n"
1133                                 "ldr q4, [%[biasptr], #0x20]\n"
                                     /* Prefetch input 0x180 bytes ahead, i.e. where inptr will point after this block. */
1134                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1135                                 "ldr q13, [%[inptr]]\n"
1136                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1137                                 "ldr q14, [%[inptr], #0x10]\n"
1138                                 "ldr q15, [%[inptr], #0x20]\n"
1139                                 "add %[inptr], %[inptr], #0x180\n"
1140                                 "fadd v13.4s, v13.4s, v2.4s\n"
1141                                 "fadd v14.4s, v14.4s, v3.4s\n"
1142                                 "fadd v15.4s, v15.4s, v4.4s\n"
1143                                 "fmin v13.4s, v13.4s, v0.4s\n"
1144                                 "fmin v14.4s, v14.4s, v0.4s\n"
1145                                 "fmin v15.4s, v15.4s, v0.4s\n"
1146                                 "fmax v13.4s, v13.4s, v1.4s\n"
1147                                 "fmax v14.4s, v14.4s, v1.4s\n"
1148                                 "fmax v15.4s, v15.4s, v1.4s\n"
1149                                 "str q13, [%[outptr0]]\n"
1150                                 "str q14, [%[outptr0], #0x10]\n"
1151                                 "str q15, [%[outptr0], #0x20]\n"
                                     /* Advance the output pointer by the 12 floats (0x30 bytes) just written. */
1152                                 "add %[outptr0], %[outptr0], #0x30\n"
1153                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1154                               [inptr] "+r" (inptr)
1155                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1156                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1157                             );
1158                         }
1159                     }
1160                     break;
1161 
1161                 case 2:
                     /*
                      * height == 2: outptr0 and outptr1 are written. Row r of the
                      * input block starts at inptr + 12*r floats; the same 12 bias
                      * values are added to every row before clamping to
                      * [minval, maxval].
                      */
1162                     {
1163                         if ((i+11) >= xmax)
1164                         {
                                 /*
                                  * Partial block: at most 11 columns remain here, so xi
                                  * covers 0..10 and the inner test stops the writes at xmax.
                                  */
1165                             for (int xi=0; xi<11; xi++)
1166                             {
1167                                 if ((i+xi) < xmax)
1168                                 {
1169                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1170                                     outptr0++;
1171                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1172                                     outptr1++;
1173                                 }
1174                             }
                                 /* Skip the whole input block: 96 floats (0x180 bytes), the same step the asm path takes. */
1175                             inptr += 96;
1176                         } else {
1177                             /* Optimized routine to copy an entire block */
                                 /*
                                  * Full 12-column block. Register use:
                                  *   v0 = maxval broadcast, v1 = minval broadcast,
                                  *   v2-v4 = 12 bias values,
                                  *   v13-v15 = row 0 (input offsets 0x00-0x20),
                                  *   v16-v18 = row 1 (input offsets 0x30-0x50).
                                  * Loads, arithmetic and stores are interleaved. Each value
                                  * goes through fadd (bias), fmin (against v0) and fmax
                                  * (against v1) before it is stored.
                                  */
1178                             __asm __volatile (
1179                                 "dup v0.4s, %[maxval].s[0]\n"
1180                                 "ldr q2, [%[biasptr]]\n"
1181                                 "dup v1.4s, %[minval].s[0]\n"
1182                                 "ldr q3, [%[biasptr], #0x10]\n"
1183                                 "ldr q4, [%[biasptr], #0x20]\n"
1184                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1185                                 "ldr q13, [%[inptr]]\n"
1186                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1187                                 "ldr q14, [%[inptr], #0x10]\n"
1188                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1189                                 "fadd v13.4s, v13.4s, v2.4s\n"
1190                                 "ldr q15, [%[inptr], #0x20]\n"
1191                                 "ldr q16, [%[inptr], #0x30]\n"
1192                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1193                                 "fadd v14.4s, v14.4s, v3.4s\n"
1194                                 "ldr q17, [%[inptr], #0x40]\n"
1195                                 "fmin v13.4s, v13.4s, v0.4s\n"
1196                                 "ldr q18, [%[inptr], #0x50]\n"
1197                                 "fadd v15.4s, v15.4s, v4.4s\n"
                                     /* All input for this block is loaded by now, so inptr can move on to the next block. */
1198                                 "add %[inptr], %[inptr], #0x180\n"
1199                                 "fmin v14.4s, v14.4s, v0.4s\n"
1200                                 "fmax v13.4s, v13.4s, v1.4s\n"
1201                                 "fmin v15.4s, v15.4s, v0.4s\n"
1202                                 "fadd v16.4s, v16.4s, v2.4s\n"
1203                                 "fmax v14.4s, v14.4s, v1.4s\n"
1204                                 "str q13, [%[outptr0]]\n"
1205                                 "fadd v17.4s, v17.4s, v3.4s\n"
1206                                 "fmax v15.4s, v15.4s, v1.4s\n"
1207                                 "fmin v16.4s, v16.4s, v0.4s\n"
1208                                 "str q14, [%[outptr0], #0x10]\n"
1209                                 "fadd v18.4s, v18.4s, v4.4s\n"
1210                                 "fmin v17.4s, v17.4s, v0.4s\n"
1211                                 "fmax v16.4s, v16.4s, v1.4s\n"
1212                                 "str q15, [%[outptr0], #0x20]\n"
1213                                 "fmin v18.4s, v18.4s, v0.4s\n"
1214                                 "add %[outptr0], %[outptr0], #0x30\n"
1215                                 "fmax v17.4s, v17.4s, v1.4s\n"
1216                                 "str q16, [%[outptr1]]\n"
1217                                 "fmax v18.4s, v18.4s, v1.4s\n"
1218                                 "str q17, [%[outptr1], #0x10]\n"
1219                                 "str q18, [%[outptr1], #0x20]\n"
1220                                 "add %[outptr1], %[outptr1], #0x30\n"
1221                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1222                               [inptr] "+r" (inptr)
1223                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1224                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1225                             );
1226                         }
1227                     }
1228                     break;
1230 
1230                 case 3:
                     /*
                      * height == 3: outptr0..outptr2 are written. Row r of the
                      * input block starts at inptr + 12*r floats; the same 12 bias
                      * values are added to every row before clamping to
                      * [minval, maxval].
                      */
1231                     {
1232                         if ((i+11) >= xmax)
1233                         {
                                 /*
                                  * Partial block: at most 11 columns remain here, so xi
                                  * covers 0..10 and the inner test stops the writes at xmax.
                                  */
1234                             for (int xi=0; xi<11; xi++)
1235                             {
1236                                 if ((i+xi) < xmax)
1237                                 {
1238                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1239                                     outptr0++;
1240                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1241                                     outptr1++;
1242                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1243                                     outptr2++;
1244                                 }
1245                             }
                                 /* Skip the whole input block: 96 floats (0x180 bytes), the same step the asm path takes. */
1246                             inptr += 96;
1247                         } else {
1248                             /* Optimized routine to copy an entire block */
                                 /*
                                  * Full 12-column block. Register use:
                                  *   v0 = maxval broadcast, v1 = minval broadcast,
                                  *   v2-v4 = 12 bias values,
                                  *   v13-v15 = row 0 (input offsets 0x00-0x20),
                                  *   v16-v18 = row 1 (input offsets 0x30-0x50),
                                  *   v19, v20 and a reloaded v13 = row 2 (0x60, 0x70, 0x80).
                                  * v13 is reloaded only after its row-0 result has been
                                  * stored, so the instruction order around that store and
                                  * load matters.
                                  */
1249                             __asm __volatile (
1250                                 "dup v0.4s, %[maxval].s[0]\n"
1251                                 "ldr q2, [%[biasptr]]\n"
1252                                 "dup v1.4s, %[minval].s[0]\n"
1253                                 "ldr q3, [%[biasptr], #0x10]\n"
1254                                 "ldr q4, [%[biasptr], #0x20]\n"
1255                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1256                                 "ldr q13, [%[inptr]]\n"
1257                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1258                                 "ldr q14, [%[inptr], #0x10]\n"
1259                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1260                                 "fadd v13.4s, v13.4s, v2.4s\n"
1261                                 "ldr q15, [%[inptr], #0x20]\n"
1262                                 "ldr q16, [%[inptr], #0x30]\n"
1263                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1264                                 "fadd v14.4s, v14.4s, v3.4s\n"
1265                                 "ldr q17, [%[inptr], #0x40]\n"
1266                                 "fmin v13.4s, v13.4s, v0.4s\n"
1267                                 "ldr q18, [%[inptr], #0x50]\n"
1268                                 "fadd v15.4s, v15.4s, v4.4s\n"
1269                                 "ldr q19, [%[inptr], #0x60]\n"
1270                                 "fadd v16.4s, v16.4s, v2.4s\n"
1271                                 "ldr q20, [%[inptr], #0x70]\n"
1272                                 "fmin v14.4s, v14.4s, v0.4s\n"
1273                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1274                                 "fmax v13.4s, v13.4s, v1.4s\n"
1275                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1276                                 "fmax v14.4s, v14.4s, v1.4s\n"
1277                                 "fmin v15.4s, v15.4s, v0.4s\n"
1278                                 "str q13, [%[outptr0]]\n"
1279                                 "fmin v16.4s, v16.4s, v0.4s\n"
                                     /* v13 is free again after the store above; reuse it for the last vector of row 2. */
1280                                 "ldr q13, [%[inptr], #0x80]\n"
1281                                 "fadd v17.4s, v17.4s, v3.4s\n"
                                     /* Last input load done: move inptr to the next block. */
1282                                 "add %[inptr], %[inptr], #0x180\n"
1283                                 "fmax v15.4s, v15.4s, v1.4s\n"
1284                                 "str q14, [%[outptr0], #0x10]\n"
1285                                 "fmax v16.4s, v16.4s, v1.4s\n"
1286                                 "fmin v17.4s, v17.4s, v0.4s\n"
1287                                 "fadd v18.4s, v18.4s, v4.4s\n"
1288                                 "str q15, [%[outptr0], #0x20]\n"
1289                                 "fadd v19.4s, v19.4s, v2.4s\n"
1290                                 "add %[outptr0], %[outptr0], #0x30\n"
1291                                 "fmax v17.4s, v17.4s, v1.4s\n"
1292                                 "str q16, [%[outptr1]]\n"
1293                                 "fmin v18.4s, v18.4s, v0.4s\n"
1294                                 "fmin v19.4s, v19.4s, v0.4s\n"
1295                                 "fadd v20.4s, v20.4s, v3.4s\n"
1296                                 "str q17, [%[outptr1], #0x10]\n"
1297                                 "fadd v13.4s, v13.4s, v4.4s\n"
1298                                 "fmax v18.4s, v18.4s, v1.4s\n"
1299                                 "fmax v19.4s, v19.4s, v1.4s\n"
1300                                 "fmin v20.4s, v20.4s, v0.4s\n"
1301                                 "fmin v13.4s, v13.4s, v0.4s\n"
1302                                 "str q18, [%[outptr1], #0x20]\n"
1303                                 "add %[outptr1], %[outptr1], #0x30\n"
1304                                 "fmax v20.4s, v20.4s, v1.4s\n"
1305                                 "str q19, [%[outptr2]]\n"
1306                                 "fmax v13.4s, v13.4s, v1.4s\n"
1307                                 "str q20, [%[outptr2], #0x10]\n"
1308                                 "str q13, [%[outptr2], #0x20]\n"
1309                                 "add %[outptr2], %[outptr2], #0x30\n"
1310                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1311                               [inptr] "+r" (inptr)
1312                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1313                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1314                             );
1315                         }
1316                     }
1317                     break;
1319 
1319                 case 4:
                     /*
                      * height == 4: outptr0..outptr3 are written. Row r of the
                      * input block starts at inptr + 12*r floats; the same 12 bias
                      * values are added to every row before clamping to
                      * [minval, maxval].
                      */
1320                     {
1321                         if ((i+11) >= xmax)
1322                         {
                                 /*
                                  * Partial block: at most 11 columns remain here, so xi
                                  * covers 0..10 and the inner test stops the writes at xmax.
                                  */
1323                             for (int xi=0; xi<11; xi++)
1324                             {
1325                                 if ((i+xi) < xmax)
1326                                 {
1327                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1328                                     outptr0++;
1329                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1330                                     outptr1++;
1331                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1332                                     outptr2++;
1333                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1334                                     outptr3++;
1335                                 }
1336                             }
                                 /* Skip the whole input block: 96 floats (0x180 bytes), the same step the asm path takes. */
1337                             inptr += 96;
1338                         } else {
1339                             /* Optimized routine to copy an entire block */
                                 /*
                                  * Full 12-column block. Register use:
                                  *   v0 = maxval broadcast, v1 = minval broadcast,
                                  *   v2-v4 = 12 bias values,
                                  *   v13-v15 = row 0 (input offsets 0x00-0x20),
                                  *   v16-v18 = row 1 (input offsets 0x30-0x50),
                                  *   v19, v20 and a reloaded v13 = row 2 (0x60, 0x70, 0x80),
                                  *   reloaded v14, v15, v16 = row 3 (0x90, 0xa0, 0xb0).
                                  * Each reused register is reloaded only after its earlier
                                  * result has been stored, so the relative order of each
                                  * store/load pair matters.
                                  */
1340                             __asm __volatile (
1341                                 "dup v0.4s, %[maxval].s[0]\n"
1342                                 "ldr q2, [%[biasptr]]\n"
1343                                 "dup v1.4s, %[minval].s[0]\n"
1344                                 "ldr q3, [%[biasptr], #0x10]\n"
1345                                 "ldr q4, [%[biasptr], #0x20]\n"
1346                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1347                                 "ldr q13, [%[inptr]]\n"
1348                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1349                                 "ldr q14, [%[inptr], #0x10]\n"
1350                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1351                                 "fadd v13.4s, v13.4s, v2.4s\n"
1352                                 "ldr q15, [%[inptr], #0x20]\n"
1353                                 "ldr q16, [%[inptr], #0x30]\n"
1354                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1355                                 "fadd v14.4s, v14.4s, v3.4s\n"
1356                                 "ldr q17, [%[inptr], #0x40]\n"
1357                                 "fmin v13.4s, v13.4s, v0.4s\n"
1358                                 "ldr q18, [%[inptr], #0x50]\n"
1359                                 "fadd v15.4s, v15.4s, v4.4s\n"
1360                                 "ldr q19, [%[inptr], #0x60]\n"
1361                                 "fadd v16.4s, v16.4s, v2.4s\n"
1362                                 "ldr q20, [%[inptr], #0x70]\n"
1363                                 "fmin v14.4s, v14.4s, v0.4s\n"
1364                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1365                                 "fmax v13.4s, v13.4s, v1.4s\n"
1366                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1367                                 "fmax v14.4s, v14.4s, v1.4s\n"
1368                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1369                                 "fmin v15.4s, v15.4s, v0.4s\n"
1370                                 "str q13, [%[outptr0]]\n"
1371                                 "fmin v16.4s, v16.4s, v0.4s\n"
                                     /* v13 is free again after the store above; reuse it for the last vector of row 2. */
1372                                 "ldr q13, [%[inptr], #0x80]\n"
1373                                 "fadd v17.4s, v17.4s, v3.4s\n"
1374                                 "fadd v18.4s, v18.4s, v4.4s\n"
1375                                 "str q14, [%[outptr0], #0x10]\n"
1376                                 "fmax v15.4s, v15.4s, v1.4s\n"
                                     /* From here v14, v15 and v16 are reloaded in turn with the three vectors of row 3. */
1377                                 "ldr q14, [%[inptr], #0x90]\n"
1378                                 "fmax v16.4s, v16.4s, v1.4s\n"
1379                                 "fmin v17.4s, v17.4s, v0.4s\n"
1380                                 "fmin v18.4s, v18.4s, v0.4s\n"
1381                                 "str q15, [%[outptr0], #0x20]\n"
1382                                 "fadd v19.4s, v19.4s, v2.4s\n"
1383                                 "ldr q15, [%[inptr], #0xa0]\n"
1384                                 "fadd v20.4s, v20.4s, v3.4s\n"
1385                                 "add %[outptr0], %[outptr0], #0x30\n"
1386                                 "fmax v17.4s, v17.4s, v1.4s\n"
1387                                 "str q16, [%[outptr1]]\n"
1388                                 "fmax v18.4s, v18.4s, v1.4s\n"
1389                                 "ldr q16, [%[inptr], #0xb0]\n"
1390                                 "fmin v19.4s, v19.4s, v0.4s\n"
                                     /* Last input load done: move inptr to the next block. */
1391                                 "add %[inptr], %[inptr], #0x180\n"
1392                                 "fmin v20.4s, v20.4s, v0.4s\n"
1393                                 "str q17, [%[outptr1], #0x10]\n"
1394                                 "fadd v13.4s, v13.4s, v4.4s\n"
1395                                 "fmax v19.4s, v19.4s, v1.4s\n"
1396                                 "fadd v14.4s, v14.4s, v2.4s\n"
1397                                 "str q18, [%[outptr1], #0x20]\n"
1398                                 "fmax v20.4s, v20.4s, v1.4s\n"
1399                                 "add %[outptr1], %[outptr1], #0x30\n"
1400                                 "fmin v13.4s, v13.4s, v0.4s\n"
1401                                 "str q19, [%[outptr2]]\n"
1402                                 "fmin v14.4s, v14.4s, v0.4s\n"
1403                                 "fadd v15.4s, v15.4s, v3.4s\n"
1404                                 "fadd v16.4s, v16.4s, v4.4s\n"
1405                                 "str q20, [%[outptr2], #0x10]\n"
1406                                 "fmax v13.4s, v13.4s, v1.4s\n"
1407                                 "fmax v14.4s, v14.4s, v1.4s\n"
1408                                 "fmin v15.4s, v15.4s, v0.4s\n"
1409                                 "fmin v16.4s, v16.4s, v0.4s\n"
1410                                 "str q13, [%[outptr2], #0x20]\n"
1411                                 "add %[outptr2], %[outptr2], #0x30\n"
1412                                 "fmax v15.4s, v15.4s, v1.4s\n"
1413                                 "str q14, [%[outptr3]]\n"
1414                                 "fmax v16.4s, v16.4s, v1.4s\n"
1415                                 "str q15, [%[outptr3], #0x10]\n"
1416                                 "str q16, [%[outptr3], #0x20]\n"
1417                                 "add %[outptr3], %[outptr3], #0x30\n"
1418                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1419                               [inptr] "+r" (inptr)
1420                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1421                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1422                             );
1423                         }
1424                     }
1425                     break;
1427 
1428                 case 5:
1429                     {
1430                         if ((i+11) >= xmax)
1431                         {
1432                             for (int xi=0; xi<11; xi++)
1433                             {
1434                                 if ((i+xi) < xmax)
1435                                 {
1436                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1437                                     outptr0++;
1438                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1439                                     outptr1++;
1440                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1441                                     outptr2++;
1442                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1443                                     outptr3++;
1444                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1445                                     outptr4++;
1446                                 }
1447                             }
1448                             inptr += 96;
1449                         } else {
1450                             /* Optimized routine to copy an entire block */
1451                             __asm __volatile (
1452                                 "dup v0.4s, %[maxval].s[0]\n"
1453                                 "ldr q2, [%[biasptr]]\n"
1454                                 "dup v1.4s, %[minval].s[0]\n"
1455                                 "ldr q3, [%[biasptr], #0x10]\n"
1456                                 "ldr q4, [%[biasptr], #0x20]\n"
1457                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1458                                 "ldr q13, [%[inptr]]\n"
1459                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1460                                 "ldr q14, [%[inptr], #0x10]\n"
1461                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1462                                 "fadd v13.4s, v13.4s, v2.4s\n"
1463                                 "ldr q15, [%[inptr], #0x20]\n"
1464                                 "ldr q16, [%[inptr], #0x30]\n"
1465                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1466                                 "fadd v14.4s, v14.4s, v3.4s\n"
1467                                 "ldr q17, [%[inptr], #0x40]\n"
1468                                 "fmin v13.4s, v13.4s, v0.4s\n"
1469                                 "ldr q18, [%[inptr], #0x50]\n"
1470                                 "fadd v15.4s, v15.4s, v4.4s\n"
1471                                 "ldr q19, [%[inptr], #0x60]\n"
1472                                 "fadd v16.4s, v16.4s, v2.4s\n"
1473                                 "ldr q20, [%[inptr], #0x70]\n"
1474                                 "fmin v14.4s, v14.4s, v0.4s\n"
1475                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1476                                 "fmax v13.4s, v13.4s, v1.4s\n"
1477                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1478                                 "fmax v14.4s, v14.4s, v1.4s\n"
1479                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1480                                 "fmin v15.4s, v15.4s, v0.4s\n"
1481                                 "str q13, [%[outptr0]]\n"
1482                                 "fmin v16.4s, v16.4s, v0.4s\n"
1483                                 "ldr q13, [%[inptr], #0x80]\n"
1484                                 "fadd v17.4s, v17.4s, v3.4s\n"
1485                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1486                                 "fmax v15.4s, v15.4s, v1.4s\n"
1487                                 "str q14, [%[outptr0], #0x10]\n"
1488                                 "fmax v16.4s, v16.4s, v1.4s\n"
1489                                 "ldr q14, [%[inptr], #0x90]\n"
1490                                 "fmin v17.4s, v17.4s, v0.4s\n"
1491                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1492                                 "fadd v18.4s, v18.4s, v4.4s\n"
1493                                 "str q15, [%[outptr0], #0x20]\n"
1494                                 "fadd v19.4s, v19.4s, v2.4s\n"
1495                                 "ldr q15, [%[inptr], #0xa0]\n"
1496                                 "fmax v17.4s, v17.4s, v1.4s\n"
1497                                 "add %[outptr0], %[outptr0], #0x30\n"
1498                                 "fmin v18.4s, v18.4s, v0.4s\n"
1499                                 "str q16, [%[outptr1]]\n"
1500                                 "fmin v19.4s, v19.4s, v0.4s\n"
1501                                 "ldr q16, [%[inptr], #0xb0]\n"
1502                                 "fadd v20.4s, v20.4s, v3.4s\n"
1503                                 "fadd v13.4s, v13.4s, v4.4s\n"
1504                                 "str q17, [%[outptr1], #0x10]\n"
1505                                 "fmax v18.4s, v18.4s, v1.4s\n"
1506                                 "ldr q17, [%[inptr], #0xc0]\n"
1507                                 "fmax v19.4s, v19.4s, v1.4s\n"
1508                                 "fmin v20.4s, v20.4s, v0.4s\n"
1509                                 "fmin v13.4s, v13.4s, v0.4s\n"
1510                                 "str q18, [%[outptr1], #0x20]\n"
1511                                 "fadd v14.4s, v14.4s, v2.4s\n"
1512                                 "ldr q18, [%[inptr], #0xd0]\n"
1513                                 "fadd v15.4s, v15.4s, v3.4s\n"
1514                                 "add %[outptr1], %[outptr1], #0x30\n"
1515                                 "fmax v20.4s, v20.4s, v1.4s\n"
1516                                 "str q19, [%[outptr2]]\n"
1517                                 "fmax v13.4s, v13.4s, v1.4s\n"
1518                                 "ldr q19, [%[inptr], #0xe0]\n"
1519                                 "fmin v14.4s, v14.4s, v0.4s\n"
1520                                 "add %[inptr], %[inptr], #0x180\n"
1521                                 "fmin v15.4s, v15.4s, v0.4s\n"
1522                                 "str q20, [%[outptr2], #0x10]\n"
1523                                 "fadd v16.4s, v16.4s, v4.4s\n"
1524                                 "fmax v14.4s, v14.4s, v1.4s\n"
1525                                 "fadd v17.4s, v17.4s, v2.4s\n"
1526                                 "str q13, [%[outptr2], #0x20]\n"
1527                                 "fmax v15.4s, v15.4s, v1.4s\n"
1528                                 "add %[outptr2], %[outptr2], #0x30\n"
1529                                 "fmin v16.4s, v16.4s, v0.4s\n"
1530                                 "str q14, [%[outptr3]]\n"
1531                                 "fmin v17.4s, v17.4s, v0.4s\n"
1532                                 "fadd v18.4s, v18.4s, v3.4s\n"
1533                                 "fadd v19.4s, v19.4s, v4.4s\n"
1534                                 "str q15, [%[outptr3], #0x10]\n"
1535                                 "fmax v16.4s, v16.4s, v1.4s\n"
1536                                 "fmax v17.4s, v17.4s, v1.4s\n"
1537                                 "fmin v18.4s, v18.4s, v0.4s\n"
1538                                 "fmin v19.4s, v19.4s, v0.4s\n"
1539                                 "str q16, [%[outptr3], #0x20]\n"
1540                                 "add %[outptr3], %[outptr3], #0x30\n"
1541                                 "fmax v18.4s, v18.4s, v1.4s\n"
1542                                 "str q17, [%[outptr4]]\n"
1543                                 "fmax v19.4s, v19.4s, v1.4s\n"
1544                                 "str q18, [%[outptr4], #0x10]\n"
1545                                 "str q19, [%[outptr4], #0x20]\n"
1546                                 "add %[outptr4], %[outptr4], #0x30\n"
1547                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1548                               [inptr] "+r" (inptr)
1549                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1550                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1551                             );
1552                         }
1553                     }
1554                     break;
1555 
1556                 case 6:
1557                     {
                             /*
                              * Six-row variant: results are written through outptr0..outptr5 only;
                              * outptr6 and outptr7 are not written in this case. Every stored value
                              * is an input element plus its bias, limited to [minval, maxval].
                              * NOTE(review): the switch expression and the definitions of i, biasptr
                              * and outptrN are outside this chunk - presumably the case label is the
                              * number of valid output rows and i is the current column; confirm.
                              */
1558                         if ((i+11) >= xmax)
1559                         {
                                 /*
                                  * Partial-width path, one element at a time. The branch condition
                                  * already rules out (i+11) < xmax, so xi only needs to run to 10;
                                  * the inner test skips every xi for which i+xi is not below xmax.
                                  * Row r of the input block starts at inptr[12 * r], and the same
                                  * biasptr[xi] is applied to column xi of every row.
                                  */
1560                             for (int xi=0; xi<11; xi++)
1561                             {
1562                                 if ((i+xi) < xmax)
1563                                 {
1564                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1565                                     outptr0++;
1566                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1567                                     outptr1++;
1568                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1569                                     outptr2++;
1570                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1571                                     outptr3++;
1572                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1573                                     outptr4++;
1574                                     *outptr5 = std::min(std::max(minval, inptr[xi + 60] + biasptr[xi]), maxval);
1575                                     outptr5++;
1576                                 }
1577                             }
                                 /*
                                  * Step over 96 floats (8 rows of 12, matching the <12, 8> template
                                  * arguments) even though only six rows were read; this is the same
                                  * distance as the 0x180-byte advance in the assembly path below.
                                  */
1578                             inptr += 96;
1579                         } else {
1580                             /* Optimized routine: full 12-column block, bias add and clamp, then store */
                                 /*
                                  * Register roles in the template below:
                                  *   v0      maxval broadcast to all four lanes
                                  *   v1      minval broadcast to all four lanes
                                  *   q2-q4   biasptr[0..11], reused in that order for every row
                                  *   v13-v20 rotating data registers: load, fadd bias, fmin with v0,
                                  *           fmax with v1, store
                                  * Each row is three q registers (12 floats, 0x30 bytes). Rows 0-5 are
                                  * read from inptr offsets 0x00-0x110; each outptrN is advanced by 0x30
                                  * after its row is stored and inptr is advanced by 0x180 once its last
                                  * load has been issued. The prfm instructions are prefetch hints for
                                  * input beyond inptr + 0x180 (PLDL1KEEP) and for each output row 0x60
                                  * bytes ahead (PSTL1KEEP).
                                  */
1581                             __asm __volatile (
1582                                 "dup v0.4s, %[maxval].s[0]\n"
1583                                 "ldr q2, [%[biasptr]]\n"
1584                                 "dup v1.4s, %[minval].s[0]\n"
1585                                 "ldr q3, [%[biasptr], #0x10]\n"
1586                                 "ldr q4, [%[biasptr], #0x20]\n"
1587                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1588                                 "ldr q13, [%[inptr]]\n"
1589                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1590                                 "ldr q14, [%[inptr], #0x10]\n"
1591                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1592                                 "fadd v13.4s, v13.4s, v2.4s\n"
1593                                 "ldr q15, [%[inptr], #0x20]\n"
1594                                 "ldr q16, [%[inptr], #0x30]\n"
1595                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1596                                 "fadd v14.4s, v14.4s, v3.4s\n"
1597                                 "ldr q17, [%[inptr], #0x40]\n"
1598                                 "fmin v13.4s, v13.4s, v0.4s\n"
1599                                 "ldr q18, [%[inptr], #0x50]\n"
1600                                 "fadd v15.4s, v15.4s, v4.4s\n"
1601                                 "ldr q19, [%[inptr], #0x60]\n"
1602                                 "fadd v16.4s, v16.4s, v2.4s\n"
1603                                 "ldr q20, [%[inptr], #0x70]\n"
1604                                 "fmin v14.4s, v14.4s, v0.4s\n"
1605                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1606                                 "fmax v13.4s, v13.4s, v1.4s\n"
1607                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1608                                 "fmax v14.4s, v14.4s, v1.4s\n"
1609                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1610                                 "fmin v15.4s, v15.4s, v0.4s\n"
1611                                 "str q13, [%[outptr0]]\n"
1612                                 "fmin v16.4s, v16.4s, v0.4s\n"
1613                                 "ldr q13, [%[inptr], #0x80]\n"
1614                                 "fadd v17.4s, v17.4s, v3.4s\n"
1615                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1616                                 "fmax v15.4s, v15.4s, v1.4s\n"
1617                                 "str q14, [%[outptr0], #0x10]\n"
1618                                 "fmax v16.4s, v16.4s, v1.4s\n"
1619                                 "ldr q14, [%[inptr], #0x90]\n"
1620                                 "fmin v17.4s, v17.4s, v0.4s\n"
1621                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1622                                 "fadd v18.4s, v18.4s, v4.4s\n"
1623                                 "str q15, [%[outptr0], #0x20]\n"
1624                                 "fadd v19.4s, v19.4s, v2.4s\n"
1625                                 "ldr q15, [%[inptr], #0xa0]\n"
1626                                 "fmax v17.4s, v17.4s, v1.4s\n"
1627                                 "add %[outptr0], %[outptr0], #0x30\n"
1628                                 "fmin v18.4s, v18.4s, v0.4s\n"
1629                                 "str q16, [%[outptr1]]\n"
1630                                 "fmin v19.4s, v19.4s, v0.4s\n"
1631                                 "ldr q16, [%[inptr], #0xb0]\n"
1632                                 "fadd v20.4s, v20.4s, v3.4s\n"
1633                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1634                                 "fmax v18.4s, v18.4s, v1.4s\n"
1635                                 "str q17, [%[outptr1], #0x10]\n"
1636                                 "fmax v19.4s, v19.4s, v1.4s\n"
1637                                 "ldr q17, [%[inptr], #0xc0]\n"
1638                                 "fmin v20.4s, v20.4s, v0.4s\n"
1639                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1640                                 "fadd v13.4s, v13.4s, v4.4s\n"
1641                                 "str q18, [%[outptr1], #0x20]\n"
1642                                 "fadd v14.4s, v14.4s, v2.4s\n"
1643                                 "ldr q18, [%[inptr], #0xd0]\n"
1644                                 "fmax v20.4s, v20.4s, v1.4s\n"
1645                                 "add %[outptr1], %[outptr1], #0x30\n"
1646                                 "fmin v13.4s, v13.4s, v0.4s\n"
1647                                 "str q19, [%[outptr2]]\n"
1648                                 "fmin v14.4s, v14.4s, v0.4s\n"
1649                                 "ldr q19, [%[inptr], #0xe0]\n"
1650                                 "fadd v15.4s, v15.4s, v3.4s\n"
1651                                 "fadd v16.4s, v16.4s, v4.4s\n"
1652                                 "str q20, [%[outptr2], #0x10]\n"
1653                                 "fmax v13.4s, v13.4s, v1.4s\n"
1654                                 "ldr q20, [%[inptr], #0xf0]\n"
1655                                 "fmax v14.4s, v14.4s, v1.4s\n"
1656                                 "fmin v15.4s, v15.4s, v0.4s\n"
1657                                 "fmin v16.4s, v16.4s, v0.4s\n"
1658                                 "str q13, [%[outptr2], #0x20]\n"
1659                                 "fadd v17.4s, v17.4s, v2.4s\n"
1660                                 "ldr q13, [%[inptr], #0x100]\n"
1661                                 "fadd v18.4s, v18.4s, v3.4s\n"
1662                                 "add %[outptr2], %[outptr2], #0x30\n"
1663                                 "fmax v15.4s, v15.4s, v1.4s\n"
1664                                 "str q14, [%[outptr3]]\n"
1665                                 "fmax v16.4s, v16.4s, v1.4s\n"
1666                                 "ldr q14, [%[inptr], #0x110]\n"
1667                                 "fmin v17.4s, v17.4s, v0.4s\n"
1668                                 "add %[inptr], %[inptr], #0x180\n"
1669                                 "fmin v18.4s, v18.4s, v0.4s\n"
1670                                 "str q15, [%[outptr3], #0x10]\n"
1671                                 "fadd v19.4s, v19.4s, v4.4s\n"
1672                                 "fmax v17.4s, v17.4s, v1.4s\n"
1673                                 "fadd v20.4s, v20.4s, v2.4s\n"
1674                                 "str q16, [%[outptr3], #0x20]\n"
1675                                 "fmax v18.4s, v18.4s, v1.4s\n"
1676                                 "add %[outptr3], %[outptr3], #0x30\n"
1677                                 "fmin v19.4s, v19.4s, v0.4s\n"
1678                                 "str q17, [%[outptr4]]\n"
1679                                 "fmin v20.4s, v20.4s, v0.4s\n"
1680                                 "fadd v13.4s, v13.4s, v3.4s\n"
1681                                 "fadd v14.4s, v14.4s, v4.4s\n"
1682                                 "str q18, [%[outptr4], #0x10]\n"
1683                                 "fmax v19.4s, v19.4s, v1.4s\n"
1684                                 "fmax v20.4s, v20.4s, v1.4s\n"
1685                                 "fmin v13.4s, v13.4s, v0.4s\n"
1686                                 "fmin v14.4s, v14.4s, v0.4s\n"
1687                                 "str q19, [%[outptr4], #0x20]\n"
1688                                 "add %[outptr4], %[outptr4], #0x30\n"
1689                                 "fmax v13.4s, v13.4s, v1.4s\n"
1690                                 "str q20, [%[outptr5]]\n"
1691                                 "fmax v14.4s, v14.4s, v1.4s\n"
1692                                 "str q13, [%[outptr5], #0x10]\n"
1693                                 "str q14, [%[outptr5], #0x20]\n"
1694                                 "add %[outptr5], %[outptr5], #0x30\n"
                                 /*
                                  * All eight output pointers are passed as read-write operands, but the
                                  * template above only references outptr0..outptr5. Likewise v5-v12 appear
                                  * in the clobber list without being used by this template.
                                  */
1695                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1696                               [inptr] "+r" (inptr)
1697                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1698                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1699                             );
1700                         }
1701                     }
1702                     break;
1703 
1704                 case 7:
1705                     {
                             /*
                              * Seven-row variant: results are written through outptr0..outptr6 only;
                              * outptr7 is not written in this case. Every stored value is an input
                              * element plus its bias, limited to [minval, maxval].
                              * NOTE(review): the switch expression and the definitions of i, biasptr
                              * and outptrN are outside this chunk - presumably the case label is the
                              * number of valid output rows and i is the current column; confirm.
                              */
1706                         if ((i+11) >= xmax)
1707                         {
                                 /*
                                  * Partial-width path, one element at a time. The branch condition
                                  * already rules out (i+11) < xmax, so xi only needs to run to 10;
                                  * the inner test skips every xi for which i+xi is not below xmax.
                                  * Row r of the input block starts at inptr[12 * r], and the same
                                  * biasptr[xi] is applied to column xi of every row.
                                  */
1708                             for (int xi=0; xi<11; xi++)
1709                             {
1710                                 if ((i+xi) < xmax)
1711                                 {
1712                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1713                                     outptr0++;
1714                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1715                                     outptr1++;
1716                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1717                                     outptr2++;
1718                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1719                                     outptr3++;
1720                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1721                                     outptr4++;
1722                                     *outptr5 = std::min(std::max(minval, inptr[xi + 60] + biasptr[xi]), maxval);
1723                                     outptr5++;
1724                                     *outptr6 = std::min(std::max(minval, inptr[xi + 72] + biasptr[xi]), maxval);
1725                                     outptr6++;
1726                                 }
1727                             }
                                 /*
                                  * Step over 96 floats (8 rows of 12, matching the <12, 8> template
                                  * arguments) even though only seven rows were read; this is the same
                                  * distance as the 0x180-byte advance in the assembly path below.
                                  */
1728                             inptr += 96;
1729                         } else {
1730                             /* Optimized routine: full 12-column block, bias add and clamp, then store */
                                 /*
                                  * Register roles in the template below:
                                  *   v0      maxval broadcast to all four lanes
                                  *   v1      minval broadcast to all four lanes
                                  *   q2-q4   biasptr[0..11], reused in that order for every row
                                  *   v13-v20 rotating data registers: load, fadd bias, fmin with v0,
                                  *           fmax with v1, store
                                  * Each row is three q registers (12 floats, 0x30 bytes). Rows 0-6 are
                                  * read from inptr offsets 0x00-0x140; each outptrN is advanced by 0x30
                                  * after its row is stored and inptr is advanced by 0x180 once its last
                                  * load has been issued. The prfm instructions are prefetch hints for
                                  * input beyond inptr + 0x180 (PLDL1KEEP) and for each output row 0x60
                                  * bytes ahead (PSTL1KEEP).
                                  */
1731                             __asm __volatile (
1732                                 "dup v0.4s, %[maxval].s[0]\n"
1733                                 "ldr q2, [%[biasptr]]\n"
1734                                 "dup v1.4s, %[minval].s[0]\n"
1735                                 "ldr q3, [%[biasptr], #0x10]\n"
1736                                 "ldr q4, [%[biasptr], #0x20]\n"
1737                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1738                                 "ldr q13, [%[inptr]]\n"
1739                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1740                                 "ldr q14, [%[inptr], #0x10]\n"
1741                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1742                                 "fadd v13.4s, v13.4s, v2.4s\n"
1743                                 "ldr q15, [%[inptr], #0x20]\n"
1744                                 "ldr q16, [%[inptr], #0x30]\n"
1745                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1746                                 "fadd v14.4s, v14.4s, v3.4s\n"
1747                                 "ldr q17, [%[inptr], #0x40]\n"
1748                                 "fmin v13.4s, v13.4s, v0.4s\n"
1749                                 "ldr q18, [%[inptr], #0x50]\n"
1750                                 "fadd v15.4s, v15.4s, v4.4s\n"
1751                                 "ldr q19, [%[inptr], #0x60]\n"
1752                                 "fadd v16.4s, v16.4s, v2.4s\n"
1753                                 "ldr q20, [%[inptr], #0x70]\n"
1754                                 "fmin v14.4s, v14.4s, v0.4s\n"
1755                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1756                                 "fmax v13.4s, v13.4s, v1.4s\n"
1757                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1758                                 "fmax v14.4s, v14.4s, v1.4s\n"
1759                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1760                                 "fmin v15.4s, v15.4s, v0.4s\n"
1761                                 "str q13, [%[outptr0]]\n"
1762                                 "fmin v16.4s, v16.4s, v0.4s\n"
1763                                 "ldr q13, [%[inptr], #0x80]\n"
1764                                 "fadd v17.4s, v17.4s, v3.4s\n"
1765                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1766                                 "fmax v15.4s, v15.4s, v1.4s\n"
1767                                 "str q14, [%[outptr0], #0x10]\n"
1768                                 "fmax v16.4s, v16.4s, v1.4s\n"
1769                                 "ldr q14, [%[inptr], #0x90]\n"
1770                                 "fmin v17.4s, v17.4s, v0.4s\n"
1771                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1772                                 "fadd v18.4s, v18.4s, v4.4s\n"
1773                                 "str q15, [%[outptr0], #0x20]\n"
1774                                 "fadd v19.4s, v19.4s, v2.4s\n"
1775                                 "ldr q15, [%[inptr], #0xa0]\n"
1776                                 "fmax v17.4s, v17.4s, v1.4s\n"
1777                                 "add %[outptr0], %[outptr0], #0x30\n"
1778                                 "fmin v18.4s, v18.4s, v0.4s\n"
1779                                 "str q16, [%[outptr1]]\n"
1780                                 "fmin v19.4s, v19.4s, v0.4s\n"
1781                                 "ldr q16, [%[inptr], #0xb0]\n"
1782                                 "fadd v20.4s, v20.4s, v3.4s\n"
1783                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1784                                 "fmax v18.4s, v18.4s, v1.4s\n"
1785                                 "str q17, [%[outptr1], #0x10]\n"
1786                                 "fmax v19.4s, v19.4s, v1.4s\n"
1787                                 "ldr q17, [%[inptr], #0xc0]\n"
1788                                 "fmin v20.4s, v20.4s, v0.4s\n"
1789                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1790                                 "fadd v13.4s, v13.4s, v4.4s\n"
1791                                 "str q18, [%[outptr1], #0x20]\n"
1792                                 "fadd v14.4s, v14.4s, v2.4s\n"
1793                                 "ldr q18, [%[inptr], #0xd0]\n"
1794                                 "fmax v20.4s, v20.4s, v1.4s\n"
1795                                 "add %[outptr1], %[outptr1], #0x30\n"
1796                                 "fmin v13.4s, v13.4s, v0.4s\n"
1797                                 "str q19, [%[outptr2]]\n"
1798                                 "fmin v14.4s, v14.4s, v0.4s\n"
1799                                 "ldr q19, [%[inptr], #0xe0]\n"
1800                                 "fadd v15.4s, v15.4s, v3.4s\n"
1801                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1802                                 "fmax v13.4s, v13.4s, v1.4s\n"
1803                                 "str q20, [%[outptr2], #0x10]\n"
1804                                 "fmax v14.4s, v14.4s, v1.4s\n"
1805                                 "ldr q20, [%[inptr], #0xf0]\n"
1806                                 "fmin v15.4s, v15.4s, v0.4s\n"
1807                                 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1808                                 "fadd v16.4s, v16.4s, v4.4s\n"
1809                                 "str q13, [%[outptr2], #0x20]\n"
1810                                 "fadd v17.4s, v17.4s, v2.4s\n"
1811                                 "ldr q13, [%[inptr], #0x100]\n"
1812                                 "fmax v15.4s, v15.4s, v1.4s\n"
1813                                 "add %[outptr2], %[outptr2], #0x30\n"
1814                                 "fmin v16.4s, v16.4s, v0.4s\n"
1815                                 "str q14, [%[outptr3]]\n"
1816                                 "fmin v17.4s, v17.4s, v0.4s\n"
1817                                 "ldr q14, [%[inptr], #0x110]\n"
1818                                 "fadd v18.4s, v18.4s, v3.4s\n"
1819                                 "fadd v19.4s, v19.4s, v4.4s\n"
1820                                 "str q15, [%[outptr3], #0x10]\n"
1821                                 "fmax v16.4s, v16.4s, v1.4s\n"
1822                                 "ldr q15, [%[inptr], #0x120]\n"
1823                                 "fmax v17.4s, v17.4s, v1.4s\n"
1824                                 "fmin v18.4s, v18.4s, v0.4s\n"
1825                                 "fmin v19.4s, v19.4s, v0.4s\n"
1826                                 "str q16, [%[outptr3], #0x20]\n"
1827                                 "fadd v20.4s, v20.4s, v2.4s\n"
1828                                 "ldr q16, [%[inptr], #0x130]\n"
1829                                 "fadd v13.4s, v13.4s, v3.4s\n"
1830                                 "add %[outptr3], %[outptr3], #0x30\n"
1831                                 "fmax v18.4s, v18.4s, v1.4s\n"
1832                                 "str q17, [%[outptr4]]\n"
1833                                 "fmax v19.4s, v19.4s, v1.4s\n"
1834                                 "ldr q17, [%[inptr], #0x140]\n"
1835                                 "fmin v20.4s, v20.4s, v0.4s\n"
1836                                 "add %[inptr], %[inptr], #0x180\n"
1837                                 "fmin v13.4s, v13.4s, v0.4s\n"
1838                                 "str q18, [%[outptr4], #0x10]\n"
1839                                 "fadd v14.4s, v14.4s, v4.4s\n"
1840                                 "fmax v20.4s, v20.4s, v1.4s\n"
1841                                 "fadd v15.4s, v15.4s, v2.4s\n"
1842                                 "str q19, [%[outptr4], #0x20]\n"
1843                                 "fmax v13.4s, v13.4s, v1.4s\n"
1844                                 "add %[outptr4], %[outptr4], #0x30\n"
1845                                 "fmin v14.4s, v14.4s, v0.4s\n"
1846                                 "str q20, [%[outptr5]]\n"
1847                                 "fmin v15.4s, v15.4s, v0.4s\n"
1848                                 "fadd v16.4s, v16.4s, v3.4s\n"
1849                                 "fadd v17.4s, v17.4s, v4.4s\n"
1850                                 "str q13, [%[outptr5], #0x10]\n"
1851                                 "fmax v14.4s, v14.4s, v1.4s\n"
1852                                 "fmax v15.4s, v15.4s, v1.4s\n"
1853                                 "fmin v16.4s, v16.4s, v0.4s\n"
1854                                 "fmin v17.4s, v17.4s, v0.4s\n"
1855                                 "str q14, [%[outptr5], #0x20]\n"
1856                                 "add %[outptr5], %[outptr5], #0x30\n"
1857                                 "fmax v16.4s, v16.4s, v1.4s\n"
1858                                 "str q15, [%[outptr6]]\n"
1859                                 "fmax v17.4s, v17.4s, v1.4s\n"
1860                                 "str q16, [%[outptr6], #0x10]\n"
1861                                 "str q17, [%[outptr6], #0x20]\n"
1862                                 "add %[outptr6], %[outptr6], #0x30\n"
                                 /*
                                  * All eight output pointers are passed as read-write operands, but the
                                  * template above only references outptr0..outptr6. Likewise v5-v12 appear
                                  * in the clobber list without being used by this template.
                                  */
1863                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1864                               [inptr] "+r" (inptr)
1865                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1866                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1867                             );
1868                         }
1869                     }
1870                     break;
1871 
1872                 default:
1873                 case 8:
1874                     {
1875                         if ((i+11) >= xmax)
1876                         {
1877                             for (int xi=0; xi<11; xi++)
1878                             {
1879                                 if ((i+xi) < xmax)
1880                                 {
1881                                     *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1882                                     outptr0++;
1883                                     *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1884                                     outptr1++;
1885                                     *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1886                                     outptr2++;
1887                                     *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1888                                     outptr3++;
1889                                     *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1890                                     outptr4++;
1891                                     *outptr5 = std::min(std::max(minval, inptr[xi + 60] + biasptr[xi]), maxval);
1892                                     outptr5++;
1893                                     *outptr6 = std::min(std::max(minval, inptr[xi + 72] + biasptr[xi]), maxval);
1894                                     outptr6++;
1895                                     *outptr7 = std::min(std::max(minval, inptr[xi + 84] + biasptr[xi]), maxval);
1896                                     outptr7++;
1897                                 }
1898                             }
1899                             inptr += 96;
1900                         } else {
1901                             /* Optimized routine to copy an entire block */
1902                             __asm __volatile (
1903                                 "dup v0.4s, %[maxval].s[0]\n"
1904                                 "ldr q2, [%[biasptr]]\n"
1905                                 "dup v1.4s, %[minval].s[0]\n"
1906                                 "ldr q3, [%[biasptr], #0x10]\n"
1907                                 "ldr q4, [%[biasptr], #0x20]\n"
1908                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1909                                 "ldr q13, [%[inptr]]\n"
1910                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1911                                 "ldr q14, [%[inptr], #0x10]\n"
1912                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1913                                 "fadd v13.4s, v13.4s, v2.4s\n"
1914                                 "ldr q15, [%[inptr], #0x20]\n"
1915                                 "ldr q16, [%[inptr], #0x30]\n"
1916                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1917                                 "fadd v14.4s, v14.4s, v3.4s\n"
1918                                 "ldr q17, [%[inptr], #0x40]\n"
1919                                 "fmin v13.4s, v13.4s, v0.4s\n"
1920                                 "ldr q18, [%[inptr], #0x50]\n"
1921                                 "fadd v15.4s, v15.4s, v4.4s\n"
1922                                 "ldr q19, [%[inptr], #0x60]\n"
1923                                 "fadd v16.4s, v16.4s, v2.4s\n"
1924                                 "ldr q20, [%[inptr], #0x70]\n"
1925                                 "fmin v14.4s, v14.4s, v0.4s\n"
1926                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1927                                 "fmax v13.4s, v13.4s, v1.4s\n"
1928                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1929                                 "fmax v14.4s, v14.4s, v1.4s\n"
1930                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1931                                 "fmin v15.4s, v15.4s, v0.4s\n"
1932                                 "str q13, [%[outptr0]]\n"
1933                                 "fmin v16.4s, v16.4s, v0.4s\n"
1934                                 "ldr q13, [%[inptr], #0x80]\n"
1935                                 "fadd v17.4s, v17.4s, v3.4s\n"
1936                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1937                                 "fmax v15.4s, v15.4s, v1.4s\n"
1938                                 "str q14, [%[outptr0], #0x10]\n"
1939                                 "fmax v16.4s, v16.4s, v1.4s\n"
1940                                 "ldr q14, [%[inptr], #0x90]\n"
1941                                 "fmin v17.4s, v17.4s, v0.4s\n"
1942                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1943                                 "fadd v18.4s, v18.4s, v4.4s\n"
1944                                 "str q15, [%[outptr0], #0x20]\n"
1945                                 "fadd v19.4s, v19.4s, v2.4s\n"
1946                                 "ldr q15, [%[inptr], #0xa0]\n"
1947                                 "fmax v17.4s, v17.4s, v1.4s\n"
1948                                 "add %[outptr0], %[outptr0], #0x30\n"
1949                                 "fmin v18.4s, v18.4s, v0.4s\n"
1950                                 "str q16, [%[outptr1]]\n"
1951                                 "fmin v19.4s, v19.4s, v0.4s\n"
1952                                 "ldr q16, [%[inptr], #0xb0]\n"
1953                                 "fadd v20.4s, v20.4s, v3.4s\n"
1954                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1955                                 "fmax v18.4s, v18.4s, v1.4s\n"
1956                                 "str q17, [%[outptr1], #0x10]\n"
1957                                 "fmax v19.4s, v19.4s, v1.4s\n"
1958                                 "ldr q17, [%[inptr], #0xc0]\n"
1959                                 "fmin v20.4s, v20.4s, v0.4s\n"
1960                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1961                                 "fadd v13.4s, v13.4s, v4.4s\n"
1962                                 "str q18, [%[outptr1], #0x20]\n"
1963                                 "fadd v14.4s, v14.4s, v2.4s\n"
1964                                 "ldr q18, [%[inptr], #0xd0]\n"
1965                                 "fmax v20.4s, v20.4s, v1.4s\n"
1966                                 "add %[outptr1], %[outptr1], #0x30\n"
1967                                 "fmin v13.4s, v13.4s, v0.4s\n"
1968                                 "str q19, [%[outptr2]]\n"
1969                                 "fmin v14.4s, v14.4s, v0.4s\n"
1970                                 "ldr q19, [%[inptr], #0xe0]\n"
1971                                 "fadd v15.4s, v15.4s, v3.4s\n"
1972                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1973                                 "fmax v13.4s, v13.4s, v1.4s\n"
1974                                 "str q20, [%[outptr2], #0x10]\n"
1975                                 "fmax v14.4s, v14.4s, v1.4s\n"
1976                                 "ldr q20, [%[inptr], #0xf0]\n"
1977                                 "fmin v15.4s, v15.4s, v0.4s\n"
1978                                 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1979                                 "fadd v16.4s, v16.4s, v4.4s\n"
1980                                 "str q13, [%[outptr2], #0x20]\n"
1981                                 "fadd v17.4s, v17.4s, v2.4s\n"
1982                                 "ldr q13, [%[inptr], #0x100]\n"
1983                                 "fmax v15.4s, v15.4s, v1.4s\n"
1984                                 "add %[outptr2], %[outptr2], #0x30\n"
1985                                 "fmin v16.4s, v16.4s, v0.4s\n"
1986                                 "str q14, [%[outptr3]]\n"
1987                                 "fmin v17.4s, v17.4s, v0.4s\n"
1988                                 "ldr q14, [%[inptr], #0x110]\n"
1989                                 "fadd v18.4s, v18.4s, v3.4s\n"
1990                                 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1991                                 "fmax v16.4s, v16.4s, v1.4s\n"
1992                                 "str q15, [%[outptr3], #0x10]\n"
1993                                 "fmax v17.4s, v17.4s, v1.4s\n"
1994                                 "ldr q15, [%[inptr], #0x120]\n"
1995                                 "fmin v18.4s, v18.4s, v0.4s\n"
1996                                 "fadd v19.4s, v19.4s, v4.4s\n"
1997                                 "str q16, [%[outptr3], #0x20]\n"
1998                                 "fadd v20.4s, v20.4s, v2.4s\n"
1999                                 "ldr q16, [%[inptr], #0x130]\n"
2000                                 "fadd v13.4s, v13.4s, v3.4s\n"
2001                                 "add %[outptr3], %[outptr3], #0x30\n"
2002                                 "fmax v18.4s, v18.4s, v1.4s\n"
2003                                 "str q17, [%[outptr4]]\n"
2004                                 "fmin v19.4s, v19.4s, v0.4s\n"
2005                                 "ldr q17, [%[inptr], #0x140]\n"
2006                                 "fmin v20.4s, v20.4s, v0.4s\n"
2007                                 "fmin v13.4s, v13.4s, v0.4s\n"
2008                                 "str q18, [%[outptr4], #0x10]\n"
2009                                 "fadd v14.4s, v14.4s, v4.4s\n"
2010                                 "ldr q18, [%[inptr], #0x150]\n"
2011                                 "fmax v19.4s, v19.4s, v1.4s\n"
2012                                 "fmax v20.4s, v20.4s, v1.4s\n"
2013                                 "fmax v13.4s, v13.4s, v1.4s\n"
2014                                 "fmin v14.4s, v14.4s, v0.4s\n"
2015                                 "str q19, [%[outptr4], #0x20]\n"
2016                                 "fadd v15.4s, v15.4s, v2.4s\n"
2017                                 "ldr q19, [%[inptr], #0x160]\n"
2018                                 "fadd v16.4s, v16.4s, v3.4s\n"
2019                                 "add %[outptr4], %[outptr4], #0x30\n"
2020                                 "fmax v14.4s, v14.4s, v1.4s\n"
2021                                 "str q20, [%[outptr5]]\n"
2022                                 "fmin v15.4s, v15.4s, v0.4s\n"
2023                                 "ldr q20, [%[inptr], #0x170]\n"
2024                                 "fmin v16.4s, v16.4s, v0.4s\n"
2025                                 "add %[inptr], %[inptr], #0x180\n"
2026                                 "fadd v17.4s, v17.4s, v4.4s\n"
2027                                 "str q13, [%[outptr5], #0x10]\n"
2028                                 "fmax v15.4s, v15.4s, v1.4s\n"
2029                                 "fmax v16.4s, v16.4s, v1.4s\n"
2030                                 "fadd v18.4s, v18.4s, v2.4s\n"
2031                                 "str q14, [%[outptr5], #0x20]\n"
2032                                 "fmin v17.4s, v17.4s, v0.4s\n"
2033                                 "add %[outptr5], %[outptr5], #0x30\n"
2034                                 "fadd v19.4s, v19.4s, v3.4s\n"
2035                                 "str q15, [%[outptr6]]\n"
2036                                 "fmin v18.4s, v18.4s, v0.4s\n"
2037                                 "fmax v17.4s, v17.4s, v1.4s\n"
2038                                 "fadd v20.4s, v20.4s, v4.4s\n"
2039                                 "str q16, [%[outptr6], #0x10]\n"
2040                                 "fmin v19.4s, v19.4s, v0.4s\n"
2041                                 "fmax v18.4s, v18.4s, v1.4s\n"
2042                                 "fmin v20.4s, v20.4s, v0.4s\n"
2043                                 "str q17, [%[outptr6], #0x20]\n"
2044                                 "fmax v19.4s, v19.4s, v1.4s\n"
2045                                 "add %[outptr6], %[outptr6], #0x30\n"
2046                                 "fmax v20.4s, v20.4s, v1.4s\n"
2047                                 "str q18, [%[outptr7]]\n"
2048                                 "str q19, [%[outptr7], #0x10]\n"
2049                                 "str q20, [%[outptr7], #0x20]\n"
2050                                 "add %[outptr7], %[outptr7], #0x30\n"
2051                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
2052                               [inptr] "+r" (inptr)
2053                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
2054                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
2055                             );
2056                         }
2057                     }
2058                     break;
2059 
2060 
2061                 }
2062             }
2063         }
2064     }
2065 }
2066 
2067 #endif // __aarch64__
2068