xref: /aosp_15_r20/external/ComputeLibrary/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #pragma once
25 
26 #if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
27 
28 template<>
MergeResults(__fp16 * out,const __fp16 * in,const int ldout,const int y0,const int ymax,const int x0,const int xmax,const __fp16 * bias,Activation act,bool append)29 void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const __fp16 *bias, Activation act, bool append)
30 {
31     const __fp16 *inptr = in;
32     __fp16 nullbias[24];
33     __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
34     __fp16 maxval =   static_cast<__fp16>(std::numeric_limits<float>::infinity());
35 
36     switch(act.type)
37     {
38         default:
39         case Activation::Type::None:
40             break;
41         case Activation::Type::BoundedReLU:
42             maxval = static_cast<__fp16>(act.param1);
43             /* fall through */
44         case Activation::Type::ReLU:
45             minval = 0.0f;
46             break;
47     }
48 
49     if (!append && !bias)
50     {
51         memset(nullbias, 0, (24 * sizeof(__fp16)));
52     }
53 
54     for (int y=y0; y<ymax; y+=8)
55     {
56         __fp16 *outptr0 = out + (y * ldout) + x0;
57         __fp16 *outptr1 = outptr0 + ldout;
58         __fp16 *outptr2 = outptr1 + ldout;
59         __fp16 *outptr3 = outptr2 + ldout;
60         __fp16 *outptr4 = outptr3 + ldout;
61         __fp16 *outptr5 = outptr4 + ldout;
62         __fp16 *outptr6 = outptr5 + ldout;
63         __fp16 *outptr7 = outptr6 + ldout;
64 
65         const int height = ymax - y;
66 
67         for (int i=x0; i<xmax; i+=24)
68         {
69             if (append)
70             {
71                 switch(height)
72                 {
73                 case 1:
74                     {
75                         if ((i+23) >= xmax)
76                         {
77                             for (int xi=0; xi<23; xi++)
78                             {
79                                 if ((i+xi) < xmax)
80                                 {
81                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
82                                     outptr0++;
83                                 }
84                             }
85                             inptr += 192;
86                         } else {
87                             /* Optimized routine to copy an entire block */
88                             __asm __volatile (
89 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
90                                 ".arch  armv8.2-a+fp16\n"
91 #endif
92                                 "dup v0.8h, %[maxval].h[0]\n"
93                                 "ldr q2, [%[outptr0]]\n"
94                                 "dup v1.8h, %[minval].h[0]\n"
95                                 "ldr q10, [%[inptr]]\n"
96                                 "ldr q3, [%[outptr0], #0x10]\n"
97                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
98                                 "ldr q11, [%[inptr], #0x10]\n"
99                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
100                                 "fadd v10.8h, v10.8h, v2.8h\n"
101                                 "ldr q4, [%[outptr0], #0x20]\n"
102                                 "ldr q12, [%[inptr], #0x20]\n"
103                                 "add %[inptr], %[inptr], #0x180\n"
104                                 "fadd v11.8h, v11.8h, v3.8h\n"
105                                 "fmin v10.8h, v10.8h, v0.8h\n"
106                                 "fadd v12.8h, v12.8h, v4.8h\n"
107                                 "fmin v11.8h, v11.8h, v0.8h\n"
108                                 "fmax v10.8h, v10.8h, v1.8h\n"
109                                 "fmin v12.8h, v12.8h, v0.8h\n"
110                                 "fmax v11.8h, v11.8h, v1.8h\n"
111                                 "str q10, [%[outptr0]]\n"
112                                 "fmax v12.8h, v12.8h, v1.8h\n"
113                                 "str q11, [%[outptr0], #0x10]\n"
114                                 "str q12, [%[outptr0], #0x20]\n"
115                                 "add %[outptr0], %[outptr0], #0x30\n"
116                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
117                               [inptr] "+r" (inptr)
118                             : [minval] "w" (minval), [maxval] "w" (maxval)
119                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
120                             );
121                         }
122                     }
123                     break;
124 
125                 case 2:
126                     {
127                         if ((i+23) >= xmax)
128                         {
129                             for (int xi=0; xi<23; xi++)
130                             {
131                                 if ((i+xi) < xmax)
132                                 {
133                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
134                                     outptr0++;
135                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
136                                     outptr1++;
137                                 }
138                             }
139                             inptr += 192;
140                         } else {
141                             /* Optimized routine to copy an entire block */
142                             __asm __volatile (
143 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
144                                 ".arch  armv8.2-a+fp16\n"
145 #endif
146                                 "dup v0.8h, %[maxval].h[0]\n"
147                                 "ldr q2, [%[outptr0]]\n"
148                                 "dup v1.8h, %[minval].h[0]\n"
149                                 "ldr q10, [%[inptr]]\n"
150                                 "ldr q3, [%[outptr0], #0x10]\n"
151                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
152                                 "ldr q11, [%[inptr], #0x10]\n"
153                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
154                                 "fadd v10.8h, v10.8h, v2.8h\n"
155                                 "ldr q4, [%[outptr0], #0x20]\n"
156                                 "ldr q12, [%[inptr], #0x20]\n"
157                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
158                                 "fadd v11.8h, v11.8h, v3.8h\n"
159                                 "ldr q5, [%[outptr1]]\n"
160                                 "fmin v10.8h, v10.8h, v0.8h\n"
161                                 "ldr q13, [%[inptr], #0x30]\n"
162                                 "fadd v12.8h, v12.8h, v4.8h\n"
163                                 "ldr q6, [%[outptr1], #0x10]\n"
164                                 "ldr q14, [%[inptr], #0x40]\n"
165                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
166                                 "fmax v10.8h, v10.8h, v1.8h\n"
167                                 "ldr q7, [%[outptr1], #0x20]\n"
168                                 "fmin v11.8h, v11.8h, v0.8h\n"
169                                 "ldr q15, [%[inptr], #0x50]\n"
170                                 "fmin v12.8h, v12.8h, v0.8h\n"
171                                 "add %[inptr], %[inptr], #0x180\n"
172                                 "fadd v13.8h, v13.8h, v5.8h\n"
173                                 "str q10, [%[outptr0]]\n"
174                                 "fmax v11.8h, v11.8h, v1.8h\n"
175                                 "fmax v12.8h, v12.8h, v1.8h\n"
176                                 "fadd v14.8h, v14.8h, v6.8h\n"
177                                 "fmin v13.8h, v13.8h, v0.8h\n"
178                                 "str q11, [%[outptr0], #0x10]\n"
179                                 "fadd v15.8h, v15.8h, v7.8h\n"
180                                 "fmin v14.8h, v14.8h, v0.8h\n"
181                                 "str q12, [%[outptr0], #0x20]\n"
182                                 "fmax v13.8h, v13.8h, v1.8h\n"
183                                 "add %[outptr0], %[outptr0], #0x30\n"
184                                 "fmin v15.8h, v15.8h, v0.8h\n"
185                                 "fmax v14.8h, v14.8h, v1.8h\n"
186                                 "str q13, [%[outptr1]]\n"
187                                 "fmax v15.8h, v15.8h, v1.8h\n"
188                                 "str q14, [%[outptr1], #0x10]\n"
189                                 "str q15, [%[outptr1], #0x20]\n"
190                                 "add %[outptr1], %[outptr1], #0x30\n"
191                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
192                               [inptr] "+r" (inptr)
193                             : [minval] "w" (minval), [maxval] "w" (maxval)
194                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
195                             );
196                         }
197                     }
198                     break;
199 
200                 case 3:
201                     {
202                         if ((i+23) >= xmax)
203                         {
204                             for (int xi=0; xi<23; xi++)
205                             {
206                                 if ((i+xi) < xmax)
207                                 {
208                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
209                                     outptr0++;
210                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
211                                     outptr1++;
212                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
213                                     outptr2++;
214                                 }
215                             }
216                             inptr += 192;
217                         } else {
218                             /* Optimized routine to copy an entire block */
219                             __asm __volatile (
220 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
221                                 ".arch  armv8.2-a+fp16\n"
222 #endif
223                                 "dup v0.8h, %[maxval].h[0]\n"
224                                 "ldr q2, [%[outptr0]]\n"
225                                 "dup v1.8h, %[minval].h[0]\n"
226                                 "ldr q10, [%[inptr]]\n"
227                                 "ldr q3, [%[outptr0], #0x10]\n"
228                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
229                                 "ldr q11, [%[inptr], #0x10]\n"
230                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
231                                 "fadd v10.8h, v10.8h, v2.8h\n"
232                                 "ldr q4, [%[outptr0], #0x20]\n"
233                                 "ldr q12, [%[inptr], #0x20]\n"
234                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
235                                 "fadd v11.8h, v11.8h, v3.8h\n"
236                                 "ldr q5, [%[outptr1]]\n"
237                                 "fmin v10.8h, v10.8h, v0.8h\n"
238                                 "ldr q13, [%[inptr], #0x30]\n"
239                                 "fadd v12.8h, v12.8h, v4.8h\n"
240                                 "ldr q6, [%[outptr1], #0x10]\n"
241                                 "ldr q14, [%[inptr], #0x40]\n"
242                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
243                                 "fmax v10.8h, v10.8h, v1.8h\n"
244                                 "ldr q7, [%[outptr1], #0x20]\n"
245                                 "fmin v11.8h, v11.8h, v0.8h\n"
246                                 "ldr q15, [%[inptr], #0x50]\n"
247                                 "fmin v12.8h, v12.8h, v0.8h\n"
248                                 "ldr q8, [%[outptr2]]\n"
249                                 "fadd v13.8h, v13.8h, v5.8h\n"
250                                 "str q10, [%[outptr0]]\n"
251                                 "fadd v14.8h, v14.8h, v6.8h\n"
252                                 "ldr q16, [%[inptr], #0x60]\n"
253                                 "fmax v11.8h, v11.8h, v1.8h\n"
254                                 "ldr q9, [%[outptr2], #0x10]\n"
255                                 "fmax v12.8h, v12.8h, v1.8h\n"
256                                 "ldr q17, [%[inptr], #0x70]\n"
257                                 "fmin v13.8h, v13.8h, v0.8h\n"
258                                 "ldr q2, [%[outptr2], #0x20]\n"
259                                 "fmin v14.8h, v14.8h, v0.8h\n"
260                                 "str q11, [%[outptr0], #0x10]\n"
261                                 "fadd v15.8h, v15.8h, v7.8h\n"
262                                 "ldr q10, [%[inptr], #0x80]\n"
263                                 "fadd v16.8h, v16.8h, v8.8h\n"
264                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
265                                 "fmax v13.8h, v13.8h, v1.8h\n"
266                                 "str q12, [%[outptr0], #0x20]\n"
267                                 "fmax v14.8h, v14.8h, v1.8h\n"
268                                 "add %[outptr0], %[outptr0], #0x30\n"
269                                 "fmin v15.8h, v15.8h, v0.8h\n"
270                                 "str q13, [%[outptr1]]\n"
271                                 "fmin v16.8h, v16.8h, v0.8h\n"
272                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
273                                 "fadd v17.8h, v17.8h, v9.8h\n"
274                                 "str q14, [%[outptr1], #0x10]\n"
275                                 "fmax v15.8h, v15.8h, v1.8h\n"
276                                 "add %[inptr], %[inptr], #0x180\n"
277                                 "fmax v16.8h, v16.8h, v1.8h\n"
278                                 "fmin v17.8h, v17.8h, v0.8h\n"
279                                 "str q15, [%[outptr1], #0x20]\n"
280                                 "fadd v10.8h, v10.8h, v2.8h\n"
281                                 "add %[outptr1], %[outptr1], #0x30\n"
282                                 "fmax v17.8h, v17.8h, v1.8h\n"
283                                 "str q16, [%[outptr2]]\n"
284                                 "fmin v10.8h, v10.8h, v0.8h\n"
285                                 "str q17, [%[outptr2], #0x10]\n"
286                                 "fmax v10.8h, v10.8h, v1.8h\n"
287                                 "str q10, [%[outptr2], #0x20]\n"
288                                 "add %[outptr2], %[outptr2], #0x30\n"
289                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
290                               [inptr] "+r" (inptr)
291                             : [minval] "w" (minval), [maxval] "w" (maxval)
292                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
293                             );
294                         }
295                     }
296                     break;
297 
298                 case 4:
299                     {
300                         if ((i+23) >= xmax)
301                         {
302                             for (int xi=0; xi<23; xi++)
303                             {
304                                 if ((i+xi) < xmax)
305                                 {
306                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
307                                     outptr0++;
308                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
309                                     outptr1++;
310                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
311                                     outptr2++;
312                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
313                                     outptr3++;
314                                 }
315                             }
316                             inptr += 192;
317                         } else {
318                             /* Optimized routine to copy an entire block */
319                             __asm __volatile (
320 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
321                                 ".arch  armv8.2-a+fp16\n"
322 #endif
323                                 "dup v0.8h, %[maxval].h[0]\n"
324                                 "ldr q2, [%[outptr0]]\n"
325                                 "dup v1.8h, %[minval].h[0]\n"
326                                 "ldr q10, [%[inptr]]\n"
327                                 "ldr q3, [%[outptr0], #0x10]\n"
328                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
329                                 "ldr q11, [%[inptr], #0x10]\n"
330                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
331                                 "fadd v10.8h, v10.8h, v2.8h\n"
332                                 "ldr q4, [%[outptr0], #0x20]\n"
333                                 "ldr q12, [%[inptr], #0x20]\n"
334                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
335                                 "fadd v11.8h, v11.8h, v3.8h\n"
336                                 "ldr q5, [%[outptr1]]\n"
337                                 "fmin v10.8h, v10.8h, v0.8h\n"
338                                 "ldr q13, [%[inptr], #0x30]\n"
339                                 "fadd v12.8h, v12.8h, v4.8h\n"
340                                 "ldr q6, [%[outptr1], #0x10]\n"
341                                 "ldr q14, [%[inptr], #0x40]\n"
342                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
343                                 "fmax v10.8h, v10.8h, v1.8h\n"
344                                 "ldr q7, [%[outptr1], #0x20]\n"
345                                 "fmin v11.8h, v11.8h, v0.8h\n"
346                                 "ldr q15, [%[inptr], #0x50]\n"
347                                 "fmin v12.8h, v12.8h, v0.8h\n"
348                                 "ldr q8, [%[outptr2]]\n"
349                                 "fadd v13.8h, v13.8h, v5.8h\n"
350                                 "str q10, [%[outptr0]]\n"
351                                 "fadd v14.8h, v14.8h, v6.8h\n"
352                                 "ldr q16, [%[inptr], #0x60]\n"
353                                 "fmax v11.8h, v11.8h, v1.8h\n"
354                                 "ldr q9, [%[outptr2], #0x10]\n"
355                                 "fmax v12.8h, v12.8h, v1.8h\n"
356                                 "ldr q17, [%[inptr], #0x70]\n"
357                                 "fmin v13.8h, v13.8h, v0.8h\n"
358                                 "ldr q2, [%[outptr2], #0x20]\n"
359                                 "fmin v14.8h, v14.8h, v0.8h\n"
360                                 "str q11, [%[outptr0], #0x10]\n"
361                                 "fadd v15.8h, v15.8h, v7.8h\n"
362                                 "ldr q10, [%[inptr], #0x80]\n"
363                                 "fadd v16.8h, v16.8h, v8.8h\n"
364                                 "ldr q3, [%[outptr3]]\n"
365                                 "fmax v13.8h, v13.8h, v1.8h\n"
366                                 "str q12, [%[outptr0], #0x20]\n"
367                                 "fmax v14.8h, v14.8h, v1.8h\n"
368                                 "ldr q11, [%[inptr], #0x90]\n"
369                                 "fmin v15.8h, v15.8h, v0.8h\n"
370                                 "ldr q4, [%[outptr3], #0x10]\n"
371                                 "fmin v16.8h, v16.8h, v0.8h\n"
372                                 "str q13, [%[outptr1]]\n"
373                                 "fadd v17.8h, v17.8h, v9.8h\n"
374                                 "ldr q12, [%[inptr], #0xa0]\n"
375                                 "fadd v10.8h, v10.8h, v2.8h\n"
376                                 "ldr q5, [%[outptr3], #0x20]\n"
377                                 "fmax v15.8h, v15.8h, v1.8h\n"
378                                 "str q14, [%[outptr1], #0x10]\n"
379                                 "fmax v16.8h, v16.8h, v1.8h\n"
380                                 "ldr q13, [%[inptr], #0xb0]\n"
381                                 "fmin v17.8h, v17.8h, v0.8h\n"
382                                 "add %[outptr0], %[outptr0], #0x30\n"
383                                 "fmin v10.8h, v10.8h, v0.8h\n"
384                                 "str q15, [%[outptr1], #0x20]\n"
385                                 "fadd v11.8h, v11.8h, v3.8h\n"
386                                 "add %[outptr1], %[outptr1], #0x30\n"
387                                 "fmax v17.8h, v17.8h, v1.8h\n"
388                                 "str q16, [%[outptr2]]\n"
389                                 "fmax v10.8h, v10.8h, v1.8h\n"
390                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
391                                 "fmin v11.8h, v11.8h, v0.8h\n"
392                                 "str q17, [%[outptr2], #0x10]\n"
393                                 "fadd v12.8h, v12.8h, v4.8h\n"
394                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
395                                 "fadd v13.8h, v13.8h, v5.8h\n"
396                                 "str q10, [%[outptr2], #0x20]\n"
397                                 "fmax v11.8h, v11.8h, v1.8h\n"
398                                 "add %[outptr2], %[outptr2], #0x30\n"
399                                 "fmin v12.8h, v12.8h, v0.8h\n"
400                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
401                                 "fmin v13.8h, v13.8h, v0.8h\n"
402                                 "str q11, [%[outptr3]]\n"
403                                 "add %[inptr], %[inptr], #0x180\n"
404                                 "fmax v12.8h, v12.8h, v1.8h\n"
405                                 "fmax v13.8h, v13.8h, v1.8h\n"
406                                 "str q12, [%[outptr3], #0x10]\n"
407                                 "str q13, [%[outptr3], #0x20]\n"
408                                 "add %[outptr3], %[outptr3], #0x30\n"
409                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
410                               [inptr] "+r" (inptr)
411                             : [minval] "w" (minval), [maxval] "w" (maxval)
412                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
413                             );
414                         }
415                     }
416                     break;
417 
418                 case 5:
419                     {
420                         if ((i+23) >= xmax)
421                         {
422                             for (int xi=0; xi<23; xi++)
423                             {
424                                 if ((i+xi) < xmax)
425                                 {
426                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
427                                     outptr0++;
428                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
429                                     outptr1++;
430                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
431                                     outptr2++;
432                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
433                                     outptr3++;
434                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
435                                     outptr4++;
436                                 }
437                             }
438                             inptr += 192;
439                         } else {
440                             /* Optimized routine to copy an entire block */
441                             __asm __volatile (
442 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
443                                 ".arch  armv8.2-a+fp16\n"
444 #endif
445                                 "dup v0.8h, %[maxval].h[0]\n"
446                                 "ldr q2, [%[outptr0]]\n"
447                                 "dup v1.8h, %[minval].h[0]\n"
448                                 "ldr q10, [%[inptr]]\n"
449                                 "ldr q3, [%[outptr0], #0x10]\n"
450                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
451                                 "ldr q11, [%[inptr], #0x10]\n"
452                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
453                                 "fadd v10.8h, v10.8h, v2.8h\n"
454                                 "ldr q4, [%[outptr0], #0x20]\n"
455                                 "ldr q12, [%[inptr], #0x20]\n"
456                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
457                                 "fadd v11.8h, v11.8h, v3.8h\n"
458                                 "ldr q5, [%[outptr1]]\n"
459                                 "fmin v10.8h, v10.8h, v0.8h\n"
460                                 "ldr q13, [%[inptr], #0x30]\n"
461                                 "fadd v12.8h, v12.8h, v4.8h\n"
462                                 "ldr q6, [%[outptr1], #0x10]\n"
463                                 "ldr q14, [%[inptr], #0x40]\n"
464                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
465                                 "fmax v10.8h, v10.8h, v1.8h\n"
466                                 "ldr q7, [%[outptr1], #0x20]\n"
467                                 "fmin v11.8h, v11.8h, v0.8h\n"
468                                 "ldr q15, [%[inptr], #0x50]\n"
469                                 "fmin v12.8h, v12.8h, v0.8h\n"
470                                 "ldr q8, [%[outptr2]]\n"
471                                 "fadd v13.8h, v13.8h, v5.8h\n"
472                                 "str q10, [%[outptr0]]\n"
473                                 "fadd v14.8h, v14.8h, v6.8h\n"
474                                 "ldr q16, [%[inptr], #0x60]\n"
475                                 "fmax v11.8h, v11.8h, v1.8h\n"
476                                 "ldr q9, [%[outptr2], #0x10]\n"
477                                 "fmax v12.8h, v12.8h, v1.8h\n"
478                                 "ldr q17, [%[inptr], #0x70]\n"
479                                 "fmin v13.8h, v13.8h, v0.8h\n"
480                                 "ldr q2, [%[outptr2], #0x20]\n"
481                                 "fmin v14.8h, v14.8h, v0.8h\n"
482                                 "str q11, [%[outptr0], #0x10]\n"
483                                 "fadd v15.8h, v15.8h, v7.8h\n"
484                                 "ldr q10, [%[inptr], #0x80]\n"
485                                 "fadd v16.8h, v16.8h, v8.8h\n"
486                                 "ldr q3, [%[outptr3]]\n"
487                                 "fmax v13.8h, v13.8h, v1.8h\n"
488                                 "str q12, [%[outptr0], #0x20]\n"
489                                 "fmax v14.8h, v14.8h, v1.8h\n"
490                                 "ldr q11, [%[inptr], #0x90]\n"
491                                 "fmin v15.8h, v15.8h, v0.8h\n"
492                                 "ldr q4, [%[outptr3], #0x10]\n"
493                                 "fmin v16.8h, v16.8h, v0.8h\n"
494                                 "str q13, [%[outptr1]]\n"
495                                 "fadd v17.8h, v17.8h, v9.8h\n"
496                                 "ldr q12, [%[inptr], #0xa0]\n"
497                                 "fadd v10.8h, v10.8h, v2.8h\n"
498                                 "ldr q5, [%[outptr3], #0x20]\n"
499                                 "fmax v15.8h, v15.8h, v1.8h\n"
500                                 "str q14, [%[outptr1], #0x10]\n"
501                                 "fmax v16.8h, v16.8h, v1.8h\n"
502                                 "ldr q13, [%[inptr], #0xb0]\n"
503                                 "fmin v17.8h, v17.8h, v0.8h\n"
504                                 "ldr q6, [%[outptr4]]\n"
505                                 "fmin v10.8h, v10.8h, v0.8h\n"
506                                 "str q15, [%[outptr1], #0x20]\n"
507                                 "fadd v11.8h, v11.8h, v3.8h\n"
508                                 "ldr q14, [%[inptr], #0xc0]\n"
509                                 "fadd v12.8h, v12.8h, v4.8h\n"
510                                 "ldr q7, [%[outptr4], #0x10]\n"
511                                 "fmax v17.8h, v17.8h, v1.8h\n"
512                                 "str q16, [%[outptr2]]\n"
513                                 "fmax v10.8h, v10.8h, v1.8h\n"
514                                 "ldr q15, [%[inptr], #0xd0]\n"
515                                 "fmin v11.8h, v11.8h, v0.8h\n"
516                                 "ldr q8, [%[outptr4], #0x20]\n"
517                                 "fmin v12.8h, v12.8h, v0.8h\n"
518                                 "str q17, [%[outptr2], #0x10]\n"
519                                 "fadd v13.8h, v13.8h, v5.8h\n"
520                                 "ldr q16, [%[inptr], #0xe0]\n"
521                                 "fadd v14.8h, v14.8h, v6.8h\n"
522                                 "add %[outptr0], %[outptr0], #0x30\n"
523                                 "fmax v11.8h, v11.8h, v1.8h\n"
524                                 "str q10, [%[outptr2], #0x20]\n"
525                                 "fmax v12.8h, v12.8h, v1.8h\n"
526                                 "add %[outptr1], %[outptr1], #0x30\n"
527                                 "fmin v13.8h, v13.8h, v0.8h\n"
528                                 "str q11, [%[outptr3]]\n"
529                                 "fmin v14.8h, v14.8h, v0.8h\n"
530                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
531                                 "fadd v15.8h, v15.8h, v7.8h\n"
532                                 "str q12, [%[outptr3], #0x10]\n"
533                                 "fmax v13.8h, v13.8h, v1.8h\n"
534                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
535                                 "fmax v14.8h, v14.8h, v1.8h\n"
536                                 "add %[outptr2], %[outptr2], #0x30\n"
537                                 "fmin v15.8h, v15.8h, v0.8h\n"
538                                 "str q13, [%[outptr3], #0x20]\n"
539                                 "fadd v16.8h, v16.8h, v8.8h\n"
540                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
541                                 "add %[outptr3], %[outptr3], #0x30\n"
542                                 "fmax v15.8h, v15.8h, v1.8h\n"
543                                 "str q14, [%[outptr4]]\n"
544                                 "fmin v16.8h, v16.8h, v0.8h\n"
545                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
546                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
547                                 "str q15, [%[outptr4], #0x10]\n"
548                                 "add %[inptr], %[inptr], #0x180\n"
549                                 "fmax v16.8h, v16.8h, v1.8h\n"
550                                 "str q16, [%[outptr4], #0x20]\n"
551                                 "add %[outptr4], %[outptr4], #0x30\n"
552                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
553                               [inptr] "+r" (inptr)
554                             : [minval] "w" (minval), [maxval] "w" (maxval)
555                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
556                             );
557                         }
558                     }
559                     break;
560 
561                 case 6:
                        // Six-row variant: accumulates the input tile into the rows addressed by
                        // outptr0..outptr5 and clamps each result to [minval, maxval].
                        // NOTE(review): presumably the enclosing switch selects on the number of
                        // valid output rows; the switch expression is outside this view - confirm.
562                     {
563                         if ((i+23) >= xmax)
564                         {
                                // Partial-width tail: this branch is taken only when xmax - i <= 23, so the
                                // xi < 23 bound together with the (i+xi) < xmax guard covers every remaining column.
565                             for (int xi=0; xi<23; xi++)
566                             {
567                                 if ((i+xi) < xmax)
568                                 {
                                        // Row r of the input tile starts at inptr[24 * r]; each output element
                                        // becomes min(max(minval, in + out), maxval) and its row pointer advances by one.
569                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
570                                     outptr0++;
571                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
572                                     outptr1++;
573                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
574                                     outptr2++;
575                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
576                                     outptr3++;
577                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
578                                     outptr4++;
579                                     *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + *outptr5)), maxval);
580                                     outptr5++;
581                                 }
582                             }
                                // Always step over a full 8 x 24 input tile (192 values), even though only
                                // six rows of it were consumed here.
583                             inptr += 192;
584                         } else {
585                             /* Optimized routine to merge (add, clamp, store) an entire 24-column block */
                                // For each of the six rows: load three 8-lane vectors from the output row and
                                // three from inptr, fadd them, fmin against maxval, fmax against minval, and
                                // store back to the same output offsets. Loads, arithmetic, stores and prfm
                                // prefetches of different rows are interleaved, so instruction order matters.
                                // NOTE(review): the scalar path applies max(minval) before min(maxval) while
                                // this path applies fmin(maxval) before fmax(minval); the two agree as long as
                                // minval <= maxval - confirm that callers guarantee this.
586                             __asm __volatile (
587 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
588                                 ".arch  armv8.2-a+fp16\n"
589 #endif
                                    // v0 = maxval and v1 = minval, each broadcast to all eight fp16 lanes.
590                                 "dup v0.8h, %[maxval].h[0]\n"
591                                 "ldr q2, [%[outptr0]]\n"
592                                 "dup v1.8h, %[minval].h[0]\n"
593                                 "ldr q10, [%[inptr]]\n"
594                                 "ldr q3, [%[outptr0], #0x10]\n"
595                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
596                                 "ldr q11, [%[inptr], #0x10]\n"
597                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
598                                 "fadd v10.8h, v10.8h, v2.8h\n"
599                                 "ldr q4, [%[outptr0], #0x20]\n"
600                                 "ldr q12, [%[inptr], #0x20]\n"
601                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
602                                 "fadd v11.8h, v11.8h, v3.8h\n"
603                                 "ldr q5, [%[outptr1]]\n"
604                                 "fmin v10.8h, v10.8h, v0.8h\n"
605                                 "ldr q13, [%[inptr], #0x30]\n"
606                                 "fadd v12.8h, v12.8h, v4.8h\n"
607                                 "ldr q6, [%[outptr1], #0x10]\n"
608                                 "ldr q14, [%[inptr], #0x40]\n"
609                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
610                                 "fmax v10.8h, v10.8h, v1.8h\n"
611                                 "ldr q7, [%[outptr1], #0x20]\n"
612                                 "fmin v11.8h, v11.8h, v0.8h\n"
613                                 "ldr q15, [%[inptr], #0x50]\n"
614                                 "fmin v12.8h, v12.8h, v0.8h\n"
615                                 "ldr q8, [%[outptr2]]\n"
616                                 "fadd v13.8h, v13.8h, v5.8h\n"
617                                 "str q10, [%[outptr0]]\n"
618                                 "fadd v14.8h, v14.8h, v6.8h\n"
619                                 "ldr q16, [%[inptr], #0x60]\n"
620                                 "fmax v11.8h, v11.8h, v1.8h\n"
621                                 "ldr q9, [%[outptr2], #0x10]\n"
622                                 "fmax v12.8h, v12.8h, v1.8h\n"
623                                 "ldr q17, [%[inptr], #0x70]\n"
624                                 "fmin v13.8h, v13.8h, v0.8h\n"
625                                 "ldr q2, [%[outptr2], #0x20]\n"
626                                 "fmin v14.8h, v14.8h, v0.8h\n"
627                                 "str q11, [%[outptr0], #0x10]\n"
628                                 "fadd v15.8h, v15.8h, v7.8h\n"
629                                 "ldr q10, [%[inptr], #0x80]\n"
630                                 "fadd v16.8h, v16.8h, v8.8h\n"
631                                 "ldr q3, [%[outptr3]]\n"
632                                 "fmax v13.8h, v13.8h, v1.8h\n"
633                                 "str q12, [%[outptr0], #0x20]\n"
634                                 "fmax v14.8h, v14.8h, v1.8h\n"
635                                 "ldr q11, [%[inptr], #0x90]\n"
636                                 "fmin v15.8h, v15.8h, v0.8h\n"
637                                 "ldr q4, [%[outptr3], #0x10]\n"
638                                 "fmin v16.8h, v16.8h, v0.8h\n"
639                                 "str q13, [%[outptr1]]\n"
640                                 "fadd v17.8h, v17.8h, v9.8h\n"
641                                 "ldr q12, [%[inptr], #0xa0]\n"
642                                 "fadd v10.8h, v10.8h, v2.8h\n"
643                                 "ldr q5, [%[outptr3], #0x20]\n"
644                                 "fmax v15.8h, v15.8h, v1.8h\n"
645                                 "str q14, [%[outptr1], #0x10]\n"
646                                 "fmax v16.8h, v16.8h, v1.8h\n"
647                                 "ldr q13, [%[inptr], #0xb0]\n"
648                                 "fmin v17.8h, v17.8h, v0.8h\n"
649                                 "ldr q6, [%[outptr4]]\n"
650                                 "fmin v10.8h, v10.8h, v0.8h\n"
651                                 "str q15, [%[outptr1], #0x20]\n"
652                                 "fadd v11.8h, v11.8h, v3.8h\n"
653                                 "ldr q14, [%[inptr], #0xc0]\n"
654                                 "fadd v12.8h, v12.8h, v4.8h\n"
655                                 "ldr q7, [%[outptr4], #0x10]\n"
656                                 "fmax v17.8h, v17.8h, v1.8h\n"
657                                 "str q16, [%[outptr2]]\n"
658                                 "fmax v10.8h, v10.8h, v1.8h\n"
659                                 "ldr q15, [%[inptr], #0xd0]\n"
660                                 "fmin v11.8h, v11.8h, v0.8h\n"
661                                 "ldr q8, [%[outptr4], #0x20]\n"
662                                 "fmin v12.8h, v12.8h, v0.8h\n"
663                                 "str q17, [%[outptr2], #0x10]\n"
664                                 "fadd v13.8h, v13.8h, v5.8h\n"
665                                 "ldr q16, [%[inptr], #0xe0]\n"
666                                 "fadd v14.8h, v14.8h, v6.8h\n"
667                                 "ldr q9, [%[outptr5]]\n"
668                                 "fmax v11.8h, v11.8h, v1.8h\n"
669                                 "str q10, [%[outptr2], #0x20]\n"
670                                 "fmax v12.8h, v12.8h, v1.8h\n"
671                                 "ldr q17, [%[inptr], #0xf0]\n"
672                                 "fmin v13.8h, v13.8h, v0.8h\n"
673                                 "ldr q2, [%[outptr5], #0x10]\n"
674                                 "fmin v14.8h, v14.8h, v0.8h\n"
675                                 "str q11, [%[outptr3]]\n"
676                                 "fadd v15.8h, v15.8h, v7.8h\n"
677                                 "ldr q10, [%[inptr], #0x100]\n"
678                                 "fadd v16.8h, v16.8h, v8.8h\n"
679                                 "ldr q3, [%[outptr5], #0x20]\n"
680                                 "fmax v13.8h, v13.8h, v1.8h\n"
681                                 "str q12, [%[outptr3], #0x10]\n"
682                                 "fmax v14.8h, v14.8h, v1.8h\n"
683                                 "ldr q11, [%[inptr], #0x110]\n"
684                                 "fmin v15.8h, v15.8h, v0.8h\n"
685                                 "add %[outptr0], %[outptr0], #0x30\n"
686                                 "fmin v16.8h, v16.8h, v0.8h\n"
687                                 "str q13, [%[outptr3], #0x20]\n"
688                                 "fadd v17.8h, v17.8h, v9.8h\n"
689                                 "add %[outptr1], %[outptr1], #0x30\n"
690                                 "fmax v15.8h, v15.8h, v1.8h\n"
691                                 "str q14, [%[outptr4]]\n"
692                                 "fmax v16.8h, v16.8h, v1.8h\n"
693                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
694                                 "fmin v17.8h, v17.8h, v0.8h\n"
695                                 "str q15, [%[outptr4], #0x10]\n"
696                                 "fadd v10.8h, v10.8h, v2.8h\n"
697                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
698                                 "fadd v11.8h, v11.8h, v3.8h\n"
699                                 "str q16, [%[outptr4], #0x20]\n"
700                                 "fmax v17.8h, v17.8h, v1.8h\n"
701                                 "add %[outptr2], %[outptr2], #0x30\n"
702                                 "fmin v10.8h, v10.8h, v0.8h\n"
703                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
704                                 "fmin v11.8h, v11.8h, v0.8h\n"
705                                 "str q17, [%[outptr5]]\n"
706                                 "add %[outptr3], %[outptr3], #0x30\n"
707                                 "fmax v10.8h, v10.8h, v1.8h\n"
708                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
709                                 "fmax v11.8h, v11.8h, v1.8h\n"
710                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
711                                 "str q10, [%[outptr5], #0x10]\n"
712                                 "add %[outptr4], %[outptr4], #0x30\n"
713                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
714                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
715                                 "str q11, [%[outptr5], #0x20]\n"
716                                 "add %[outptr5], %[outptr5], #0x30\n"
                                    // 0x180 bytes = 192 two-byte fp16 values: the same input advance as the scalar path.
717                                 "add %[inptr], %[inptr], #0x180\n"
                                // Every row pointer used above was advanced by 0x30 bytes (24 fp16 values).
                                // outptr6/outptr7 are bound as read-write operands but are never referenced by
                                // this variant, and v18-v20 are declared clobbered although only v0-v17 are used.
718                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
719                               [inptr] "+r" (inptr)
720                             : [minval] "w" (minval), [maxval] "w" (maxval)
721                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
722                             );
723                         }
724                     }
725                     break;
726 
727                 case 7:
                        // Seven-row variant: accumulates the input tile into the rows addressed by
                        // outptr0..outptr6 and clamps each result to [minval, maxval].
                        // NOTE(review): presumably the enclosing switch selects on the number of
                        // valid output rows; the switch expression is outside this view - confirm.
728                     {
729                         if ((i+23) >= xmax)
730                         {
                                // Partial-width tail: this branch is taken only when xmax - i <= 23, so the
                                // xi < 23 bound together with the (i+xi) < xmax guard covers every remaining column.
731                             for (int xi=0; xi<23; xi++)
732                             {
733                                 if ((i+xi) < xmax)
734                                 {
                                        // Row r of the input tile starts at inptr[24 * r]; each output element
                                        // becomes min(max(minval, in + out), maxval) and its row pointer advances by one.
735                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
736                                     outptr0++;
737                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
738                                     outptr1++;
739                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
740                                     outptr2++;
741                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
742                                     outptr3++;
743                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
744                                     outptr4++;
745                                     *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + *outptr5)), maxval);
746                                     outptr5++;
747                                     *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + *outptr6)), maxval);
748                                     outptr6++;
749                                 }
750                             }
                                // Always step over a full 8 x 24 input tile (192 values), even though only
                                // seven rows of it were consumed here.
751                             inptr += 192;
752                         } else {
753                             /* Optimized routine to merge (add, clamp, store) an entire 24-column block */
                                // For each of the seven rows: load three 8-lane vectors from the output row and
                                // three from inptr, fadd them, fmin against maxval, fmax against minval, and
                                // store back to the same output offsets. Loads, arithmetic, stores and prfm
                                // prefetches of different rows are interleaved, so instruction order matters.
                                // NOTE(review): the scalar path applies max(minval) before min(maxval) while
                                // this path applies fmin(maxval) before fmax(minval); the two agree as long as
                                // minval <= maxval - confirm that callers guarantee this.
754                             __asm __volatile (
755 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
756                                 ".arch  armv8.2-a+fp16\n"
757 #endif
                                    // v0 = maxval and v1 = minval, each broadcast to all eight fp16 lanes.
758                                 "dup v0.8h, %[maxval].h[0]\n"
759                                 "ldr q2, [%[outptr0]]\n"
760                                 "dup v1.8h, %[minval].h[0]\n"
761                                 "ldr q10, [%[inptr]]\n"
762                                 "ldr q3, [%[outptr0], #0x10]\n"
763                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
764                                 "ldr q11, [%[inptr], #0x10]\n"
765                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
766                                 "fadd v10.8h, v10.8h, v2.8h\n"
767                                 "ldr q4, [%[outptr0], #0x20]\n"
768                                 "ldr q12, [%[inptr], #0x20]\n"
769                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
770                                 "fadd v11.8h, v11.8h, v3.8h\n"
771                                 "ldr q5, [%[outptr1]]\n"
772                                 "fmin v10.8h, v10.8h, v0.8h\n"
773                                 "ldr q13, [%[inptr], #0x30]\n"
774                                 "fadd v12.8h, v12.8h, v4.8h\n"
775                                 "ldr q6, [%[outptr1], #0x10]\n"
776                                 "ldr q14, [%[inptr], #0x40]\n"
777                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
778                                 "fmax v10.8h, v10.8h, v1.8h\n"
779                                 "ldr q7, [%[outptr1], #0x20]\n"
780                                 "fmin v11.8h, v11.8h, v0.8h\n"
781                                 "ldr q15, [%[inptr], #0x50]\n"
782                                 "fmin v12.8h, v12.8h, v0.8h\n"
783                                 "ldr q8, [%[outptr2]]\n"
784                                 "fadd v13.8h, v13.8h, v5.8h\n"
785                                 "str q10, [%[outptr0]]\n"
786                                 "fadd v14.8h, v14.8h, v6.8h\n"
787                                 "ldr q16, [%[inptr], #0x60]\n"
788                                 "fmax v11.8h, v11.8h, v1.8h\n"
789                                 "ldr q9, [%[outptr2], #0x10]\n"
790                                 "fmax v12.8h, v12.8h, v1.8h\n"
791                                 "ldr q17, [%[inptr], #0x70]\n"
792                                 "fmin v13.8h, v13.8h, v0.8h\n"
793                                 "ldr q2, [%[outptr2], #0x20]\n"
794                                 "fmin v14.8h, v14.8h, v0.8h\n"
795                                 "str q11, [%[outptr0], #0x10]\n"
796                                 "fadd v15.8h, v15.8h, v7.8h\n"
797                                 "ldr q10, [%[inptr], #0x80]\n"
798                                 "fadd v16.8h, v16.8h, v8.8h\n"
799                                 "ldr q3, [%[outptr3]]\n"
800                                 "fmax v13.8h, v13.8h, v1.8h\n"
801                                 "str q12, [%[outptr0], #0x20]\n"
802                                 "fmax v14.8h, v14.8h, v1.8h\n"
803                                 "ldr q11, [%[inptr], #0x90]\n"
804                                 "fmin v15.8h, v15.8h, v0.8h\n"
805                                 "ldr q4, [%[outptr3], #0x10]\n"
806                                 "fmin v16.8h, v16.8h, v0.8h\n"
807                                 "str q13, [%[outptr1]]\n"
808                                 "fadd v17.8h, v17.8h, v9.8h\n"
809                                 "ldr q12, [%[inptr], #0xa0]\n"
810                                 "fadd v10.8h, v10.8h, v2.8h\n"
811                                 "ldr q5, [%[outptr3], #0x20]\n"
812                                 "fmax v15.8h, v15.8h, v1.8h\n"
813                                 "str q14, [%[outptr1], #0x10]\n"
814                                 "fmax v16.8h, v16.8h, v1.8h\n"
815                                 "ldr q13, [%[inptr], #0xb0]\n"
816                                 "fmin v17.8h, v17.8h, v0.8h\n"
817                                 "ldr q6, [%[outptr4]]\n"
818                                 "fmin v10.8h, v10.8h, v0.8h\n"
819                                 "str q15, [%[outptr1], #0x20]\n"
820                                 "fadd v11.8h, v11.8h, v3.8h\n"
821                                 "ldr q14, [%[inptr], #0xc0]\n"
822                                 "fadd v12.8h, v12.8h, v4.8h\n"
823                                 "ldr q7, [%[outptr4], #0x10]\n"
824                                 "fmax v17.8h, v17.8h, v1.8h\n"
825                                 "str q16, [%[outptr2]]\n"
826                                 "fmax v10.8h, v10.8h, v1.8h\n"
827                                 "ldr q15, [%[inptr], #0xd0]\n"
828                                 "fmin v11.8h, v11.8h, v0.8h\n"
829                                 "ldr q8, [%[outptr4], #0x20]\n"
830                                 "fmin v12.8h, v12.8h, v0.8h\n"
831                                 "str q17, [%[outptr2], #0x10]\n"
832                                 "fadd v13.8h, v13.8h, v5.8h\n"
833                                 "ldr q16, [%[inptr], #0xe0]\n"
834                                 "fadd v14.8h, v14.8h, v6.8h\n"
835                                 "ldr q9, [%[outptr5]]\n"
836                                 "fmax v11.8h, v11.8h, v1.8h\n"
837                                 "str q10, [%[outptr2], #0x20]\n"
838                                 "fmax v12.8h, v12.8h, v1.8h\n"
839                                 "ldr q17, [%[inptr], #0xf0]\n"
840                                 "fmin v13.8h, v13.8h, v0.8h\n"
841                                 "ldr q2, [%[outptr5], #0x10]\n"
842                                 "fmin v14.8h, v14.8h, v0.8h\n"
843                                 "str q11, [%[outptr3]]\n"
844                                 "fadd v15.8h, v15.8h, v7.8h\n"
845                                 "ldr q10, [%[inptr], #0x100]\n"
846                                 "fadd v16.8h, v16.8h, v8.8h\n"
847                                 "ldr q3, [%[outptr5], #0x20]\n"
848                                 "fmax v13.8h, v13.8h, v1.8h\n"
849                                 "str q12, [%[outptr3], #0x10]\n"
850                                 "fmax v14.8h, v14.8h, v1.8h\n"
851                                 "ldr q11, [%[inptr], #0x110]\n"
852                                 "fmin v15.8h, v15.8h, v0.8h\n"
853                                 "ldr q4, [%[outptr6]]\n"
854                                 "fmin v16.8h, v16.8h, v0.8h\n"
855                                 "str q13, [%[outptr3], #0x20]\n"
856                                 "fadd v17.8h, v17.8h, v9.8h\n"
857                                 "ldr q12, [%[inptr], #0x120]\n"
858                                 "fadd v10.8h, v10.8h, v2.8h\n"
859                                 "ldr q5, [%[outptr6], #0x10]\n"
860                                 "fmax v15.8h, v15.8h, v1.8h\n"
861                                 "str q14, [%[outptr4]]\n"
862                                 "fmax v16.8h, v16.8h, v1.8h\n"
863                                 "ldr q13, [%[inptr], #0x130]\n"
864                                 "fmin v17.8h, v17.8h, v0.8h\n"
865                                 "ldr q6, [%[outptr6], #0x20]\n"
866                                 "fmin v10.8h, v10.8h, v0.8h\n"
867                                 "str q15, [%[outptr4], #0x10]\n"
868                                 "fadd v11.8h, v11.8h, v3.8h\n"
869                                 "ldr q14, [%[inptr], #0x140]\n"
870                                 "fadd v12.8h, v12.8h, v4.8h\n"
871                                 "add %[outptr0], %[outptr0], #0x30\n"
872                                 "fmax v17.8h, v17.8h, v1.8h\n"
873                                 "str q16, [%[outptr4], #0x20]\n"
874                                 "fmax v10.8h, v10.8h, v1.8h\n"
875                                 "add %[outptr1], %[outptr1], #0x30\n"
876                                 "fmin v11.8h, v11.8h, v0.8h\n"
877                                 "str q17, [%[outptr5]]\n"
878                                 "fmin v12.8h, v12.8h, v0.8h\n"
879                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
880                                 "fadd v13.8h, v13.8h, v5.8h\n"
881                                 "str q10, [%[outptr5], #0x10]\n"
882                                 "fmax v11.8h, v11.8h, v1.8h\n"
883                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
884                                 "fmax v12.8h, v12.8h, v1.8h\n"
885                                 "add %[outptr2], %[outptr2], #0x30\n"
886                                 "fmin v13.8h, v13.8h, v0.8h\n"
887                                 "str q11, [%[outptr5], #0x20]\n"
888                                 "fadd v14.8h, v14.8h, v6.8h\n"
889                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
890                                 "add %[outptr3], %[outptr3], #0x30\n"
891                                 "fmax v13.8h, v13.8h, v1.8h\n"
892                                 "str q12, [%[outptr6]]\n"
893                                 "fmin v14.8h, v14.8h, v0.8h\n"
894                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
895                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
896                                 "str q13, [%[outptr6], #0x10]\n"
897                                 "add %[outptr4], %[outptr4], #0x30\n"
898                                 "fmax v14.8h, v14.8h, v1.8h\n"
899                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
900                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
901                                 "add %[outptr5], %[outptr5], #0x30\n"
902                                 "str q14, [%[outptr6], #0x20]\n"
903                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
904                                 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
905                                 "add %[outptr6], %[outptr6], #0x30\n"
                                    // 0x180 bytes = 192 two-byte fp16 values: the same input advance as the scalar path.
906                                 "add %[inptr], %[inptr], #0x180\n"
                                // Every row pointer used above was advanced by 0x30 bytes (24 fp16 values).
                                // outptr7 is bound as a read-write operand but is never referenced by this
                                // variant, and v18-v20 are declared clobbered although only v0-v17 are used.
907                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
908                               [inptr] "+r" (inptr)
909                             : [minval] "w" (minval), [maxval] "w" (maxval)
910                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
911                             );
912                         }
913                     }
914                     break;
915 
916                 default:
917                 case 8:
918                     {
919                         if ((i+23) >= xmax)
920                         {
921                             for (int xi=0; xi<23; xi++)
922                             {
923                                 if ((i+xi) < xmax)
924                                 {
925                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
926                                     outptr0++;
927                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
928                                     outptr1++;
929                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
930                                     outptr2++;
931                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
932                                     outptr3++;
933                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
934                                     outptr4++;
935                                     *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + *outptr5)), maxval);
936                                     outptr5++;
937                                     *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + *outptr6)), maxval);
938                                     outptr6++;
939                                     *outptr7 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 168] + *outptr7)), maxval);
940                                     outptr7++;
941                                 }
942                             }
943                             inptr += 192;
944                         } else {
945                             /* Optimized routine to copy an entire block */
946                             __asm __volatile (
947 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
948                                 ".arch  armv8.2-a+fp16\n"
949 #endif
950                                 "dup v0.8h, %[maxval].h[0]\n"
951                                 "ldr q2, [%[outptr0]]\n"
952                                 "dup v1.8h, %[minval].h[0]\n"
953                                 "ldr q10, [%[inptr]]\n"
954                                 "ldr q3, [%[outptr0], #0x10]\n"
955                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
956                                 "ldr q11, [%[inptr], #0x10]\n"
957                                 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
958                                 "fadd v10.8h, v10.8h, v2.8h\n"
959                                 "ldr q4, [%[outptr0], #0x20]\n"
960                                 "ldr q12, [%[inptr], #0x20]\n"
961                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
962                                 "fadd v11.8h, v11.8h, v3.8h\n"
963                                 "ldr q5, [%[outptr1]]\n"
964                                 "fmin v10.8h, v10.8h, v0.8h\n"
965                                 "ldr q13, [%[inptr], #0x30]\n"
966                                 "fadd v12.8h, v12.8h, v4.8h\n"
967                                 "ldr q6, [%[outptr1], #0x10]\n"
968                                 "ldr q14, [%[inptr], #0x40]\n"
969                                 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
970                                 "fmax v10.8h, v10.8h, v1.8h\n"
971                                 "ldr q7, [%[outptr1], #0x20]\n"
972                                 "fmin v11.8h, v11.8h, v0.8h\n"
973                                 "ldr q15, [%[inptr], #0x50]\n"
974                                 "fmin v12.8h, v12.8h, v0.8h\n"
975                                 "ldr q8, [%[outptr2]]\n"
976                                 "fadd v13.8h, v13.8h, v5.8h\n"
977                                 "str q10, [%[outptr0]]\n"
978                                 "fadd v14.8h, v14.8h, v6.8h\n"
979                                 "ldr q16, [%[inptr], #0x60]\n"
980                                 "fmax v11.8h, v11.8h, v1.8h\n"
981                                 "ldr q9, [%[outptr2], #0x10]\n"
982                                 "fmax v12.8h, v12.8h, v1.8h\n"
983                                 "ldr q17, [%[inptr], #0x70]\n"
984                                 "fmin v13.8h, v13.8h, v0.8h\n"
985                                 "ldr q2, [%[outptr2], #0x20]\n"
986                                 "fmin v14.8h, v14.8h, v0.8h\n"
987                                 "str q11, [%[outptr0], #0x10]\n"
988                                 "fadd v15.8h, v15.8h, v7.8h\n"
989                                 "ldr q10, [%[inptr], #0x80]\n"
990                                 "fadd v16.8h, v16.8h, v8.8h\n"
991                                 "ldr q3, [%[outptr3]]\n"
992                                 "fmax v13.8h, v13.8h, v1.8h\n"
993                                 "str q12, [%[outptr0], #0x20]\n"
994                                 "fmax v14.8h, v14.8h, v1.8h\n"
995                                 "ldr q11, [%[inptr], #0x90]\n"
996                                 "fmin v15.8h, v15.8h, v0.8h\n"
997                                 "ldr q4, [%[outptr3], #0x10]\n"
998                                 "fmin v16.8h, v16.8h, v0.8h\n"
999                                 "str q13, [%[outptr1]]\n"
1000                                 "fadd v17.8h, v17.8h, v9.8h\n"
1001                                 "ldr q12, [%[inptr], #0xa0]\n"
1002                                 "fadd v10.8h, v10.8h, v2.8h\n"
1003                                 "ldr q5, [%[outptr3], #0x20]\n"
1004                                 "fmax v15.8h, v15.8h, v1.8h\n"
1005                                 "str q14, [%[outptr1], #0x10]\n"
1006                                 "fmax v16.8h, v16.8h, v1.8h\n"
1007                                 "ldr q13, [%[inptr], #0xb0]\n"
1008                                 "fmin v17.8h, v17.8h, v0.8h\n"
1009                                 "ldr q6, [%[outptr4]]\n"
1010                                 "fmin v10.8h, v10.8h, v0.8h\n"
1011                                 "str q15, [%[outptr1], #0x20]\n"
1012                                 "fadd v11.8h, v11.8h, v3.8h\n"
1013                                 "ldr q14, [%[inptr], #0xc0]\n"
1014                                 "fadd v12.8h, v12.8h, v4.8h\n"
1015                                 "ldr q7, [%[outptr4], #0x10]\n"
1016                                 "fmax v17.8h, v17.8h, v1.8h\n"
1017                                 "str q16, [%[outptr2]]\n"
1018                                 "fmax v10.8h, v10.8h, v1.8h\n"
1019                                 "ldr q15, [%[inptr], #0xd0]\n"
1020                                 "fmin v11.8h, v11.8h, v0.8h\n"
1021                                 "ldr q8, [%[outptr4], #0x20]\n"
1022                                 "fmin v12.8h, v12.8h, v0.8h\n"
1023                                 "str q17, [%[outptr2], #0x10]\n"
1024                                 "fadd v13.8h, v13.8h, v5.8h\n"
1025                                 "ldr q16, [%[inptr], #0xe0]\n"
1026                                 "fadd v14.8h, v14.8h, v6.8h\n"
1027                                 "ldr q9, [%[outptr5]]\n"
1028                                 "fmax v11.8h, v11.8h, v1.8h\n"
1029                                 "str q10, [%[outptr2], #0x20]\n"
1030                                 "fmax v12.8h, v12.8h, v1.8h\n"
1031                                 "ldr q17, [%[inptr], #0xf0]\n"
1032                                 "fmin v13.8h, v13.8h, v0.8h\n"
1033                                 "ldr q2, [%[outptr5], #0x10]\n"
1034                                 "fmin v14.8h, v14.8h, v0.8h\n"
1035                                 "str q11, [%[outptr3]]\n"
1036                                 "fadd v15.8h, v15.8h, v7.8h\n"
1037                                 "ldr q10, [%[inptr], #0x100]\n"
1038                                 "fadd v16.8h, v16.8h, v8.8h\n"
1039                                 "ldr q3, [%[outptr5], #0x20]\n"
1040                                 "fmax v13.8h, v13.8h, v1.8h\n"
1041                                 "str q12, [%[outptr3], #0x10]\n"
1042                                 "fmax v14.8h, v14.8h, v1.8h\n"
1043                                 "ldr q11, [%[inptr], #0x110]\n"
1044                                 "fmin v15.8h, v15.8h, v0.8h\n"
1045                                 "ldr q4, [%[outptr6]]\n"
1046                                 "fmin v16.8h, v16.8h, v0.8h\n"
1047                                 "str q13, [%[outptr3], #0x20]\n"
1048                                 "fadd v17.8h, v17.8h, v9.8h\n"
1049                                 "ldr q12, [%[inptr], #0x120]\n"
1050                                 "fadd v10.8h, v10.8h, v2.8h\n"
1051                                 "ldr q5, [%[outptr6], #0x10]\n"
1052                                 "fmax v15.8h, v15.8h, v1.8h\n"
1053                                 "str q14, [%[outptr4]]\n"
1054                                 "fmax v16.8h, v16.8h, v1.8h\n"
1055                                 "ldr q13, [%[inptr], #0x130]\n"
1056                                 "fmin v17.8h, v17.8h, v0.8h\n"
1057                                 "ldr q6, [%[outptr6], #0x20]\n"
1058                                 "fmin v10.8h, v10.8h, v0.8h\n"
1059                                 "str q15, [%[outptr4], #0x10]\n"
1060                                 "fadd v11.8h, v11.8h, v3.8h\n"
1061                                 "ldr q14, [%[inptr], #0x140]\n"
1062                                 "fadd v12.8h, v12.8h, v4.8h\n"
1063                                 "ldr q7, [%[outptr7]]\n"
1064                                 "fmax v17.8h, v17.8h, v1.8h\n"
1065                                 "str q16, [%[outptr4], #0x20]\n"
1066                                 "fmax v10.8h, v10.8h, v1.8h\n"
1067                                 "ldr q15, [%[inptr], #0x150]\n"
1068                                 "fmin v11.8h, v11.8h, v0.8h\n"
1069                                 "ldr q8, [%[outptr7], #0x10]\n"
1070                                 "fmin v12.8h, v12.8h, v0.8h\n"
1071                                 "str q17, [%[outptr5]]\n"
1072                                 "fadd v13.8h, v13.8h, v5.8h\n"
1073                                 "ldr q16, [%[inptr], #0x160]\n"
1074                                 "fadd v14.8h, v14.8h, v6.8h\n"
1075                                 "ldr q9, [%[outptr7], #0x20]\n"
1076                                 "fmax v11.8h, v11.8h, v1.8h\n"
1077                                 "str q10, [%[outptr5], #0x10]\n"
1078                                 "fmax v12.8h, v12.8h, v1.8h\n"
1079                                 "ldr q17, [%[inptr], #0x170]\n"
1080                                 "fmin v13.8h, v13.8h, v0.8h\n"
1081                                 "add %[outptr0], %[outptr0], #0x30\n"
1082                                 "fmin v14.8h, v14.8h, v0.8h\n"
1083                                 "str q11, [%[outptr5], #0x20]\n"
1084                                 "fadd v15.8h, v15.8h, v7.8h\n"
1085                                 "add %[outptr1], %[outptr1], #0x30\n"
1086                                 "fmax v13.8h, v13.8h, v1.8h\n"
1087                                 "str q12, [%[outptr6]]\n"
1088                                 "fmax v14.8h, v14.8h, v1.8h\n"
1089                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1090                                 "fmin v15.8h, v15.8h, v0.8h\n"
1091                                 "str q13, [%[outptr6], #0x10]\n"
1092                                 "fadd v16.8h, v16.8h, v8.8h\n"
1093                                 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1094                                 "fadd v17.8h, v17.8h, v9.8h\n"
1095                                 "str q14, [%[outptr6], #0x20]\n"
1096                                 "fmax v15.8h, v15.8h, v1.8h\n"
1097                                 "add %[outptr2], %[outptr2], #0x30\n"
1098                                 "fmin v16.8h, v16.8h, v0.8h\n"
1099                                 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1100                                 "fmin v17.8h, v17.8h, v0.8h\n"
1101                                 "str q15, [%[outptr7]]\n"
1102                                 "add %[outptr3], %[outptr3], #0x30\n"
1103                                 "fmax v16.8h, v16.8h, v1.8h\n"
1104                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1105                                 "fmax v17.8h, v17.8h, v1.8h\n"
1106                                 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1107                                 "str q16, [%[outptr7], #0x10]\n"
1108                                 "add %[outptr4], %[outptr4], #0x30\n"
1109                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1110                                 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1111                                 "str q17, [%[outptr7], #0x20]\n"
1112                                 "add %[outptr5], %[outptr5], #0x30\n"
1113                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1114                                 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1115                                 "add %[outptr6], %[outptr6], #0x30\n"
1116                                 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
1117                                 "add %[outptr7], %[outptr7], #0x30\n"
1118                                 "add %[inptr], %[inptr], #0x180\n"
1119                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1120                               [inptr] "+r" (inptr)
1121                             : [minval] "w" (minval), [maxval] "w" (maxval)
1122                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1123                             );
1124                         }
1125                     }
1126                     break;
1127 
1128 
1129                 }
1130             }
1131             else
1132             {
1133                 const __fp16 *biasptr = bias ? bias + i : nullbias;
1134 
1135                 switch(height)
1136                 {
1137                 case 1:
1138                     {
1139                         if ((i+23) >= xmax)
1140                         {
1141                             for (int xi=0; xi<23; xi++)
1142                             {
1143                                 if ((i+xi) < xmax)
1144                                 {
1145                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1146                                     outptr0++;
1147                                 }
1148                             }
1149                             inptr += 192;
1150                         } else {
1151                             /* Optimized routine: add bias to an entire 24-column block of one row, clamp to [minval, maxval] and store it */
1152                             __asm __volatile (
1153 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1154                                 ".arch  armv8.2-a+fp16\n"
1155 #endif
1156                                 "dup v0.8h, %[maxval].h[0]\n"
1157                                 "ldr q2, [%[biasptr]]\n"
1158                                 "dup v1.8h, %[minval].h[0]\n"
1159                                 "ldr q3, [%[biasptr], #0x10]\n"
1160                                 "ldr q4, [%[biasptr], #0x20]\n"
1161                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1162                                 "ldr q13, [%[inptr]]\n"
1163                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1164                                 "ldr q14, [%[inptr], #0x10]\n"
1165                                 "ldr q15, [%[inptr], #0x20]\n"
1166                                 "add %[inptr], %[inptr], #0x180\n"
1167                                 "fadd v13.8h, v13.8h, v2.8h\n"
1168                                 "fadd v14.8h, v14.8h, v3.8h\n"
1169                                 "fadd v15.8h, v15.8h, v4.8h\n"
1170                                 "fmin v13.8h, v13.8h, v0.8h\n"
1171                                 "fmin v14.8h, v14.8h, v0.8h\n"
1172                                 "fmin v15.8h, v15.8h, v0.8h\n"
1173                                 "fmax v13.8h, v13.8h, v1.8h\n"
1174                                 "fmax v14.8h, v14.8h, v1.8h\n"
1175                                 "fmax v15.8h, v15.8h, v1.8h\n"
1176                                 "str q13, [%[outptr0]]\n"
1177                                 "str q14, [%[outptr0], #0x10]\n"
1178                                 "str q15, [%[outptr0], #0x20]\n"
1179                                 "add %[outptr0], %[outptr0], #0x30\n"
1180                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1181                               [inptr] "+r" (inptr)
1182                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1183                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1184                             );
1185                         }
1186                     }
1187                     break;
1188 
1189                 case 2:
1190                     {
1191                         if ((i+23) >= xmax)
1192                         {
1193                             for (int xi=0; xi<23; xi++)
1194                             {
1195                                 if ((i+xi) < xmax)
1196                                 {
1197                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1198                                     outptr0++;
1199                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1200                                     outptr1++;
1201                                 }
1202                             }
1203                             inptr += 192;
1204                         } else {
1205                             /* Optimized routine: add bias to an entire 24-column block of two rows, clamp to [minval, maxval] and store it */
1206                             __asm __volatile (
1207 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1208                                 ".arch  armv8.2-a+fp16\n"
1209 #endif
1210                                 "dup v0.8h, %[maxval].h[0]\n"
1211                                 "ldr q2, [%[biasptr]]\n"
1212                                 "dup v1.8h, %[minval].h[0]\n"
1213                                 "ldr q3, [%[biasptr], #0x10]\n"
1214                                 "ldr q4, [%[biasptr], #0x20]\n"
1215                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1216                                 "ldr q13, [%[inptr]]\n"
1217                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1218                                 "ldr q14, [%[inptr], #0x10]\n"
1219                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1220                                 "fadd v13.8h, v13.8h, v2.8h\n"
1221                                 "ldr q15, [%[inptr], #0x20]\n"
1222                                 "ldr q16, [%[inptr], #0x30]\n"
1223                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1224                                 "fadd v14.8h, v14.8h, v3.8h\n"
1225                                 "ldr q17, [%[inptr], #0x40]\n"
1226                                 "fmin v13.8h, v13.8h, v0.8h\n"
1227                                 "ldr q18, [%[inptr], #0x50]\n"
1228                                 "fadd v15.8h, v15.8h, v4.8h\n"
1229                                 "add %[inptr], %[inptr], #0x180\n"
1230                                 "fmin v14.8h, v14.8h, v0.8h\n"
1231                                 "fmax v13.8h, v13.8h, v1.8h\n"
1232                                 "fmin v15.8h, v15.8h, v0.8h\n"
1233                                 "fadd v16.8h, v16.8h, v2.8h\n"
1234                                 "fmax v14.8h, v14.8h, v1.8h\n"
1235                                 "str q13, [%[outptr0]]\n"
1236                                 "fadd v17.8h, v17.8h, v3.8h\n"
1237                                 "fmax v15.8h, v15.8h, v1.8h\n"
1238                                 "fmin v16.8h, v16.8h, v0.8h\n"
1239                                 "str q14, [%[outptr0], #0x10]\n"
1240                                 "fadd v18.8h, v18.8h, v4.8h\n"
1241                                 "fmin v17.8h, v17.8h, v0.8h\n"
1242                                 "fmax v16.8h, v16.8h, v1.8h\n"
1243                                 "str q15, [%[outptr0], #0x20]\n"
1244                                 "fmin v18.8h, v18.8h, v0.8h\n"
1245                                 "add %[outptr0], %[outptr0], #0x30\n"
1246                                 "fmax v17.8h, v17.8h, v1.8h\n"
1247                                 "str q16, [%[outptr1]]\n"
1248                                 "fmax v18.8h, v18.8h, v1.8h\n"
1249                                 "str q17, [%[outptr1], #0x10]\n"
1250                                 "str q18, [%[outptr1], #0x20]\n"
1251                                 "add %[outptr1], %[outptr1], #0x30\n"
1252                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1253                               [inptr] "+r" (inptr)
1254                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1255                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1256                             );
1257                         }
1258                     }
1259                     break;
1260 
1261                 case 3:
1262                     {
1263                         if ((i+23) >= xmax)
1264                         {
1265                             for (int xi=0; xi<23; xi++)
1266                             {
1267                                 if ((i+xi) < xmax)
1268                                 {
1269                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1270                                     outptr0++;
1271                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1272                                     outptr1++;
1273                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1274                                     outptr2++;
1275                                 }
1276                             }
1277                             inptr += 192;
1278                         } else {
1279                             /* Optimized routine: add bias to an entire 24-column block of three rows, clamp to [minval, maxval] and store it */
1280                             __asm __volatile (
1281 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1282                                 ".arch  armv8.2-a+fp16\n"
1283 #endif
1284                                 "dup v0.8h, %[maxval].h[0]\n"
1285                                 "ldr q2, [%[biasptr]]\n"
1286                                 "dup v1.8h, %[minval].h[0]\n"
1287                                 "ldr q3, [%[biasptr], #0x10]\n"
1288                                 "ldr q4, [%[biasptr], #0x20]\n"
1289                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1290                                 "ldr q13, [%[inptr]]\n"
1291                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1292                                 "ldr q14, [%[inptr], #0x10]\n"
1293                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1294                                 "fadd v13.8h, v13.8h, v2.8h\n"
1295                                 "ldr q15, [%[inptr], #0x20]\n"
1296                                 "ldr q16, [%[inptr], #0x30]\n"
1297                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1298                                 "fadd v14.8h, v14.8h, v3.8h\n"
1299                                 "ldr q17, [%[inptr], #0x40]\n"
1300                                 "fmin v13.8h, v13.8h, v0.8h\n"
1301                                 "ldr q18, [%[inptr], #0x50]\n"
1302                                 "fadd v15.8h, v15.8h, v4.8h\n"
1303                                 "ldr q19, [%[inptr], #0x60]\n"
1304                                 "fadd v16.8h, v16.8h, v2.8h\n"
1305                                 "ldr q20, [%[inptr], #0x70]\n"
1306                                 "fmin v14.8h, v14.8h, v0.8h\n"
1307                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1308                                 "fmax v13.8h, v13.8h, v1.8h\n"
1309                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1310                                 "fmax v14.8h, v14.8h, v1.8h\n"
1311                                 "fmin v15.8h, v15.8h, v0.8h\n"
1312                                 "str q13, [%[outptr0]]\n"
1313                                 "fmin v16.8h, v16.8h, v0.8h\n"
1314                                 "ldr q13, [%[inptr], #0x80]\n"
1315                                 "fadd v17.8h, v17.8h, v3.8h\n"
1316                                 "add %[inptr], %[inptr], #0x180\n"
1317                                 "fmax v15.8h, v15.8h, v1.8h\n"
1318                                 "str q14, [%[outptr0], #0x10]\n"
1319                                 "fmax v16.8h, v16.8h, v1.8h\n"
1320                                 "fmin v17.8h, v17.8h, v0.8h\n"
1321                                 "fadd v18.8h, v18.8h, v4.8h\n"
1322                                 "str q15, [%[outptr0], #0x20]\n"
1323                                 "fadd v19.8h, v19.8h, v2.8h\n"
1324                                 "add %[outptr0], %[outptr0], #0x30\n"
1325                                 "fmax v17.8h, v17.8h, v1.8h\n"
1326                                 "str q16, [%[outptr1]]\n"
1327                                 "fmin v18.8h, v18.8h, v0.8h\n"
1328                                 "fmin v19.8h, v19.8h, v0.8h\n"
1329                                 "fadd v20.8h, v20.8h, v3.8h\n"
1330                                 "str q17, [%[outptr1], #0x10]\n"
1331                                 "fadd v13.8h, v13.8h, v4.8h\n"
1332                                 "fmax v18.8h, v18.8h, v1.8h\n"
1333                                 "fmax v19.8h, v19.8h, v1.8h\n"
1334                                 "fmin v20.8h, v20.8h, v0.8h\n"
1335                                 "fmin v13.8h, v13.8h, v0.8h\n"
1336                                 "str q18, [%[outptr1], #0x20]\n"
1337                                 "add %[outptr1], %[outptr1], #0x30\n"
1338                                 "fmax v20.8h, v20.8h, v1.8h\n"
1339                                 "str q19, [%[outptr2]]\n"
1340                                 "fmax v13.8h, v13.8h, v1.8h\n"
1341                                 "str q20, [%[outptr2], #0x10]\n"
1342                                 "str q13, [%[outptr2], #0x20]\n"
1343                                 "add %[outptr2], %[outptr2], #0x30\n"
1344                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1345                               [inptr] "+r" (inptr)
1346                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1347                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1348                             );
1349                         }
1350                     }
1351                     break;
1352 
1353                 case 4:
1354                     {
1355                         if ((i+23) >= xmax)
1356                         {
1357                             for (int xi=0; xi<23; xi++)
1358                             {
1359                                 if ((i+xi) < xmax)
1360                                 {
1361                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1362                                     outptr0++;
1363                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1364                                     outptr1++;
1365                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1366                                     outptr2++;
1367                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1368                                     outptr3++;
1369                                 }
1370                             }
1371                             inptr += 192;
1372                         } else {
1373                             /* Optimized routine: add bias to an entire 24-column block of four rows, clamp to [minval, maxval] and store it */
1374                             __asm __volatile (
1375 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1376                                 ".arch  armv8.2-a+fp16\n"
1377 #endif
1378                                 "dup v0.8h, %[maxval].h[0]\n"
1379                                 "ldr q2, [%[biasptr]]\n"
1380                                 "dup v1.8h, %[minval].h[0]\n"
1381                                 "ldr q3, [%[biasptr], #0x10]\n"
1382                                 "ldr q4, [%[biasptr], #0x20]\n"
1383                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1384                                 "ldr q13, [%[inptr]]\n"
1385                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1386                                 "ldr q14, [%[inptr], #0x10]\n"
1387                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1388                                 "fadd v13.8h, v13.8h, v2.8h\n"
1389                                 "ldr q15, [%[inptr], #0x20]\n"
1390                                 "ldr q16, [%[inptr], #0x30]\n"
1391                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1392                                 "fadd v14.8h, v14.8h, v3.8h\n"
1393                                 "ldr q17, [%[inptr], #0x40]\n"
1394                                 "fmin v13.8h, v13.8h, v0.8h\n"
1395                                 "ldr q18, [%[inptr], #0x50]\n"
1396                                 "fadd v15.8h, v15.8h, v4.8h\n"
1397                                 "ldr q19, [%[inptr], #0x60]\n"
1398                                 "fadd v16.8h, v16.8h, v2.8h\n"
1399                                 "ldr q20, [%[inptr], #0x70]\n"
1400                                 "fmin v14.8h, v14.8h, v0.8h\n"
1401                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1402                                 "fmax v13.8h, v13.8h, v1.8h\n"
1403                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1404                                 "fmax v14.8h, v14.8h, v1.8h\n"
1405                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1406                                 "fmin v15.8h, v15.8h, v0.8h\n"
1407                                 "str q13, [%[outptr0]]\n"
1408                                 "fmin v16.8h, v16.8h, v0.8h\n"
1409                                 "ldr q13, [%[inptr], #0x80]\n"
1410                                 "fadd v17.8h, v17.8h, v3.8h\n"
1411                                 "fadd v18.8h, v18.8h, v4.8h\n"
1412                                 "str q14, [%[outptr0], #0x10]\n"
1413                                 "fmax v15.8h, v15.8h, v1.8h\n"
1414                                 "ldr q14, [%[inptr], #0x90]\n"
1415                                 "fmax v16.8h, v16.8h, v1.8h\n"
1416                                 "fmin v17.8h, v17.8h, v0.8h\n"
1417                                 "fmin v18.8h, v18.8h, v0.8h\n"
1418                                 "str q15, [%[outptr0], #0x20]\n"
1419                                 "fadd v19.8h, v19.8h, v2.8h\n"
1420                                 "ldr q15, [%[inptr], #0xa0]\n"
1421                                 "fadd v20.8h, v20.8h, v3.8h\n"
1422                                 "add %[outptr0], %[outptr0], #0x30\n"
1423                                 "fmax v17.8h, v17.8h, v1.8h\n"
1424                                 "str q16, [%[outptr1]]\n"
1425                                 "fmax v18.8h, v18.8h, v1.8h\n"
1426                                 "ldr q16, [%[inptr], #0xb0]\n"
1427                                 "fmin v19.8h, v19.8h, v0.8h\n"
1428                                 "add %[inptr], %[inptr], #0x180\n"
1429                                 "fmin v20.8h, v20.8h, v0.8h\n"
1430                                 "str q17, [%[outptr1], #0x10]\n"
1431                                 "fadd v13.8h, v13.8h, v4.8h\n"
1432                                 "fmax v19.8h, v19.8h, v1.8h\n"
1433                                 "fadd v14.8h, v14.8h, v2.8h\n"
1434                                 "str q18, [%[outptr1], #0x20]\n"
1435                                 "fmax v20.8h, v20.8h, v1.8h\n"
1436                                 "add %[outptr1], %[outptr1], #0x30\n"
1437                                 "fmin v13.8h, v13.8h, v0.8h\n"
1438                                 "str q19, [%[outptr2]]\n"
1439                                 "fmin v14.8h, v14.8h, v0.8h\n"
1440                                 "fadd v15.8h, v15.8h, v3.8h\n"
1441                                 "fadd v16.8h, v16.8h, v4.8h\n"
1442                                 "str q20, [%[outptr2], #0x10]\n"
1443                                 "fmax v13.8h, v13.8h, v1.8h\n"
1444                                 "fmax v14.8h, v14.8h, v1.8h\n"
1445                                 "fmin v15.8h, v15.8h, v0.8h\n"
1446                                 "fmin v16.8h, v16.8h, v0.8h\n"
1447                                 "str q13, [%[outptr2], #0x20]\n"
1448                                 "add %[outptr2], %[outptr2], #0x30\n"
1449                                 "fmax v15.8h, v15.8h, v1.8h\n"
1450                                 "str q14, [%[outptr3]]\n"
1451                                 "fmax v16.8h, v16.8h, v1.8h\n"
1452                                 "str q15, [%[outptr3], #0x10]\n"
1453                                 "str q16, [%[outptr3], #0x20]\n"
1454                                 "add %[outptr3], %[outptr3], #0x30\n"
1455                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1456                               [inptr] "+r" (inptr)
1457                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1458                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1459                             );
1460                         }
1461                     }
1462                     break;
1463 
1464                 case 5:
1465                     {
1466                         if ((i+23) >= xmax)
1467                         {
1468                             for (int xi=0; xi<23; xi++)
1469                             {
1470                                 if ((i+xi) < xmax)
1471                                 {
1472                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1473                                     outptr0++;
1474                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1475                                     outptr1++;
1476                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1477                                     outptr2++;
1478                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1479                                     outptr3++;
1480                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1481                                     outptr4++;
1482                                 }
1483                             }
1484                             inptr += 192;
1485                         } else {
1486                             /* Optimized routine for a full 24-column block: add bias, clamp to [minval, maxval] and store 5 rows */
1487                             __asm __volatile (
1488 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1489                                 ".arch  armv8.2-a+fp16\n"
1490 #endif
1491                                 "dup v0.8h, %[maxval].h[0]\n"
1492                                 "ldr q2, [%[biasptr]]\n"
1493                                 "dup v1.8h, %[minval].h[0]\n"
1494                                 "ldr q3, [%[biasptr], #0x10]\n"
1495                                 "ldr q4, [%[biasptr], #0x20]\n"
1496                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1497                                 "ldr q13, [%[inptr]]\n"
1498                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1499                                 "ldr q14, [%[inptr], #0x10]\n"
1500                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1501                                 "fadd v13.8h, v13.8h, v2.8h\n"
1502                                 "ldr q15, [%[inptr], #0x20]\n"
1503                                 "ldr q16, [%[inptr], #0x30]\n"
1504                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1505                                 "fadd v14.8h, v14.8h, v3.8h\n"
1506                                 "ldr q17, [%[inptr], #0x40]\n"
1507                                 "fmin v13.8h, v13.8h, v0.8h\n"
1508                                 "ldr q18, [%[inptr], #0x50]\n"
1509                                 "fadd v15.8h, v15.8h, v4.8h\n"
1510                                 "ldr q19, [%[inptr], #0x60]\n"
1511                                 "fadd v16.8h, v16.8h, v2.8h\n"
1512                                 "ldr q20, [%[inptr], #0x70]\n"
1513                                 "fmin v14.8h, v14.8h, v0.8h\n"
1514                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1515                                 "fmax v13.8h, v13.8h, v1.8h\n"
1516                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1517                                 "fmax v14.8h, v14.8h, v1.8h\n"
1518                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1519                                 "fmin v15.8h, v15.8h, v0.8h\n"
1520                                 "str q13, [%[outptr0]]\n"
1521                                 "fmin v16.8h, v16.8h, v0.8h\n"
1522                                 "ldr q13, [%[inptr], #0x80]\n"
1523                                 "fadd v17.8h, v17.8h, v3.8h\n"
1524                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1525                                 "fmax v15.8h, v15.8h, v1.8h\n"
1526                                 "str q14, [%[outptr0], #0x10]\n"
1527                                 "fmax v16.8h, v16.8h, v1.8h\n"
1528                                 "ldr q14, [%[inptr], #0x90]\n"
1529                                 "fmin v17.8h, v17.8h, v0.8h\n"
1530                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1531                                 "fadd v18.8h, v18.8h, v4.8h\n"
1532                                 "str q15, [%[outptr0], #0x20]\n"
1533                                 "fadd v19.8h, v19.8h, v2.8h\n"
1534                                 "ldr q15, [%[inptr], #0xa0]\n"
1535                                 "fmax v17.8h, v17.8h, v1.8h\n"
1536                                 "add %[outptr0], %[outptr0], #0x30\n"
1537                                 "fmin v18.8h, v18.8h, v0.8h\n"
1538                                 "str q16, [%[outptr1]]\n"
1539                                 "fmin v19.8h, v19.8h, v0.8h\n"
1540                                 "ldr q16, [%[inptr], #0xb0]\n"
1541                                 "fadd v20.8h, v20.8h, v3.8h\n"
1542                                 "fadd v13.8h, v13.8h, v4.8h\n"
1543                                 "str q17, [%[outptr1], #0x10]\n"
1544                                 "fmax v18.8h, v18.8h, v1.8h\n"
1545                                 "ldr q17, [%[inptr], #0xc0]\n"
1546                                 "fmax v19.8h, v19.8h, v1.8h\n"
1547                                 "fmin v20.8h, v20.8h, v0.8h\n"
1548                                 "fmin v13.8h, v13.8h, v0.8h\n"
1549                                 "str q18, [%[outptr1], #0x20]\n"
1550                                 "fadd v14.8h, v14.8h, v2.8h\n"
1551                                 "ldr q18, [%[inptr], #0xd0]\n"
1552                                 "fadd v15.8h, v15.8h, v3.8h\n"
1553                                 "add %[outptr1], %[outptr1], #0x30\n"
1554                                 "fmax v20.8h, v20.8h, v1.8h\n"
1555                                 "str q19, [%[outptr2]]\n"
1556                                 "fmax v13.8h, v13.8h, v1.8h\n"
1557                                 "ldr q19, [%[inptr], #0xe0]\n"
1558                                 "fmin v14.8h, v14.8h, v0.8h\n"
1559                                 "add %[inptr], %[inptr], #0x180\n"
1560                                 "fmin v15.8h, v15.8h, v0.8h\n"
1561                                 "str q20, [%[outptr2], #0x10]\n"
1562                                 "fadd v16.8h, v16.8h, v4.8h\n"
1563                                 "fmax v14.8h, v14.8h, v1.8h\n"
1564                                 "fadd v17.8h, v17.8h, v2.8h\n"
1565                                 "str q13, [%[outptr2], #0x20]\n"
1566                                 "fmax v15.8h, v15.8h, v1.8h\n"
1567                                 "add %[outptr2], %[outptr2], #0x30\n"
1568                                 "fmin v16.8h, v16.8h, v0.8h\n"
1569                                 "str q14, [%[outptr3]]\n"
1570                                 "fmin v17.8h, v17.8h, v0.8h\n"
1571                                 "fadd v18.8h, v18.8h, v3.8h\n"
1572                                 "fadd v19.8h, v19.8h, v4.8h\n"
1573                                 "str q15, [%[outptr3], #0x10]\n"
1574                                 "fmax v16.8h, v16.8h, v1.8h\n"
1575                                 "fmax v17.8h, v17.8h, v1.8h\n"
1576                                 "fmin v18.8h, v18.8h, v0.8h\n"
1577                                 "fmin v19.8h, v19.8h, v0.8h\n"
1578                                 "str q16, [%[outptr3], #0x20]\n"
1579                                 "add %[outptr3], %[outptr3], #0x30\n"
1580                                 "fmax v18.8h, v18.8h, v1.8h\n"
1581                                 "str q17, [%[outptr4]]\n"
1582                                 "fmax v19.8h, v19.8h, v1.8h\n"
1583                                 "str q18, [%[outptr4], #0x10]\n"
1584                                 "str q19, [%[outptr4], #0x20]\n"
1585                                 "add %[outptr4], %[outptr4], #0x30\n"
1586                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1587                               [inptr] "+r" (inptr)
1588                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1589                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1590                             );
1591                         }
1592                     }
1593                     break;
1594 
1595                 case 6:
1596                     {
1597                         if ((i+23) >= xmax)
1598                         {
1599                             for (int xi=0; xi<23; xi++)
1600                             {
1601                                 if ((i+xi) < xmax)
1602                                 {
1603                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1604                                     outptr0++;
1605                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1606                                     outptr1++;
1607                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1608                                     outptr2++;
1609                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1610                                     outptr3++;
1611                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1612                                     outptr4++;
1613                                     *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + biasptr[xi])), maxval);
1614                                     outptr5++;
1615                                 }
1616                             }
1617                             inptr += 192;
1618                         } else {
1619                             /* Optimized routine for a full 24-column block: add bias, clamp to [minval, maxval] and store 6 rows */
1620                             __asm __volatile (
1621 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1622                                 ".arch  armv8.2-a+fp16\n"
1623 #endif
1624                                 "dup v0.8h, %[maxval].h[0]\n"
1625                                 "ldr q2, [%[biasptr]]\n"
1626                                 "dup v1.8h, %[minval].h[0]\n"
1627                                 "ldr q3, [%[biasptr], #0x10]\n"
1628                                 "ldr q4, [%[biasptr], #0x20]\n"
1629                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1630                                 "ldr q13, [%[inptr]]\n"
1631                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1632                                 "ldr q14, [%[inptr], #0x10]\n"
1633                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1634                                 "fadd v13.8h, v13.8h, v2.8h\n"
1635                                 "ldr q15, [%[inptr], #0x20]\n"
1636                                 "ldr q16, [%[inptr], #0x30]\n"
1637                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1638                                 "fadd v14.8h, v14.8h, v3.8h\n"
1639                                 "ldr q17, [%[inptr], #0x40]\n"
1640                                 "fmin v13.8h, v13.8h, v0.8h\n"
1641                                 "ldr q18, [%[inptr], #0x50]\n"
1642                                 "fadd v15.8h, v15.8h, v4.8h\n"
1643                                 "ldr q19, [%[inptr], #0x60]\n"
1644                                 "fadd v16.8h, v16.8h, v2.8h\n"
1645                                 "ldr q20, [%[inptr], #0x70]\n"
1646                                 "fmin v14.8h, v14.8h, v0.8h\n"
1647                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1648                                 "fmax v13.8h, v13.8h, v1.8h\n"
1649                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1650                                 "fmax v14.8h, v14.8h, v1.8h\n"
1651                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1652                                 "fmin v15.8h, v15.8h, v0.8h\n"
1653                                 "str q13, [%[outptr0]]\n"
1654                                 "fmin v16.8h, v16.8h, v0.8h\n"
1655                                 "ldr q13, [%[inptr], #0x80]\n"
1656                                 "fadd v17.8h, v17.8h, v3.8h\n"
1657                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1658                                 "fmax v15.8h, v15.8h, v1.8h\n"
1659                                 "str q14, [%[outptr0], #0x10]\n"
1660                                 "fmax v16.8h, v16.8h, v1.8h\n"
1661                                 "ldr q14, [%[inptr], #0x90]\n"
1662                                 "fmin v17.8h, v17.8h, v0.8h\n"
1663                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1664                                 "fadd v18.8h, v18.8h, v4.8h\n"
1665                                 "str q15, [%[outptr0], #0x20]\n"
1666                                 "fadd v19.8h, v19.8h, v2.8h\n"
1667                                 "ldr q15, [%[inptr], #0xa0]\n"
1668                                 "fmax v17.8h, v17.8h, v1.8h\n"
1669                                 "add %[outptr0], %[outptr0], #0x30\n"
1670                                 "fmin v18.8h, v18.8h, v0.8h\n"
1671                                 "str q16, [%[outptr1]]\n"
1672                                 "fmin v19.8h, v19.8h, v0.8h\n"
1673                                 "ldr q16, [%[inptr], #0xb0]\n"
1674                                 "fadd v20.8h, v20.8h, v3.8h\n"
1675                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1676                                 "fmax v18.8h, v18.8h, v1.8h\n"
1677                                 "str q17, [%[outptr1], #0x10]\n"
1678                                 "fmax v19.8h, v19.8h, v1.8h\n"
1679                                 "ldr q17, [%[inptr], #0xc0]\n"
1680                                 "fmin v20.8h, v20.8h, v0.8h\n"
1681                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1682                                 "fadd v13.8h, v13.8h, v4.8h\n"
1683                                 "str q18, [%[outptr1], #0x20]\n"
1684                                 "fadd v14.8h, v14.8h, v2.8h\n"
1685                                 "ldr q18, [%[inptr], #0xd0]\n"
1686                                 "fmax v20.8h, v20.8h, v1.8h\n"
1687                                 "add %[outptr1], %[outptr1], #0x30\n"
1688                                 "fmin v13.8h, v13.8h, v0.8h\n"
1689                                 "str q19, [%[outptr2]]\n"
1690                                 "fmin v14.8h, v14.8h, v0.8h\n"
1691                                 "ldr q19, [%[inptr], #0xe0]\n"
1692                                 "fadd v15.8h, v15.8h, v3.8h\n"
1693                                 "fadd v16.8h, v16.8h, v4.8h\n"
1694                                 "str q20, [%[outptr2], #0x10]\n"
1695                                 "fmax v13.8h, v13.8h, v1.8h\n"
1696                                 "ldr q20, [%[inptr], #0xf0]\n"
1697                                 "fmax v14.8h, v14.8h, v1.8h\n"
1698                                 "fmin v15.8h, v15.8h, v0.8h\n"
1699                                 "fmin v16.8h, v16.8h, v0.8h\n"
1700                                 "str q13, [%[outptr2], #0x20]\n"
1701                                 "fadd v17.8h, v17.8h, v2.8h\n"
1702                                 "ldr q13, [%[inptr], #0x100]\n"
1703                                 "fadd v18.8h, v18.8h, v3.8h\n"
1704                                 "add %[outptr2], %[outptr2], #0x30\n"
1705                                 "fmax v15.8h, v15.8h, v1.8h\n"
1706                                 "str q14, [%[outptr3]]\n"
1707                                 "fmax v16.8h, v16.8h, v1.8h\n"
1708                                 "ldr q14, [%[inptr], #0x110]\n"
1709                                 "fmin v17.8h, v17.8h, v0.8h\n"
1710                                 "add %[inptr], %[inptr], #0x180\n"
1711                                 "fmin v18.8h, v18.8h, v0.8h\n"
1712                                 "str q15, [%[outptr3], #0x10]\n"
1713                                 "fadd v19.8h, v19.8h, v4.8h\n"
1714                                 "fmax v17.8h, v17.8h, v1.8h\n"
1715                                 "fadd v20.8h, v20.8h, v2.8h\n"
1716                                 "str q16, [%[outptr3], #0x20]\n"
1717                                 "fmax v18.8h, v18.8h, v1.8h\n"
1718                                 "add %[outptr3], %[outptr3], #0x30\n"
1719                                 "fmin v19.8h, v19.8h, v0.8h\n"
1720                                 "str q17, [%[outptr4]]\n"
1721                                 "fmin v20.8h, v20.8h, v0.8h\n"
1722                                 "fadd v13.8h, v13.8h, v3.8h\n"
1723                                 "fadd v14.8h, v14.8h, v4.8h\n"
1724                                 "str q18, [%[outptr4], #0x10]\n"
1725                                 "fmax v19.8h, v19.8h, v1.8h\n"
1726                                 "fmax v20.8h, v20.8h, v1.8h\n"
1727                                 "fmin v13.8h, v13.8h, v0.8h\n"
1728                                 "fmin v14.8h, v14.8h, v0.8h\n"
1729                                 "str q19, [%[outptr4], #0x20]\n"
1730                                 "add %[outptr4], %[outptr4], #0x30\n"
1731                                 "fmax v13.8h, v13.8h, v1.8h\n"
1732                                 "str q20, [%[outptr5]]\n"
1733                                 "fmax v14.8h, v14.8h, v1.8h\n"
1734                                 "str q13, [%[outptr5], #0x10]\n"
1735                                 "str q14, [%[outptr5], #0x20]\n"
1736                                 "add %[outptr5], %[outptr5], #0x30\n"
1737                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1738                               [inptr] "+r" (inptr)
1739                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1740                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1741                             );
1742                         }
1743                     }
1744                     break;
1745 
1746                 case 7:
1747                     {
1748                         if ((i+23) >= xmax)
1749                         {
1750                             for (int xi=0; xi<23; xi++)
1751                             {
1752                                 if ((i+xi) < xmax)
1753                                 {
1754                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1755                                     outptr0++;
1756                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1757                                     outptr1++;
1758                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1759                                     outptr2++;
1760                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1761                                     outptr3++;
1762                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1763                                     outptr4++;
1764                                     *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + biasptr[xi])), maxval);
1765                                     outptr5++;
1766                                     *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + biasptr[xi])), maxval);
1767                                     outptr6++;
1768                                 }
1769                             }
1770                             inptr += 192;
1771                         } else {
1772                             /* Optimized routine for a full 24-column block: add bias, clamp to [minval, maxval] and store 7 rows */
1773                             __asm __volatile (
1774 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1775                                 ".arch  armv8.2-a+fp16\n"
1776 #endif
1777                                 "dup v0.8h, %[maxval].h[0]\n"
1778                                 "ldr q2, [%[biasptr]]\n"
1779                                 "dup v1.8h, %[minval].h[0]\n"
1780                                 "ldr q3, [%[biasptr], #0x10]\n"
1781                                 "ldr q4, [%[biasptr], #0x20]\n"
1782                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1783                                 "ldr q13, [%[inptr]]\n"
1784                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1785                                 "ldr q14, [%[inptr], #0x10]\n"
1786                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1787                                 "fadd v13.8h, v13.8h, v2.8h\n"
1788                                 "ldr q15, [%[inptr], #0x20]\n"
1789                                 "ldr q16, [%[inptr], #0x30]\n"
1790                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1791                                 "fadd v14.8h, v14.8h, v3.8h\n"
1792                                 "ldr q17, [%[inptr], #0x40]\n"
1793                                 "fmin v13.8h, v13.8h, v0.8h\n"
1794                                 "ldr q18, [%[inptr], #0x50]\n"
1795                                 "fadd v15.8h, v15.8h, v4.8h\n"
1796                                 "ldr q19, [%[inptr], #0x60]\n"
1797                                 "fadd v16.8h, v16.8h, v2.8h\n"
1798                                 "ldr q20, [%[inptr], #0x70]\n"
1799                                 "fmin v14.8h, v14.8h, v0.8h\n"
1800                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1801                                 "fmax v13.8h, v13.8h, v1.8h\n"
1802                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1803                                 "fmax v14.8h, v14.8h, v1.8h\n"
1804                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1805                                 "fmin v15.8h, v15.8h, v0.8h\n"
1806                                 "str q13, [%[outptr0]]\n"
1807                                 "fmin v16.8h, v16.8h, v0.8h\n"
1808                                 "ldr q13, [%[inptr], #0x80]\n"
1809                                 "fadd v17.8h, v17.8h, v3.8h\n"
1810                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1811                                 "fmax v15.8h, v15.8h, v1.8h\n"
1812                                 "str q14, [%[outptr0], #0x10]\n"
1813                                 "fmax v16.8h, v16.8h, v1.8h\n"
1814                                 "ldr q14, [%[inptr], #0x90]\n"
1815                                 "fmin v17.8h, v17.8h, v0.8h\n"
1816                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1817                                 "fadd v18.8h, v18.8h, v4.8h\n"
1818                                 "str q15, [%[outptr0], #0x20]\n"
1819                                 "fadd v19.8h, v19.8h, v2.8h\n"
1820                                 "ldr q15, [%[inptr], #0xa0]\n"
1821                                 "fmax v17.8h, v17.8h, v1.8h\n"
1822                                 "add %[outptr0], %[outptr0], #0x30\n"
1823                                 "fmin v18.8h, v18.8h, v0.8h\n"
1824                                 "str q16, [%[outptr1]]\n"
1825                                 "fmin v19.8h, v19.8h, v0.8h\n"
1826                                 "ldr q16, [%[inptr], #0xb0]\n"
1827                                 "fadd v20.8h, v20.8h, v3.8h\n"
1828                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1829                                 "fmax v18.8h, v18.8h, v1.8h\n"
1830                                 "str q17, [%[outptr1], #0x10]\n"
1831                                 "fmax v19.8h, v19.8h, v1.8h\n"
1832                                 "ldr q17, [%[inptr], #0xc0]\n"
1833                                 "fmin v20.8h, v20.8h, v0.8h\n"
1834                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1835                                 "fadd v13.8h, v13.8h, v4.8h\n"
1836                                 "str q18, [%[outptr1], #0x20]\n"
1837                                 "fadd v14.8h, v14.8h, v2.8h\n"
1838                                 "ldr q18, [%[inptr], #0xd0]\n"
1839                                 "fmax v20.8h, v20.8h, v1.8h\n"
1840                                 "add %[outptr1], %[outptr1], #0x30\n"
1841                                 "fmin v13.8h, v13.8h, v0.8h\n"
1842                                 "str q19, [%[outptr2]]\n"
1843                                 "fmin v14.8h, v14.8h, v0.8h\n"
1844                                 "ldr q19, [%[inptr], #0xe0]\n"
1845                                 "fadd v15.8h, v15.8h, v3.8h\n"
1846                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1847                                 "fmax v13.8h, v13.8h, v1.8h\n"
1848                                 "str q20, [%[outptr2], #0x10]\n"
1849                                 "fmax v14.8h, v14.8h, v1.8h\n"
1850                                 "ldr q20, [%[inptr], #0xf0]\n"
1851                                 "fmin v15.8h, v15.8h, v0.8h\n"
1852                                 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1853                                 "fadd v16.8h, v16.8h, v4.8h\n"
1854                                 "str q13, [%[outptr2], #0x20]\n"
1855                                 "fadd v17.8h, v17.8h, v2.8h\n"
1856                                 "ldr q13, [%[inptr], #0x100]\n"
1857                                 "fmax v15.8h, v15.8h, v1.8h\n"
1858                                 "add %[outptr2], %[outptr2], #0x30\n"
1859                                 "fmin v16.8h, v16.8h, v0.8h\n"
1860                                 "str q14, [%[outptr3]]\n"
1861                                 "fmin v17.8h, v17.8h, v0.8h\n"
1862                                 "ldr q14, [%[inptr], #0x110]\n"
1863                                 "fadd v18.8h, v18.8h, v3.8h\n"
1864                                 "fadd v19.8h, v19.8h, v4.8h\n"
1865                                 "str q15, [%[outptr3], #0x10]\n"
1866                                 "fmax v16.8h, v16.8h, v1.8h\n"
1867                                 "ldr q15, [%[inptr], #0x120]\n"
1868                                 "fmax v17.8h, v17.8h, v1.8h\n"
1869                                 "fmin v18.8h, v18.8h, v0.8h\n"
1870                                 "fmin v19.8h, v19.8h, v0.8h\n"
1871                                 "str q16, [%[outptr3], #0x20]\n"
1872                                 "fadd v20.8h, v20.8h, v2.8h\n"
1873                                 "ldr q16, [%[inptr], #0x130]\n"
1874                                 "fadd v13.8h, v13.8h, v3.8h\n"
1875                                 "add %[outptr3], %[outptr3], #0x30\n"
1876                                 "fmax v18.8h, v18.8h, v1.8h\n"
1877                                 "str q17, [%[outptr4]]\n"
1878                                 "fmax v19.8h, v19.8h, v1.8h\n"
1879                                 "ldr q17, [%[inptr], #0x140]\n"
1880                                 "fmin v20.8h, v20.8h, v0.8h\n"
1881                                 "add %[inptr], %[inptr], #0x180\n"
1882                                 "fmin v13.8h, v13.8h, v0.8h\n"
1883                                 "str q18, [%[outptr4], #0x10]\n"
1884                                 "fadd v14.8h, v14.8h, v4.8h\n"
1885                                 "fmax v20.8h, v20.8h, v1.8h\n"
1886                                 "fadd v15.8h, v15.8h, v2.8h\n"
1887                                 "str q19, [%[outptr4], #0x20]\n"
1888                                 "fmax v13.8h, v13.8h, v1.8h\n"
1889                                 "add %[outptr4], %[outptr4], #0x30\n"
1890                                 "fmin v14.8h, v14.8h, v0.8h\n"
1891                                 "str q20, [%[outptr5]]\n"
1892                                 "fmin v15.8h, v15.8h, v0.8h\n"
1893                                 "fadd v16.8h, v16.8h, v3.8h\n"
1894                                 "fadd v17.8h, v17.8h, v4.8h\n"
1895                                 "str q13, [%[outptr5], #0x10]\n"
1896                                 "fmax v14.8h, v14.8h, v1.8h\n"
1897                                 "fmax v15.8h, v15.8h, v1.8h\n"
1898                                 "fmin v16.8h, v16.8h, v0.8h\n"
1899                                 "fmin v17.8h, v17.8h, v0.8h\n"
1900                                 "str q14, [%[outptr5], #0x20]\n"
1901                                 "add %[outptr5], %[outptr5], #0x30\n"
1902                                 "fmax v16.8h, v16.8h, v1.8h\n"
1903                                 "str q15, [%[outptr6]]\n"
1904                                 "fmax v17.8h, v17.8h, v1.8h\n"
1905                                 "str q16, [%[outptr6], #0x10]\n"
1906                                 "str q17, [%[outptr6], #0x20]\n"
1907                                 "add %[outptr6], %[outptr6], #0x30\n"
1908                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1909                               [inptr] "+r" (inptr)
1910                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1911                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1912                             );
1913                         }
1914                     }
1915                     break;
1916 
1917                 default:
1918                 case 8:
1919                     {
1920                         if ((i+23) >= xmax)
1921                         {
1922                             for (int xi=0; xi<23; xi++)
1923                             {
1924                                 if ((i+xi) < xmax)
1925                                 {
1926                                     *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1927                                     outptr0++;
1928                                     *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1929                                     outptr1++;
1930                                     *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1931                                     outptr2++;
1932                                     *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1933                                     outptr3++;
1934                                     *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1935                                     outptr4++;
1936                                     *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + biasptr[xi])), maxval);
1937                                     outptr5++;
1938                                     *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + biasptr[xi])), maxval);
1939                                     outptr6++;
1940                                     *outptr7 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 168] + biasptr[xi])), maxval);
1941                                     outptr7++;
1942                                 }
1943                             }
1944                             inptr += 192;
1945                         } else {
1946                             /* Optimized routine: add bias, clamp to [minval, maxval] and store a full 8-row x 24-column block */
1947                             __asm __volatile (
1948 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1949                                 ".arch  armv8.2-a+fp16\n"
1950 #endif
1951                                 "dup v0.8h, %[maxval].h[0]\n"
1952                                 "ldr q2, [%[biasptr]]\n"
1953                                 "dup v1.8h, %[minval].h[0]\n"
1954                                 "ldr q3, [%[biasptr], #0x10]\n"
1955                                 "ldr q4, [%[biasptr], #0x20]\n"
1956                                 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1957                                 "ldr q13, [%[inptr]]\n"
1958                                 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1959                                 "ldr q14, [%[inptr], #0x10]\n"
1960                                 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1961                                 "fadd v13.8h, v13.8h, v2.8h\n"
1962                                 "ldr q15, [%[inptr], #0x20]\n"
1963                                 "ldr q16, [%[inptr], #0x30]\n"
1964                                 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1965                                 "fadd v14.8h, v14.8h, v3.8h\n"
1966                                 "ldr q17, [%[inptr], #0x40]\n"
1967                                 "fmin v13.8h, v13.8h, v0.8h\n"
1968                                 "ldr q18, [%[inptr], #0x50]\n"
1969                                 "fadd v15.8h, v15.8h, v4.8h\n"
1970                                 "ldr q19, [%[inptr], #0x60]\n"
1971                                 "fadd v16.8h, v16.8h, v2.8h\n"
1972                                 "ldr q20, [%[inptr], #0x70]\n"
1973                                 "fmin v14.8h, v14.8h, v0.8h\n"
1974                                 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1975                                 "fmax v13.8h, v13.8h, v1.8h\n"
1976                                 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1977                                 "fmax v14.8h, v14.8h, v1.8h\n"
1978                                 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1979                                 "fmin v15.8h, v15.8h, v0.8h\n"
1980                                 "str q13, [%[outptr0]]\n"
1981                                 "fmin v16.8h, v16.8h, v0.8h\n"
1982                                 "ldr q13, [%[inptr], #0x80]\n"
1983                                 "fadd v17.8h, v17.8h, v3.8h\n"
1984                                 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1985                                 "fmax v15.8h, v15.8h, v1.8h\n"
1986                                 "str q14, [%[outptr0], #0x10]\n"
1987                                 "fmax v16.8h, v16.8h, v1.8h\n"
1988                                 "ldr q14, [%[inptr], #0x90]\n"
1989                                 "fmin v17.8h, v17.8h, v0.8h\n"
1990                                 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1991                                 "fadd v18.8h, v18.8h, v4.8h\n"
1992                                 "str q15, [%[outptr0], #0x20]\n"
1993                                 "fadd v19.8h, v19.8h, v2.8h\n"
1994                                 "ldr q15, [%[inptr], #0xa0]\n"
1995                                 "fmax v17.8h, v17.8h, v1.8h\n"
1996                                 "add %[outptr0], %[outptr0], #0x30\n"
1997                                 "fmin v18.8h, v18.8h, v0.8h\n"
1998                                 "str q16, [%[outptr1]]\n"
1999                                 "fmin v19.8h, v19.8h, v0.8h\n"
2000                                 "ldr q16, [%[inptr], #0xb0]\n"
2001                                 "fadd v20.8h, v20.8h, v3.8h\n"
2002                                 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
2003                                 "fmax v18.8h, v18.8h, v1.8h\n"
2004                                 "str q17, [%[outptr1], #0x10]\n"
2005                                 "fmax v19.8h, v19.8h, v1.8h\n"
2006                                 "ldr q17, [%[inptr], #0xc0]\n"
2007                                 "fmin v20.8h, v20.8h, v0.8h\n"
2008                                 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
2009                                 "fadd v13.8h, v13.8h, v4.8h\n"
2010                                 "str q18, [%[outptr1], #0x20]\n"
2011                                 "fadd v14.8h, v14.8h, v2.8h\n"
2012                                 "ldr q18, [%[inptr], #0xd0]\n"
2013                                 "fmax v20.8h, v20.8h, v1.8h\n"
2014                                 "add %[outptr1], %[outptr1], #0x30\n"
2015                                 "fmin v13.8h, v13.8h, v0.8h\n"
2016                                 "str q19, [%[outptr2]]\n"
2017                                 "fmin v14.8h, v14.8h, v0.8h\n"
2018                                 "ldr q19, [%[inptr], #0xe0]\n"
2019                                 "fadd v15.8h, v15.8h, v3.8h\n"
2020                                 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
2021                                 "fmax v13.8h, v13.8h, v1.8h\n"
2022                                 "str q20, [%[outptr2], #0x10]\n"
2023                                 "fmax v14.8h, v14.8h, v1.8h\n"
2024                                 "ldr q20, [%[inptr], #0xf0]\n"
2025                                 "fmin v15.8h, v15.8h, v0.8h\n"
2026                                 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
2027                                 "fadd v16.8h, v16.8h, v4.8h\n"
2028                                 "str q13, [%[outptr2], #0x20]\n"
2029                                 "fadd v17.8h, v17.8h, v2.8h\n"
2030                                 "ldr q13, [%[inptr], #0x100]\n"
2031                                 "fmax v15.8h, v15.8h, v1.8h\n"
2032                                 "add %[outptr2], %[outptr2], #0x30\n"
2033                                 "fmin v16.8h, v16.8h, v0.8h\n"
2034                                 "str q14, [%[outptr3]]\n"
2035                                 "fmin v17.8h, v17.8h, v0.8h\n"
2036                                 "ldr q14, [%[inptr], #0x110]\n"
2037                                 "fadd v18.8h, v18.8h, v3.8h\n"
2038                                 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
2039                                 "fmax v16.8h, v16.8h, v1.8h\n"
2040                                 "str q15, [%[outptr3], #0x10]\n"
2041                                 "fmax v17.8h, v17.8h, v1.8h\n"
2042                                 "ldr q15, [%[inptr], #0x120]\n"
2043                                 "fmin v18.8h, v18.8h, v0.8h\n"
2044                                 "fadd v19.8h, v19.8h, v4.8h\n"
2045                                 "str q16, [%[outptr3], #0x20]\n"
2046                                 "fadd v20.8h, v20.8h, v2.8h\n"
2047                                 "ldr q16, [%[inptr], #0x130]\n"
2048                                 "fadd v13.8h, v13.8h, v3.8h\n"
2049                                 "add %[outptr3], %[outptr3], #0x30\n"
2050                                 "fmax v18.8h, v18.8h, v1.8h\n"
2051                                 "str q17, [%[outptr4]]\n"
2052                                 "fmin v19.8h, v19.8h, v0.8h\n"
2053                                 "ldr q17, [%[inptr], #0x140]\n"
2054                                 "fmin v20.8h, v20.8h, v0.8h\n"
2055                                 "fmin v13.8h, v13.8h, v0.8h\n"
2056                                 "str q18, [%[outptr4], #0x10]\n"
2057                                 "fadd v14.8h, v14.8h, v4.8h\n"
2058                                 "ldr q18, [%[inptr], #0x150]\n"
2059                                 "fmax v19.8h, v19.8h, v1.8h\n"
2060                                 "fmax v20.8h, v20.8h, v1.8h\n"
2061                                 "fmax v13.8h, v13.8h, v1.8h\n"
2062                                 "fmin v14.8h, v14.8h, v0.8h\n"
2063                                 "str q19, [%[outptr4], #0x20]\n"
2064                                 "fadd v15.8h, v15.8h, v2.8h\n"
2065                                 "ldr q19, [%[inptr], #0x160]\n"
2066                                 "fadd v16.8h, v16.8h, v3.8h\n"
2067                                 "add %[outptr4], %[outptr4], #0x30\n"
2068                                 "fmax v14.8h, v14.8h, v1.8h\n"
2069                                 "str q20, [%[outptr5]]\n"
2070                                 "fmin v15.8h, v15.8h, v0.8h\n"
2071                                 "ldr q20, [%[inptr], #0x170]\n"
2072                                 "fmin v16.8h, v16.8h, v0.8h\n"
2073                                 "add %[inptr], %[inptr], #0x180\n"
2074                                 "fadd v17.8h, v17.8h, v4.8h\n"
2075                                 "str q13, [%[outptr5], #0x10]\n"
2076                                 "fmax v15.8h, v15.8h, v1.8h\n"
2077                                 "fmax v16.8h, v16.8h, v1.8h\n"
2078                                 "fadd v18.8h, v18.8h, v2.8h\n"
2079                                 "str q14, [%[outptr5], #0x20]\n"
2080                                 "fmin v17.8h, v17.8h, v0.8h\n"
2081                                 "add %[outptr5], %[outptr5], #0x30\n"
2082                                 "fadd v19.8h, v19.8h, v3.8h\n"
2083                                 "str q15, [%[outptr6]]\n"
2084                                 "fmin v18.8h, v18.8h, v0.8h\n"
2085                                 "fmax v17.8h, v17.8h, v1.8h\n"
2086                                 "fadd v20.8h, v20.8h, v4.8h\n"
2087                                 "str q16, [%[outptr6], #0x10]\n"
2088                                 "fmin v19.8h, v19.8h, v0.8h\n"
2089                                 "fmax v18.8h, v18.8h, v1.8h\n"
2090                                 "fmin v20.8h, v20.8h, v0.8h\n"
2091                                 "str q17, [%[outptr6], #0x20]\n"
2092                                 "fmax v19.8h, v19.8h, v1.8h\n"
2093                                 "add %[outptr6], %[outptr6], #0x30\n"
2094                                 "fmax v20.8h, v20.8h, v1.8h\n"
2095                                 "str q18, [%[outptr7]]\n"
2096                                 "str q19, [%[outptr7], #0x10]\n"
2097                                 "str q20, [%[outptr7], #0x20]\n"
2098                                 "add %[outptr7], %[outptr7], #0x30\n"
2099                             : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
2100                               [inptr] "+r" (inptr)
2101                             : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
2102                             : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
2103                             );
2104                         }
2105                     }
2106                     break;
2107 
2108 
2109                 }
2110             }
2111         }
2112     }
2113 }
2114 
2115 #endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
2116