1 /*
2 * Copyright (c) 2019-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #pragma once
25
26 #if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
27
28 template<>
MergeResults(__fp16 * out,const __fp16 * in,const int ldout,const int y0,const int ymax,const int x0,const int xmax,const __fp16 * bias,Activation act,bool append)29 void MergeResults<24, 8, false>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const __fp16 *bias, Activation act, bool append)
30 {
31 const __fp16 *inptr = in;
32 __fp16 nullbias[24];
33 __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
34 __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
35
36 switch(act.type)
37 {
38 default:
39 case Activation::Type::None:
40 break;
41 case Activation::Type::BoundedReLU:
42 maxval = static_cast<__fp16>(act.param1);
43 /* fall through */
44 case Activation::Type::ReLU:
45 minval = 0.0f;
46 break;
47 }
48
49 if (!append && !bias)
50 {
51 memset(nullbias, 0, (24 * sizeof(__fp16)));
52 }
53
54 for (int y=y0; y<ymax; y+=8)
55 {
56 __fp16 *outptr0 = out + (y * ldout) + x0;
57 __fp16 *outptr1 = outptr0 + ldout;
58 __fp16 *outptr2 = outptr1 + ldout;
59 __fp16 *outptr3 = outptr2 + ldout;
60 __fp16 *outptr4 = outptr3 + ldout;
61 __fp16 *outptr5 = outptr4 + ldout;
62 __fp16 *outptr6 = outptr5 + ldout;
63 __fp16 *outptr7 = outptr6 + ldout;
64
65 const int height = ymax - y;
66
67 for (int i=x0; i<xmax; i+=24)
68 {
69 if (append)
70 {
71 switch(height)
72 {
73 case 1:
74 {
75 if ((i+23) >= xmax)
76 {
77 for (int xi=0; xi<23; xi++)
78 {
79 if ((i+xi) < xmax)
80 {
81 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
82 outptr0++;
83 }
84 }
85 inptr += 192;
86 } else {
87 /* Optimized routine to copy an entire block */
88 __asm __volatile (
89 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
90 ".arch armv8.2-a+fp16\n"
91 #endif
92 "dup v0.8h, %[maxval].h[0]\n"
93 "ldr q2, [%[outptr0]]\n"
94 "dup v1.8h, %[minval].h[0]\n"
95 "ldr q10, [%[inptr]]\n"
96 "ldr q3, [%[outptr0], #0x10]\n"
97 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
98 "ldr q11, [%[inptr], #0x10]\n"
99 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
100 "fadd v10.8h, v10.8h, v2.8h\n"
101 "ldr q4, [%[outptr0], #0x20]\n"
102 "ldr q12, [%[inptr], #0x20]\n"
103 "add %[inptr], %[inptr], #0x180\n"
104 "fadd v11.8h, v11.8h, v3.8h\n"
105 "fmin v10.8h, v10.8h, v0.8h\n"
106 "fadd v12.8h, v12.8h, v4.8h\n"
107 "fmin v11.8h, v11.8h, v0.8h\n"
108 "fmax v10.8h, v10.8h, v1.8h\n"
109 "fmin v12.8h, v12.8h, v0.8h\n"
110 "fmax v11.8h, v11.8h, v1.8h\n"
111 "str q10, [%[outptr0]]\n"
112 "fmax v12.8h, v12.8h, v1.8h\n"
113 "str q11, [%[outptr0], #0x10]\n"
114 "str q12, [%[outptr0], #0x20]\n"
115 "add %[outptr0], %[outptr0], #0x30\n"
116 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
117 [inptr] "+r" (inptr)
118 : [minval] "w" (minval), [maxval] "w" (maxval)
119 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
120 );
121 }
122 }
123 break;
124
125 case 2:
126 {
127 if ((i+23) >= xmax)
128 {
129 for (int xi=0; xi<23; xi++)
130 {
131 if ((i+xi) < xmax)
132 {
133 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
134 outptr0++;
135 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
136 outptr1++;
137 }
138 }
139 inptr += 192;
140 } else {
141 /* Optimized routine to copy an entire block */
142 __asm __volatile (
143 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
144 ".arch armv8.2-a+fp16\n"
145 #endif
146 "dup v0.8h, %[maxval].h[0]\n"
147 "ldr q2, [%[outptr0]]\n"
148 "dup v1.8h, %[minval].h[0]\n"
149 "ldr q10, [%[inptr]]\n"
150 "ldr q3, [%[outptr0], #0x10]\n"
151 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
152 "ldr q11, [%[inptr], #0x10]\n"
153 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
154 "fadd v10.8h, v10.8h, v2.8h\n"
155 "ldr q4, [%[outptr0], #0x20]\n"
156 "ldr q12, [%[inptr], #0x20]\n"
157 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
158 "fadd v11.8h, v11.8h, v3.8h\n"
159 "ldr q5, [%[outptr1]]\n"
160 "fmin v10.8h, v10.8h, v0.8h\n"
161 "ldr q13, [%[inptr], #0x30]\n"
162 "fadd v12.8h, v12.8h, v4.8h\n"
163 "ldr q6, [%[outptr1], #0x10]\n"
164 "ldr q14, [%[inptr], #0x40]\n"
165 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
166 "fmax v10.8h, v10.8h, v1.8h\n"
167 "ldr q7, [%[outptr1], #0x20]\n"
168 "fmin v11.8h, v11.8h, v0.8h\n"
169 "ldr q15, [%[inptr], #0x50]\n"
170 "fmin v12.8h, v12.8h, v0.8h\n"
171 "add %[inptr], %[inptr], #0x180\n"
172 "fadd v13.8h, v13.8h, v5.8h\n"
173 "str q10, [%[outptr0]]\n"
174 "fmax v11.8h, v11.8h, v1.8h\n"
175 "fmax v12.8h, v12.8h, v1.8h\n"
176 "fadd v14.8h, v14.8h, v6.8h\n"
177 "fmin v13.8h, v13.8h, v0.8h\n"
178 "str q11, [%[outptr0], #0x10]\n"
179 "fadd v15.8h, v15.8h, v7.8h\n"
180 "fmin v14.8h, v14.8h, v0.8h\n"
181 "str q12, [%[outptr0], #0x20]\n"
182 "fmax v13.8h, v13.8h, v1.8h\n"
183 "add %[outptr0], %[outptr0], #0x30\n"
184 "fmin v15.8h, v15.8h, v0.8h\n"
185 "fmax v14.8h, v14.8h, v1.8h\n"
186 "str q13, [%[outptr1]]\n"
187 "fmax v15.8h, v15.8h, v1.8h\n"
188 "str q14, [%[outptr1], #0x10]\n"
189 "str q15, [%[outptr1], #0x20]\n"
190 "add %[outptr1], %[outptr1], #0x30\n"
191 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
192 [inptr] "+r" (inptr)
193 : [minval] "w" (minval), [maxval] "w" (maxval)
194 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
195 );
196 }
197 }
198 break;
199
200 case 3:
201 {
202 if ((i+23) >= xmax)
203 {
204 for (int xi=0; xi<23; xi++)
205 {
206 if ((i+xi) < xmax)
207 {
208 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
209 outptr0++;
210 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
211 outptr1++;
212 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
213 outptr2++;
214 }
215 }
216 inptr += 192;
217 } else {
218 /* Optimized routine to copy an entire block */
219 __asm __volatile (
220 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
221 ".arch armv8.2-a+fp16\n"
222 #endif
223 "dup v0.8h, %[maxval].h[0]\n"
224 "ldr q2, [%[outptr0]]\n"
225 "dup v1.8h, %[minval].h[0]\n"
226 "ldr q10, [%[inptr]]\n"
227 "ldr q3, [%[outptr0], #0x10]\n"
228 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
229 "ldr q11, [%[inptr], #0x10]\n"
230 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
231 "fadd v10.8h, v10.8h, v2.8h\n"
232 "ldr q4, [%[outptr0], #0x20]\n"
233 "ldr q12, [%[inptr], #0x20]\n"
234 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
235 "fadd v11.8h, v11.8h, v3.8h\n"
236 "ldr q5, [%[outptr1]]\n"
237 "fmin v10.8h, v10.8h, v0.8h\n"
238 "ldr q13, [%[inptr], #0x30]\n"
239 "fadd v12.8h, v12.8h, v4.8h\n"
240 "ldr q6, [%[outptr1], #0x10]\n"
241 "ldr q14, [%[inptr], #0x40]\n"
242 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
243 "fmax v10.8h, v10.8h, v1.8h\n"
244 "ldr q7, [%[outptr1], #0x20]\n"
245 "fmin v11.8h, v11.8h, v0.8h\n"
246 "ldr q15, [%[inptr], #0x50]\n"
247 "fmin v12.8h, v12.8h, v0.8h\n"
248 "ldr q8, [%[outptr2]]\n"
249 "fadd v13.8h, v13.8h, v5.8h\n"
250 "str q10, [%[outptr0]]\n"
251 "fadd v14.8h, v14.8h, v6.8h\n"
252 "ldr q16, [%[inptr], #0x60]\n"
253 "fmax v11.8h, v11.8h, v1.8h\n"
254 "ldr q9, [%[outptr2], #0x10]\n"
255 "fmax v12.8h, v12.8h, v1.8h\n"
256 "ldr q17, [%[inptr], #0x70]\n"
257 "fmin v13.8h, v13.8h, v0.8h\n"
258 "ldr q2, [%[outptr2], #0x20]\n"
259 "fmin v14.8h, v14.8h, v0.8h\n"
260 "str q11, [%[outptr0], #0x10]\n"
261 "fadd v15.8h, v15.8h, v7.8h\n"
262 "ldr q10, [%[inptr], #0x80]\n"
263 "fadd v16.8h, v16.8h, v8.8h\n"
264 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
265 "fmax v13.8h, v13.8h, v1.8h\n"
266 "str q12, [%[outptr0], #0x20]\n"
267 "fmax v14.8h, v14.8h, v1.8h\n"
268 "add %[outptr0], %[outptr0], #0x30\n"
269 "fmin v15.8h, v15.8h, v0.8h\n"
270 "str q13, [%[outptr1]]\n"
271 "fmin v16.8h, v16.8h, v0.8h\n"
272 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
273 "fadd v17.8h, v17.8h, v9.8h\n"
274 "str q14, [%[outptr1], #0x10]\n"
275 "fmax v15.8h, v15.8h, v1.8h\n"
276 "add %[inptr], %[inptr], #0x180\n"
277 "fmax v16.8h, v16.8h, v1.8h\n"
278 "fmin v17.8h, v17.8h, v0.8h\n"
279 "str q15, [%[outptr1], #0x20]\n"
280 "fadd v10.8h, v10.8h, v2.8h\n"
281 "add %[outptr1], %[outptr1], #0x30\n"
282 "fmax v17.8h, v17.8h, v1.8h\n"
283 "str q16, [%[outptr2]]\n"
284 "fmin v10.8h, v10.8h, v0.8h\n"
285 "str q17, [%[outptr2], #0x10]\n"
286 "fmax v10.8h, v10.8h, v1.8h\n"
287 "str q10, [%[outptr2], #0x20]\n"
288 "add %[outptr2], %[outptr2], #0x30\n"
289 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
290 [inptr] "+r" (inptr)
291 : [minval] "w" (minval), [maxval] "w" (maxval)
292 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
293 );
294 }
295 }
296 break;
297
298 case 4:
299 {
300 if ((i+23) >= xmax)
301 {
302 for (int xi=0; xi<23; xi++)
303 {
304 if ((i+xi) < xmax)
305 {
306 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
307 outptr0++;
308 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
309 outptr1++;
310 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
311 outptr2++;
312 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
313 outptr3++;
314 }
315 }
316 inptr += 192;
317 } else {
318 /* Optimized routine to copy an entire block */
319 __asm __volatile (
320 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
321 ".arch armv8.2-a+fp16\n"
322 #endif
323 "dup v0.8h, %[maxval].h[0]\n"
324 "ldr q2, [%[outptr0]]\n"
325 "dup v1.8h, %[minval].h[0]\n"
326 "ldr q10, [%[inptr]]\n"
327 "ldr q3, [%[outptr0], #0x10]\n"
328 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
329 "ldr q11, [%[inptr], #0x10]\n"
330 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
331 "fadd v10.8h, v10.8h, v2.8h\n"
332 "ldr q4, [%[outptr0], #0x20]\n"
333 "ldr q12, [%[inptr], #0x20]\n"
334 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
335 "fadd v11.8h, v11.8h, v3.8h\n"
336 "ldr q5, [%[outptr1]]\n"
337 "fmin v10.8h, v10.8h, v0.8h\n"
338 "ldr q13, [%[inptr], #0x30]\n"
339 "fadd v12.8h, v12.8h, v4.8h\n"
340 "ldr q6, [%[outptr1], #0x10]\n"
341 "ldr q14, [%[inptr], #0x40]\n"
342 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
343 "fmax v10.8h, v10.8h, v1.8h\n"
344 "ldr q7, [%[outptr1], #0x20]\n"
345 "fmin v11.8h, v11.8h, v0.8h\n"
346 "ldr q15, [%[inptr], #0x50]\n"
347 "fmin v12.8h, v12.8h, v0.8h\n"
348 "ldr q8, [%[outptr2]]\n"
349 "fadd v13.8h, v13.8h, v5.8h\n"
350 "str q10, [%[outptr0]]\n"
351 "fadd v14.8h, v14.8h, v6.8h\n"
352 "ldr q16, [%[inptr], #0x60]\n"
353 "fmax v11.8h, v11.8h, v1.8h\n"
354 "ldr q9, [%[outptr2], #0x10]\n"
355 "fmax v12.8h, v12.8h, v1.8h\n"
356 "ldr q17, [%[inptr], #0x70]\n"
357 "fmin v13.8h, v13.8h, v0.8h\n"
358 "ldr q2, [%[outptr2], #0x20]\n"
359 "fmin v14.8h, v14.8h, v0.8h\n"
360 "str q11, [%[outptr0], #0x10]\n"
361 "fadd v15.8h, v15.8h, v7.8h\n"
362 "ldr q10, [%[inptr], #0x80]\n"
363 "fadd v16.8h, v16.8h, v8.8h\n"
364 "ldr q3, [%[outptr3]]\n"
365 "fmax v13.8h, v13.8h, v1.8h\n"
366 "str q12, [%[outptr0], #0x20]\n"
367 "fmax v14.8h, v14.8h, v1.8h\n"
368 "ldr q11, [%[inptr], #0x90]\n"
369 "fmin v15.8h, v15.8h, v0.8h\n"
370 "ldr q4, [%[outptr3], #0x10]\n"
371 "fmin v16.8h, v16.8h, v0.8h\n"
372 "str q13, [%[outptr1]]\n"
373 "fadd v17.8h, v17.8h, v9.8h\n"
374 "ldr q12, [%[inptr], #0xa0]\n"
375 "fadd v10.8h, v10.8h, v2.8h\n"
376 "ldr q5, [%[outptr3], #0x20]\n"
377 "fmax v15.8h, v15.8h, v1.8h\n"
378 "str q14, [%[outptr1], #0x10]\n"
379 "fmax v16.8h, v16.8h, v1.8h\n"
380 "ldr q13, [%[inptr], #0xb0]\n"
381 "fmin v17.8h, v17.8h, v0.8h\n"
382 "add %[outptr0], %[outptr0], #0x30\n"
383 "fmin v10.8h, v10.8h, v0.8h\n"
384 "str q15, [%[outptr1], #0x20]\n"
385 "fadd v11.8h, v11.8h, v3.8h\n"
386 "add %[outptr1], %[outptr1], #0x30\n"
387 "fmax v17.8h, v17.8h, v1.8h\n"
388 "str q16, [%[outptr2]]\n"
389 "fmax v10.8h, v10.8h, v1.8h\n"
390 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
391 "fmin v11.8h, v11.8h, v0.8h\n"
392 "str q17, [%[outptr2], #0x10]\n"
393 "fadd v12.8h, v12.8h, v4.8h\n"
394 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
395 "fadd v13.8h, v13.8h, v5.8h\n"
396 "str q10, [%[outptr2], #0x20]\n"
397 "fmax v11.8h, v11.8h, v1.8h\n"
398 "add %[outptr2], %[outptr2], #0x30\n"
399 "fmin v12.8h, v12.8h, v0.8h\n"
400 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
401 "fmin v13.8h, v13.8h, v0.8h\n"
402 "str q11, [%[outptr3]]\n"
403 "add %[inptr], %[inptr], #0x180\n"
404 "fmax v12.8h, v12.8h, v1.8h\n"
405 "fmax v13.8h, v13.8h, v1.8h\n"
406 "str q12, [%[outptr3], #0x10]\n"
407 "str q13, [%[outptr3], #0x20]\n"
408 "add %[outptr3], %[outptr3], #0x30\n"
409 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
410 [inptr] "+r" (inptr)
411 : [minval] "w" (minval), [maxval] "w" (maxval)
412 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
413 );
414 }
415 }
416 break;
417
418 case 5:
419 {
420 if ((i+23) >= xmax)
421 {
422 for (int xi=0; xi<23; xi++)
423 {
424 if ((i+xi) < xmax)
425 {
426 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
427 outptr0++;
428 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
429 outptr1++;
430 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
431 outptr2++;
432 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
433 outptr3++;
434 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
435 outptr4++;
436 }
437 }
438 inptr += 192;
439 } else {
440 /* Optimized routine to copy an entire block */
441 __asm __volatile (
442 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
443 ".arch armv8.2-a+fp16\n"
444 #endif
445 "dup v0.8h, %[maxval].h[0]\n"
446 "ldr q2, [%[outptr0]]\n"
447 "dup v1.8h, %[minval].h[0]\n"
448 "ldr q10, [%[inptr]]\n"
449 "ldr q3, [%[outptr0], #0x10]\n"
450 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
451 "ldr q11, [%[inptr], #0x10]\n"
452 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
453 "fadd v10.8h, v10.8h, v2.8h\n"
454 "ldr q4, [%[outptr0], #0x20]\n"
455 "ldr q12, [%[inptr], #0x20]\n"
456 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
457 "fadd v11.8h, v11.8h, v3.8h\n"
458 "ldr q5, [%[outptr1]]\n"
459 "fmin v10.8h, v10.8h, v0.8h\n"
460 "ldr q13, [%[inptr], #0x30]\n"
461 "fadd v12.8h, v12.8h, v4.8h\n"
462 "ldr q6, [%[outptr1], #0x10]\n"
463 "ldr q14, [%[inptr], #0x40]\n"
464 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
465 "fmax v10.8h, v10.8h, v1.8h\n"
466 "ldr q7, [%[outptr1], #0x20]\n"
467 "fmin v11.8h, v11.8h, v0.8h\n"
468 "ldr q15, [%[inptr], #0x50]\n"
469 "fmin v12.8h, v12.8h, v0.8h\n"
470 "ldr q8, [%[outptr2]]\n"
471 "fadd v13.8h, v13.8h, v5.8h\n"
472 "str q10, [%[outptr0]]\n"
473 "fadd v14.8h, v14.8h, v6.8h\n"
474 "ldr q16, [%[inptr], #0x60]\n"
475 "fmax v11.8h, v11.8h, v1.8h\n"
476 "ldr q9, [%[outptr2], #0x10]\n"
477 "fmax v12.8h, v12.8h, v1.8h\n"
478 "ldr q17, [%[inptr], #0x70]\n"
479 "fmin v13.8h, v13.8h, v0.8h\n"
480 "ldr q2, [%[outptr2], #0x20]\n"
481 "fmin v14.8h, v14.8h, v0.8h\n"
482 "str q11, [%[outptr0], #0x10]\n"
483 "fadd v15.8h, v15.8h, v7.8h\n"
484 "ldr q10, [%[inptr], #0x80]\n"
485 "fadd v16.8h, v16.8h, v8.8h\n"
486 "ldr q3, [%[outptr3]]\n"
487 "fmax v13.8h, v13.8h, v1.8h\n"
488 "str q12, [%[outptr0], #0x20]\n"
489 "fmax v14.8h, v14.8h, v1.8h\n"
490 "ldr q11, [%[inptr], #0x90]\n"
491 "fmin v15.8h, v15.8h, v0.8h\n"
492 "ldr q4, [%[outptr3], #0x10]\n"
493 "fmin v16.8h, v16.8h, v0.8h\n"
494 "str q13, [%[outptr1]]\n"
495 "fadd v17.8h, v17.8h, v9.8h\n"
496 "ldr q12, [%[inptr], #0xa0]\n"
497 "fadd v10.8h, v10.8h, v2.8h\n"
498 "ldr q5, [%[outptr3], #0x20]\n"
499 "fmax v15.8h, v15.8h, v1.8h\n"
500 "str q14, [%[outptr1], #0x10]\n"
501 "fmax v16.8h, v16.8h, v1.8h\n"
502 "ldr q13, [%[inptr], #0xb0]\n"
503 "fmin v17.8h, v17.8h, v0.8h\n"
504 "ldr q6, [%[outptr4]]\n"
505 "fmin v10.8h, v10.8h, v0.8h\n"
506 "str q15, [%[outptr1], #0x20]\n"
507 "fadd v11.8h, v11.8h, v3.8h\n"
508 "ldr q14, [%[inptr], #0xc0]\n"
509 "fadd v12.8h, v12.8h, v4.8h\n"
510 "ldr q7, [%[outptr4], #0x10]\n"
511 "fmax v17.8h, v17.8h, v1.8h\n"
512 "str q16, [%[outptr2]]\n"
513 "fmax v10.8h, v10.8h, v1.8h\n"
514 "ldr q15, [%[inptr], #0xd0]\n"
515 "fmin v11.8h, v11.8h, v0.8h\n"
516 "ldr q8, [%[outptr4], #0x20]\n"
517 "fmin v12.8h, v12.8h, v0.8h\n"
518 "str q17, [%[outptr2], #0x10]\n"
519 "fadd v13.8h, v13.8h, v5.8h\n"
520 "ldr q16, [%[inptr], #0xe0]\n"
521 "fadd v14.8h, v14.8h, v6.8h\n"
522 "add %[outptr0], %[outptr0], #0x30\n"
523 "fmax v11.8h, v11.8h, v1.8h\n"
524 "str q10, [%[outptr2], #0x20]\n"
525 "fmax v12.8h, v12.8h, v1.8h\n"
526 "add %[outptr1], %[outptr1], #0x30\n"
527 "fmin v13.8h, v13.8h, v0.8h\n"
528 "str q11, [%[outptr3]]\n"
529 "fmin v14.8h, v14.8h, v0.8h\n"
530 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
531 "fadd v15.8h, v15.8h, v7.8h\n"
532 "str q12, [%[outptr3], #0x10]\n"
533 "fmax v13.8h, v13.8h, v1.8h\n"
534 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
535 "fmax v14.8h, v14.8h, v1.8h\n"
536 "add %[outptr2], %[outptr2], #0x30\n"
537 "fmin v15.8h, v15.8h, v0.8h\n"
538 "str q13, [%[outptr3], #0x20]\n"
539 "fadd v16.8h, v16.8h, v8.8h\n"
540 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
541 "add %[outptr3], %[outptr3], #0x30\n"
542 "fmax v15.8h, v15.8h, v1.8h\n"
543 "str q14, [%[outptr4]]\n"
544 "fmin v16.8h, v16.8h, v0.8h\n"
545 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
546 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
547 "str q15, [%[outptr4], #0x10]\n"
548 "add %[inptr], %[inptr], #0x180\n"
549 "fmax v16.8h, v16.8h, v1.8h\n"
550 "str q16, [%[outptr4], #0x20]\n"
551 "add %[outptr4], %[outptr4], #0x30\n"
552 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
553 [inptr] "+r" (inptr)
554 : [minval] "w" (minval), [maxval] "w" (maxval)
555 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
556 );
557 }
558 }
559 break;
560
561 case 6:
562 {
563 if ((i+23) >= xmax)
564 {
565 for (int xi=0; xi<23; xi++)
566 {
567 if ((i+xi) < xmax)
568 {
569 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
570 outptr0++;
571 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
572 outptr1++;
573 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
574 outptr2++;
575 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
576 outptr3++;
577 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
578 outptr4++;
579 *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + *outptr5)), maxval);
580 outptr5++;
581 }
582 }
583 inptr += 192;
584 } else {
585 /* Optimized routine to copy an entire block */
586 __asm __volatile (
587 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
588 ".arch armv8.2-a+fp16\n"
589 #endif
590 "dup v0.8h, %[maxval].h[0]\n"
591 "ldr q2, [%[outptr0]]\n"
592 "dup v1.8h, %[minval].h[0]\n"
593 "ldr q10, [%[inptr]]\n"
594 "ldr q3, [%[outptr0], #0x10]\n"
595 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
596 "ldr q11, [%[inptr], #0x10]\n"
597 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
598 "fadd v10.8h, v10.8h, v2.8h\n"
599 "ldr q4, [%[outptr0], #0x20]\n"
600 "ldr q12, [%[inptr], #0x20]\n"
601 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
602 "fadd v11.8h, v11.8h, v3.8h\n"
603 "ldr q5, [%[outptr1]]\n"
604 "fmin v10.8h, v10.8h, v0.8h\n"
605 "ldr q13, [%[inptr], #0x30]\n"
606 "fadd v12.8h, v12.8h, v4.8h\n"
607 "ldr q6, [%[outptr1], #0x10]\n"
608 "ldr q14, [%[inptr], #0x40]\n"
609 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
610 "fmax v10.8h, v10.8h, v1.8h\n"
611 "ldr q7, [%[outptr1], #0x20]\n"
612 "fmin v11.8h, v11.8h, v0.8h\n"
613 "ldr q15, [%[inptr], #0x50]\n"
614 "fmin v12.8h, v12.8h, v0.8h\n"
615 "ldr q8, [%[outptr2]]\n"
616 "fadd v13.8h, v13.8h, v5.8h\n"
617 "str q10, [%[outptr0]]\n"
618 "fadd v14.8h, v14.8h, v6.8h\n"
619 "ldr q16, [%[inptr], #0x60]\n"
620 "fmax v11.8h, v11.8h, v1.8h\n"
621 "ldr q9, [%[outptr2], #0x10]\n"
622 "fmax v12.8h, v12.8h, v1.8h\n"
623 "ldr q17, [%[inptr], #0x70]\n"
624 "fmin v13.8h, v13.8h, v0.8h\n"
625 "ldr q2, [%[outptr2], #0x20]\n"
626 "fmin v14.8h, v14.8h, v0.8h\n"
627 "str q11, [%[outptr0], #0x10]\n"
628 "fadd v15.8h, v15.8h, v7.8h\n"
629 "ldr q10, [%[inptr], #0x80]\n"
630 "fadd v16.8h, v16.8h, v8.8h\n"
631 "ldr q3, [%[outptr3]]\n"
632 "fmax v13.8h, v13.8h, v1.8h\n"
633 "str q12, [%[outptr0], #0x20]\n"
634 "fmax v14.8h, v14.8h, v1.8h\n"
635 "ldr q11, [%[inptr], #0x90]\n"
636 "fmin v15.8h, v15.8h, v0.8h\n"
637 "ldr q4, [%[outptr3], #0x10]\n"
638 "fmin v16.8h, v16.8h, v0.8h\n"
639 "str q13, [%[outptr1]]\n"
640 "fadd v17.8h, v17.8h, v9.8h\n"
641 "ldr q12, [%[inptr], #0xa0]\n"
642 "fadd v10.8h, v10.8h, v2.8h\n"
643 "ldr q5, [%[outptr3], #0x20]\n"
644 "fmax v15.8h, v15.8h, v1.8h\n"
645 "str q14, [%[outptr1], #0x10]\n"
646 "fmax v16.8h, v16.8h, v1.8h\n"
647 "ldr q13, [%[inptr], #0xb0]\n"
648 "fmin v17.8h, v17.8h, v0.8h\n"
649 "ldr q6, [%[outptr4]]\n"
650 "fmin v10.8h, v10.8h, v0.8h\n"
651 "str q15, [%[outptr1], #0x20]\n"
652 "fadd v11.8h, v11.8h, v3.8h\n"
653 "ldr q14, [%[inptr], #0xc0]\n"
654 "fadd v12.8h, v12.8h, v4.8h\n"
655 "ldr q7, [%[outptr4], #0x10]\n"
656 "fmax v17.8h, v17.8h, v1.8h\n"
657 "str q16, [%[outptr2]]\n"
658 "fmax v10.8h, v10.8h, v1.8h\n"
659 "ldr q15, [%[inptr], #0xd0]\n"
660 "fmin v11.8h, v11.8h, v0.8h\n"
661 "ldr q8, [%[outptr4], #0x20]\n"
662 "fmin v12.8h, v12.8h, v0.8h\n"
663 "str q17, [%[outptr2], #0x10]\n"
664 "fadd v13.8h, v13.8h, v5.8h\n"
665 "ldr q16, [%[inptr], #0xe0]\n"
666 "fadd v14.8h, v14.8h, v6.8h\n"
667 "ldr q9, [%[outptr5]]\n"
668 "fmax v11.8h, v11.8h, v1.8h\n"
669 "str q10, [%[outptr2], #0x20]\n"
670 "fmax v12.8h, v12.8h, v1.8h\n"
671 "ldr q17, [%[inptr], #0xf0]\n"
672 "fmin v13.8h, v13.8h, v0.8h\n"
673 "ldr q2, [%[outptr5], #0x10]\n"
674 "fmin v14.8h, v14.8h, v0.8h\n"
675 "str q11, [%[outptr3]]\n"
676 "fadd v15.8h, v15.8h, v7.8h\n"
677 "ldr q10, [%[inptr], #0x100]\n"
678 "fadd v16.8h, v16.8h, v8.8h\n"
679 "ldr q3, [%[outptr5], #0x20]\n"
680 "fmax v13.8h, v13.8h, v1.8h\n"
681 "str q12, [%[outptr3], #0x10]\n"
682 "fmax v14.8h, v14.8h, v1.8h\n"
683 "ldr q11, [%[inptr], #0x110]\n"
684 "fmin v15.8h, v15.8h, v0.8h\n"
685 "add %[outptr0], %[outptr0], #0x30\n"
686 "fmin v16.8h, v16.8h, v0.8h\n"
687 "str q13, [%[outptr3], #0x20]\n"
688 "fadd v17.8h, v17.8h, v9.8h\n"
689 "add %[outptr1], %[outptr1], #0x30\n"
690 "fmax v15.8h, v15.8h, v1.8h\n"
691 "str q14, [%[outptr4]]\n"
692 "fmax v16.8h, v16.8h, v1.8h\n"
693 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
694 "fmin v17.8h, v17.8h, v0.8h\n"
695 "str q15, [%[outptr4], #0x10]\n"
696 "fadd v10.8h, v10.8h, v2.8h\n"
697 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
698 "fadd v11.8h, v11.8h, v3.8h\n"
699 "str q16, [%[outptr4], #0x20]\n"
700 "fmax v17.8h, v17.8h, v1.8h\n"
701 "add %[outptr2], %[outptr2], #0x30\n"
702 "fmin v10.8h, v10.8h, v0.8h\n"
703 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
704 "fmin v11.8h, v11.8h, v0.8h\n"
705 "str q17, [%[outptr5]]\n"
706 "add %[outptr3], %[outptr3], #0x30\n"
707 "fmax v10.8h, v10.8h, v1.8h\n"
708 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
709 "fmax v11.8h, v11.8h, v1.8h\n"
710 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
711 "str q10, [%[outptr5], #0x10]\n"
712 "add %[outptr4], %[outptr4], #0x30\n"
713 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
714 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
715 "str q11, [%[outptr5], #0x20]\n"
716 "add %[outptr5], %[outptr5], #0x30\n"
717 "add %[inptr], %[inptr], #0x180\n"
718 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
719 [inptr] "+r" (inptr)
720 : [minval] "w" (minval), [maxval] "w" (maxval)
721 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
722 );
723 }
724 }
725 break;
726
727 case 7:
728 {
729 if ((i+23) >= xmax)
730 {
731 for (int xi=0; xi<23; xi++)
732 {
733 if ((i+xi) < xmax)
734 {
735 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
736 outptr0++;
737 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
738 outptr1++;
739 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
740 outptr2++;
741 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
742 outptr3++;
743 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
744 outptr4++;
745 *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + *outptr5)), maxval);
746 outptr5++;
747 *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + *outptr6)), maxval);
748 outptr6++;
749 }
750 }
751 inptr += 192;
752 } else {
753 /* Optimized routine to copy an entire block */
754 __asm __volatile (
755 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
756 ".arch armv8.2-a+fp16\n"
757 #endif
758 "dup v0.8h, %[maxval].h[0]\n"
759 "ldr q2, [%[outptr0]]\n"
760 "dup v1.8h, %[minval].h[0]\n"
761 "ldr q10, [%[inptr]]\n"
762 "ldr q3, [%[outptr0], #0x10]\n"
763 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
764 "ldr q11, [%[inptr], #0x10]\n"
765 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
766 "fadd v10.8h, v10.8h, v2.8h\n"
767 "ldr q4, [%[outptr0], #0x20]\n"
768 "ldr q12, [%[inptr], #0x20]\n"
769 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
770 "fadd v11.8h, v11.8h, v3.8h\n"
771 "ldr q5, [%[outptr1]]\n"
772 "fmin v10.8h, v10.8h, v0.8h\n"
773 "ldr q13, [%[inptr], #0x30]\n"
774 "fadd v12.8h, v12.8h, v4.8h\n"
775 "ldr q6, [%[outptr1], #0x10]\n"
776 "ldr q14, [%[inptr], #0x40]\n"
777 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
778 "fmax v10.8h, v10.8h, v1.8h\n"
779 "ldr q7, [%[outptr1], #0x20]\n"
780 "fmin v11.8h, v11.8h, v0.8h\n"
781 "ldr q15, [%[inptr], #0x50]\n"
782 "fmin v12.8h, v12.8h, v0.8h\n"
783 "ldr q8, [%[outptr2]]\n"
784 "fadd v13.8h, v13.8h, v5.8h\n"
785 "str q10, [%[outptr0]]\n"
786 "fadd v14.8h, v14.8h, v6.8h\n"
787 "ldr q16, [%[inptr], #0x60]\n"
788 "fmax v11.8h, v11.8h, v1.8h\n"
789 "ldr q9, [%[outptr2], #0x10]\n"
790 "fmax v12.8h, v12.8h, v1.8h\n"
791 "ldr q17, [%[inptr], #0x70]\n"
792 "fmin v13.8h, v13.8h, v0.8h\n"
793 "ldr q2, [%[outptr2], #0x20]\n"
794 "fmin v14.8h, v14.8h, v0.8h\n"
795 "str q11, [%[outptr0], #0x10]\n"
796 "fadd v15.8h, v15.8h, v7.8h\n"
797 "ldr q10, [%[inptr], #0x80]\n"
798 "fadd v16.8h, v16.8h, v8.8h\n"
799 "ldr q3, [%[outptr3]]\n"
800 "fmax v13.8h, v13.8h, v1.8h\n"
801 "str q12, [%[outptr0], #0x20]\n"
802 "fmax v14.8h, v14.8h, v1.8h\n"
803 "ldr q11, [%[inptr], #0x90]\n"
804 "fmin v15.8h, v15.8h, v0.8h\n"
805 "ldr q4, [%[outptr3], #0x10]\n"
806 "fmin v16.8h, v16.8h, v0.8h\n"
807 "str q13, [%[outptr1]]\n"
808 "fadd v17.8h, v17.8h, v9.8h\n"
809 "ldr q12, [%[inptr], #0xa0]\n"
810 "fadd v10.8h, v10.8h, v2.8h\n"
811 "ldr q5, [%[outptr3], #0x20]\n"
812 "fmax v15.8h, v15.8h, v1.8h\n"
813 "str q14, [%[outptr1], #0x10]\n"
814 "fmax v16.8h, v16.8h, v1.8h\n"
815 "ldr q13, [%[inptr], #0xb0]\n"
816 "fmin v17.8h, v17.8h, v0.8h\n"
817 "ldr q6, [%[outptr4]]\n"
818 "fmin v10.8h, v10.8h, v0.8h\n"
819 "str q15, [%[outptr1], #0x20]\n"
820 "fadd v11.8h, v11.8h, v3.8h\n"
821 "ldr q14, [%[inptr], #0xc0]\n"
822 "fadd v12.8h, v12.8h, v4.8h\n"
823 "ldr q7, [%[outptr4], #0x10]\n"
824 "fmax v17.8h, v17.8h, v1.8h\n"
825 "str q16, [%[outptr2]]\n"
826 "fmax v10.8h, v10.8h, v1.8h\n"
827 "ldr q15, [%[inptr], #0xd0]\n"
828 "fmin v11.8h, v11.8h, v0.8h\n"
829 "ldr q8, [%[outptr4], #0x20]\n"
830 "fmin v12.8h, v12.8h, v0.8h\n"
831 "str q17, [%[outptr2], #0x10]\n"
832 "fadd v13.8h, v13.8h, v5.8h\n"
833 "ldr q16, [%[inptr], #0xe0]\n"
834 "fadd v14.8h, v14.8h, v6.8h\n"
835 "ldr q9, [%[outptr5]]\n"
836 "fmax v11.8h, v11.8h, v1.8h\n"
837 "str q10, [%[outptr2], #0x20]\n"
838 "fmax v12.8h, v12.8h, v1.8h\n"
839 "ldr q17, [%[inptr], #0xf0]\n"
840 "fmin v13.8h, v13.8h, v0.8h\n"
841 "ldr q2, [%[outptr5], #0x10]\n"
842 "fmin v14.8h, v14.8h, v0.8h\n"
843 "str q11, [%[outptr3]]\n"
844 "fadd v15.8h, v15.8h, v7.8h\n"
845 "ldr q10, [%[inptr], #0x100]\n"
846 "fadd v16.8h, v16.8h, v8.8h\n"
847 "ldr q3, [%[outptr5], #0x20]\n"
848 "fmax v13.8h, v13.8h, v1.8h\n"
849 "str q12, [%[outptr3], #0x10]\n"
850 "fmax v14.8h, v14.8h, v1.8h\n"
851 "ldr q11, [%[inptr], #0x110]\n"
852 "fmin v15.8h, v15.8h, v0.8h\n"
853 "ldr q4, [%[outptr6]]\n"
854 "fmin v16.8h, v16.8h, v0.8h\n"
855 "str q13, [%[outptr3], #0x20]\n"
856 "fadd v17.8h, v17.8h, v9.8h\n"
857 "ldr q12, [%[inptr], #0x120]\n"
858 "fadd v10.8h, v10.8h, v2.8h\n"
859 "ldr q5, [%[outptr6], #0x10]\n"
860 "fmax v15.8h, v15.8h, v1.8h\n"
861 "str q14, [%[outptr4]]\n"
862 "fmax v16.8h, v16.8h, v1.8h\n"
863 "ldr q13, [%[inptr], #0x130]\n"
864 "fmin v17.8h, v17.8h, v0.8h\n"
865 "ldr q6, [%[outptr6], #0x20]\n"
866 "fmin v10.8h, v10.8h, v0.8h\n"
867 "str q15, [%[outptr4], #0x10]\n"
868 "fadd v11.8h, v11.8h, v3.8h\n"
869 "ldr q14, [%[inptr], #0x140]\n"
870 "fadd v12.8h, v12.8h, v4.8h\n"
871 "add %[outptr0], %[outptr0], #0x30\n"
872 "fmax v17.8h, v17.8h, v1.8h\n"
873 "str q16, [%[outptr4], #0x20]\n"
874 "fmax v10.8h, v10.8h, v1.8h\n"
875 "add %[outptr1], %[outptr1], #0x30\n"
876 "fmin v11.8h, v11.8h, v0.8h\n"
877 "str q17, [%[outptr5]]\n"
878 "fmin v12.8h, v12.8h, v0.8h\n"
879 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
880 "fadd v13.8h, v13.8h, v5.8h\n"
881 "str q10, [%[outptr5], #0x10]\n"
882 "fmax v11.8h, v11.8h, v1.8h\n"
883 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
884 "fmax v12.8h, v12.8h, v1.8h\n"
885 "add %[outptr2], %[outptr2], #0x30\n"
886 "fmin v13.8h, v13.8h, v0.8h\n"
887 "str q11, [%[outptr5], #0x20]\n"
888 "fadd v14.8h, v14.8h, v6.8h\n"
889 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
890 "add %[outptr3], %[outptr3], #0x30\n"
891 "fmax v13.8h, v13.8h, v1.8h\n"
892 "str q12, [%[outptr6]]\n"
893 "fmin v14.8h, v14.8h, v0.8h\n"
894 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
895 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
896 "str q13, [%[outptr6], #0x10]\n"
897 "add %[outptr4], %[outptr4], #0x30\n"
898 "fmax v14.8h, v14.8h, v1.8h\n"
899 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
900 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
901 "add %[outptr5], %[outptr5], #0x30\n"
902 "str q14, [%[outptr6], #0x20]\n"
903 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
904 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
905 "add %[outptr6], %[outptr6], #0x30\n"
906 "add %[inptr], %[inptr], #0x180\n"
907 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
908 [inptr] "+r" (inptr)
909 : [minval] "w" (minval), [maxval] "w" (maxval)
910 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
911 );
912 }
913 }
914 break;
915
916 default:
917 case 8:
918 {
919 if ((i+23) >= xmax)
920 {
921 for (int xi=0; xi<23; xi++)
922 {
923 if ((i+xi) < xmax)
924 {
925 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + *outptr0)), maxval);
926 outptr0++;
927 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + *outptr1)), maxval);
928 outptr1++;
929 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + *outptr2)), maxval);
930 outptr2++;
931 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + *outptr3)), maxval);
932 outptr3++;
933 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + *outptr4)), maxval);
934 outptr4++;
935 *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + *outptr5)), maxval);
936 outptr5++;
937 *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + *outptr6)), maxval);
938 outptr6++;
939 *outptr7 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 168] + *outptr7)), maxval);
940 outptr7++;
941 }
942 }
943 inptr += 192;
944 } else {
945 /* Optimized routine to copy an entire block */
946 __asm __volatile (
947 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
948 ".arch armv8.2-a+fp16\n"
949 #endif
950 "dup v0.8h, %[maxval].h[0]\n"
951 "ldr q2, [%[outptr0]]\n"
952 "dup v1.8h, %[minval].h[0]\n"
953 "ldr q10, [%[inptr]]\n"
954 "ldr q3, [%[outptr0], #0x10]\n"
955 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
956 "ldr q11, [%[inptr], #0x10]\n"
957 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
958 "fadd v10.8h, v10.8h, v2.8h\n"
959 "ldr q4, [%[outptr0], #0x20]\n"
960 "ldr q12, [%[inptr], #0x20]\n"
961 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
962 "fadd v11.8h, v11.8h, v3.8h\n"
963 "ldr q5, [%[outptr1]]\n"
964 "fmin v10.8h, v10.8h, v0.8h\n"
965 "ldr q13, [%[inptr], #0x30]\n"
966 "fadd v12.8h, v12.8h, v4.8h\n"
967 "ldr q6, [%[outptr1], #0x10]\n"
968 "ldr q14, [%[inptr], #0x40]\n"
969 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
970 "fmax v10.8h, v10.8h, v1.8h\n"
971 "ldr q7, [%[outptr1], #0x20]\n"
972 "fmin v11.8h, v11.8h, v0.8h\n"
973 "ldr q15, [%[inptr], #0x50]\n"
974 "fmin v12.8h, v12.8h, v0.8h\n"
975 "ldr q8, [%[outptr2]]\n"
976 "fadd v13.8h, v13.8h, v5.8h\n"
977 "str q10, [%[outptr0]]\n"
978 "fadd v14.8h, v14.8h, v6.8h\n"
979 "ldr q16, [%[inptr], #0x60]\n"
980 "fmax v11.8h, v11.8h, v1.8h\n"
981 "ldr q9, [%[outptr2], #0x10]\n"
982 "fmax v12.8h, v12.8h, v1.8h\n"
983 "ldr q17, [%[inptr], #0x70]\n"
984 "fmin v13.8h, v13.8h, v0.8h\n"
985 "ldr q2, [%[outptr2], #0x20]\n"
986 "fmin v14.8h, v14.8h, v0.8h\n"
987 "str q11, [%[outptr0], #0x10]\n"
988 "fadd v15.8h, v15.8h, v7.8h\n"
989 "ldr q10, [%[inptr], #0x80]\n"
990 "fadd v16.8h, v16.8h, v8.8h\n"
991 "ldr q3, [%[outptr3]]\n"
992 "fmax v13.8h, v13.8h, v1.8h\n"
993 "str q12, [%[outptr0], #0x20]\n"
994 "fmax v14.8h, v14.8h, v1.8h\n"
995 "ldr q11, [%[inptr], #0x90]\n"
996 "fmin v15.8h, v15.8h, v0.8h\n"
997 "ldr q4, [%[outptr3], #0x10]\n"
998 "fmin v16.8h, v16.8h, v0.8h\n"
999 "str q13, [%[outptr1]]\n"
1000 "fadd v17.8h, v17.8h, v9.8h\n"
1001 "ldr q12, [%[inptr], #0xa0]\n"
1002 "fadd v10.8h, v10.8h, v2.8h\n"
1003 "ldr q5, [%[outptr3], #0x20]\n"
1004 "fmax v15.8h, v15.8h, v1.8h\n"
1005 "str q14, [%[outptr1], #0x10]\n"
1006 "fmax v16.8h, v16.8h, v1.8h\n"
1007 "ldr q13, [%[inptr], #0xb0]\n"
1008 "fmin v17.8h, v17.8h, v0.8h\n"
1009 "ldr q6, [%[outptr4]]\n"
1010 "fmin v10.8h, v10.8h, v0.8h\n"
1011 "str q15, [%[outptr1], #0x20]\n"
1012 "fadd v11.8h, v11.8h, v3.8h\n"
1013 "ldr q14, [%[inptr], #0xc0]\n"
1014 "fadd v12.8h, v12.8h, v4.8h\n"
1015 "ldr q7, [%[outptr4], #0x10]\n"
1016 "fmax v17.8h, v17.8h, v1.8h\n"
1017 "str q16, [%[outptr2]]\n"
1018 "fmax v10.8h, v10.8h, v1.8h\n"
1019 "ldr q15, [%[inptr], #0xd0]\n"
1020 "fmin v11.8h, v11.8h, v0.8h\n"
1021 "ldr q8, [%[outptr4], #0x20]\n"
1022 "fmin v12.8h, v12.8h, v0.8h\n"
1023 "str q17, [%[outptr2], #0x10]\n"
1024 "fadd v13.8h, v13.8h, v5.8h\n"
1025 "ldr q16, [%[inptr], #0xe0]\n"
1026 "fadd v14.8h, v14.8h, v6.8h\n"
1027 "ldr q9, [%[outptr5]]\n"
1028 "fmax v11.8h, v11.8h, v1.8h\n"
1029 "str q10, [%[outptr2], #0x20]\n"
1030 "fmax v12.8h, v12.8h, v1.8h\n"
1031 "ldr q17, [%[inptr], #0xf0]\n"
1032 "fmin v13.8h, v13.8h, v0.8h\n"
1033 "ldr q2, [%[outptr5], #0x10]\n"
1034 "fmin v14.8h, v14.8h, v0.8h\n"
1035 "str q11, [%[outptr3]]\n"
1036 "fadd v15.8h, v15.8h, v7.8h\n"
1037 "ldr q10, [%[inptr], #0x100]\n"
1038 "fadd v16.8h, v16.8h, v8.8h\n"
1039 "ldr q3, [%[outptr5], #0x20]\n"
1040 "fmax v13.8h, v13.8h, v1.8h\n"
1041 "str q12, [%[outptr3], #0x10]\n"
1042 "fmax v14.8h, v14.8h, v1.8h\n"
1043 "ldr q11, [%[inptr], #0x110]\n"
1044 "fmin v15.8h, v15.8h, v0.8h\n"
1045 "ldr q4, [%[outptr6]]\n"
1046 "fmin v16.8h, v16.8h, v0.8h\n"
1047 "str q13, [%[outptr3], #0x20]\n"
1048 "fadd v17.8h, v17.8h, v9.8h\n"
1049 "ldr q12, [%[inptr], #0x120]\n"
1050 "fadd v10.8h, v10.8h, v2.8h\n"
1051 "ldr q5, [%[outptr6], #0x10]\n"
1052 "fmax v15.8h, v15.8h, v1.8h\n"
1053 "str q14, [%[outptr4]]\n"
1054 "fmax v16.8h, v16.8h, v1.8h\n"
1055 "ldr q13, [%[inptr], #0x130]\n"
1056 "fmin v17.8h, v17.8h, v0.8h\n"
1057 "ldr q6, [%[outptr6], #0x20]\n"
1058 "fmin v10.8h, v10.8h, v0.8h\n"
1059 "str q15, [%[outptr4], #0x10]\n"
1060 "fadd v11.8h, v11.8h, v3.8h\n"
1061 "ldr q14, [%[inptr], #0x140]\n"
1062 "fadd v12.8h, v12.8h, v4.8h\n"
1063 "ldr q7, [%[outptr7]]\n"
1064 "fmax v17.8h, v17.8h, v1.8h\n"
1065 "str q16, [%[outptr4], #0x20]\n"
1066 "fmax v10.8h, v10.8h, v1.8h\n"
1067 "ldr q15, [%[inptr], #0x150]\n"
1068 "fmin v11.8h, v11.8h, v0.8h\n"
1069 "ldr q8, [%[outptr7], #0x10]\n"
1070 "fmin v12.8h, v12.8h, v0.8h\n"
1071 "str q17, [%[outptr5]]\n"
1072 "fadd v13.8h, v13.8h, v5.8h\n"
1073 "ldr q16, [%[inptr], #0x160]\n"
1074 "fadd v14.8h, v14.8h, v6.8h\n"
1075 "ldr q9, [%[outptr7], #0x20]\n"
1076 "fmax v11.8h, v11.8h, v1.8h\n"
1077 "str q10, [%[outptr5], #0x10]\n"
1078 "fmax v12.8h, v12.8h, v1.8h\n"
1079 "ldr q17, [%[inptr], #0x170]\n"
1080 "fmin v13.8h, v13.8h, v0.8h\n"
1081 "add %[outptr0], %[outptr0], #0x30\n"
1082 "fmin v14.8h, v14.8h, v0.8h\n"
1083 "str q11, [%[outptr5], #0x20]\n"
1084 "fadd v15.8h, v15.8h, v7.8h\n"
1085 "add %[outptr1], %[outptr1], #0x30\n"
1086 "fmax v13.8h, v13.8h, v1.8h\n"
1087 "str q12, [%[outptr6]]\n"
1088 "fmax v14.8h, v14.8h, v1.8h\n"
1089 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1090 "fmin v15.8h, v15.8h, v0.8h\n"
1091 "str q13, [%[outptr6], #0x10]\n"
1092 "fadd v16.8h, v16.8h, v8.8h\n"
1093 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1094 "fadd v17.8h, v17.8h, v9.8h\n"
1095 "str q14, [%[outptr6], #0x20]\n"
1096 "fmax v15.8h, v15.8h, v1.8h\n"
1097 "add %[outptr2], %[outptr2], #0x30\n"
1098 "fmin v16.8h, v16.8h, v0.8h\n"
1099 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1100 "fmin v17.8h, v17.8h, v0.8h\n"
1101 "str q15, [%[outptr7]]\n"
1102 "add %[outptr3], %[outptr3], #0x30\n"
1103 "fmax v16.8h, v16.8h, v1.8h\n"
1104 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1105 "fmax v17.8h, v17.8h, v1.8h\n"
1106 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1107 "str q16, [%[outptr7], #0x10]\n"
1108 "add %[outptr4], %[outptr4], #0x30\n"
1109 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1110 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1111 "str q17, [%[outptr7], #0x20]\n"
1112 "add %[outptr5], %[outptr5], #0x30\n"
1113 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1114 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1115 "add %[outptr6], %[outptr6], #0x30\n"
1116 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
1117 "add %[outptr7], %[outptr7], #0x30\n"
1118 "add %[inptr], %[inptr], #0x180\n"
1119 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1120 [inptr] "+r" (inptr)
1121 : [minval] "w" (minval), [maxval] "w" (maxval)
1122 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1123 );
1124 }
1125 }
1126 break;
1127
1128
1129 }
1130 }
1131 else
1132 {
1133 const __fp16 *biasptr = bias ? bias + i : nullbias;
1134
1135 switch(height)
1136 {
1137 case 1:
1138 {
1139 if ((i+23) >= xmax)
1140 {
1141 for (int xi=0; xi<23; xi++)
1142 {
1143 if ((i+xi) < xmax)
1144 {
1145 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1146 outptr0++;
1147 }
1148 }
1149 inptr += 192;
1150 } else {
1151 /* Optimized routine to copy an entire block */
1152 __asm __volatile (
1153 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1154 ".arch armv8.2-a+fp16\n"
1155 #endif
1156 "dup v0.8h, %[maxval].h[0]\n"
1157 "ldr q2, [%[biasptr]]\n"
1158 "dup v1.8h, %[minval].h[0]\n"
1159 "ldr q3, [%[biasptr], #0x10]\n"
1160 "ldr q4, [%[biasptr], #0x20]\n"
1161 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1162 "ldr q13, [%[inptr]]\n"
1163 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1164 "ldr q14, [%[inptr], #0x10]\n"
1165 "ldr q15, [%[inptr], #0x20]\n"
1166 "add %[inptr], %[inptr], #0x180\n"
1167 "fadd v13.8h, v13.8h, v2.8h\n"
1168 "fadd v14.8h, v14.8h, v3.8h\n"
1169 "fadd v15.8h, v15.8h, v4.8h\n"
1170 "fmin v13.8h, v13.8h, v0.8h\n"
1171 "fmin v14.8h, v14.8h, v0.8h\n"
1172 "fmin v15.8h, v15.8h, v0.8h\n"
1173 "fmax v13.8h, v13.8h, v1.8h\n"
1174 "fmax v14.8h, v14.8h, v1.8h\n"
1175 "fmax v15.8h, v15.8h, v1.8h\n"
1176 "str q13, [%[outptr0]]\n"
1177 "str q14, [%[outptr0], #0x10]\n"
1178 "str q15, [%[outptr0], #0x20]\n"
1179 "add %[outptr0], %[outptr0], #0x30\n"
1180 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1181 [inptr] "+r" (inptr)
1182 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1183 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1184 );
1185 }
1186 }
1187 break;
1188
1189 case 2:
1190 {
1191 if ((i+23) >= xmax)
1192 {
1193 for (int xi=0; xi<23; xi++)
1194 {
1195 if ((i+xi) < xmax)
1196 {
1197 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1198 outptr0++;
1199 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1200 outptr1++;
1201 }
1202 }
1203 inptr += 192;
1204 } else {
1205 /* Optimized routine to copy an entire block */
1206 __asm __volatile (
1207 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1208 ".arch armv8.2-a+fp16\n"
1209 #endif
1210 "dup v0.8h, %[maxval].h[0]\n"
1211 "ldr q2, [%[biasptr]]\n"
1212 "dup v1.8h, %[minval].h[0]\n"
1213 "ldr q3, [%[biasptr], #0x10]\n"
1214 "ldr q4, [%[biasptr], #0x20]\n"
1215 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1216 "ldr q13, [%[inptr]]\n"
1217 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1218 "ldr q14, [%[inptr], #0x10]\n"
1219 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1220 "fadd v13.8h, v13.8h, v2.8h\n"
1221 "ldr q15, [%[inptr], #0x20]\n"
1222 "ldr q16, [%[inptr], #0x30]\n"
1223 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1224 "fadd v14.8h, v14.8h, v3.8h\n"
1225 "ldr q17, [%[inptr], #0x40]\n"
1226 "fmin v13.8h, v13.8h, v0.8h\n"
1227 "ldr q18, [%[inptr], #0x50]\n"
1228 "fadd v15.8h, v15.8h, v4.8h\n"
1229 "add %[inptr], %[inptr], #0x180\n"
1230 "fmin v14.8h, v14.8h, v0.8h\n"
1231 "fmax v13.8h, v13.8h, v1.8h\n"
1232 "fmin v15.8h, v15.8h, v0.8h\n"
1233 "fadd v16.8h, v16.8h, v2.8h\n"
1234 "fmax v14.8h, v14.8h, v1.8h\n"
1235 "str q13, [%[outptr0]]\n"
1236 "fadd v17.8h, v17.8h, v3.8h\n"
1237 "fmax v15.8h, v15.8h, v1.8h\n"
1238 "fmin v16.8h, v16.8h, v0.8h\n"
1239 "str q14, [%[outptr0], #0x10]\n"
1240 "fadd v18.8h, v18.8h, v4.8h\n"
1241 "fmin v17.8h, v17.8h, v0.8h\n"
1242 "fmax v16.8h, v16.8h, v1.8h\n"
1243 "str q15, [%[outptr0], #0x20]\n"
1244 "fmin v18.8h, v18.8h, v0.8h\n"
1245 "add %[outptr0], %[outptr0], #0x30\n"
1246 "fmax v17.8h, v17.8h, v1.8h\n"
1247 "str q16, [%[outptr1]]\n"
1248 "fmax v18.8h, v18.8h, v1.8h\n"
1249 "str q17, [%[outptr1], #0x10]\n"
1250 "str q18, [%[outptr1], #0x20]\n"
1251 "add %[outptr1], %[outptr1], #0x30\n"
1252 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1253 [inptr] "+r" (inptr)
1254 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1255 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1256 );
1257 }
1258 }
1259 break;
1260
1261 case 3:
1262 {
1263 if ((i+23) >= xmax)
1264 {
1265 for (int xi=0; xi<23; xi++)
1266 {
1267 if ((i+xi) < xmax)
1268 {
1269 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1270 outptr0++;
1271 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1272 outptr1++;
1273 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1274 outptr2++;
1275 }
1276 }
1277 inptr += 192;
1278 } else {
1279 /* Optimized routine to copy an entire block */
1280 __asm __volatile (
1281 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1282 ".arch armv8.2-a+fp16\n"
1283 #endif
1284 "dup v0.8h, %[maxval].h[0]\n"
1285 "ldr q2, [%[biasptr]]\n"
1286 "dup v1.8h, %[minval].h[0]\n"
1287 "ldr q3, [%[biasptr], #0x10]\n"
1288 "ldr q4, [%[biasptr], #0x20]\n"
1289 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1290 "ldr q13, [%[inptr]]\n"
1291 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1292 "ldr q14, [%[inptr], #0x10]\n"
1293 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1294 "fadd v13.8h, v13.8h, v2.8h\n"
1295 "ldr q15, [%[inptr], #0x20]\n"
1296 "ldr q16, [%[inptr], #0x30]\n"
1297 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1298 "fadd v14.8h, v14.8h, v3.8h\n"
1299 "ldr q17, [%[inptr], #0x40]\n"
1300 "fmin v13.8h, v13.8h, v0.8h\n"
1301 "ldr q18, [%[inptr], #0x50]\n"
1302 "fadd v15.8h, v15.8h, v4.8h\n"
1303 "ldr q19, [%[inptr], #0x60]\n"
1304 "fadd v16.8h, v16.8h, v2.8h\n"
1305 "ldr q20, [%[inptr], #0x70]\n"
1306 "fmin v14.8h, v14.8h, v0.8h\n"
1307 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1308 "fmax v13.8h, v13.8h, v1.8h\n"
1309 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1310 "fmax v14.8h, v14.8h, v1.8h\n"
1311 "fmin v15.8h, v15.8h, v0.8h\n"
1312 "str q13, [%[outptr0]]\n"
1313 "fmin v16.8h, v16.8h, v0.8h\n"
1314 "ldr q13, [%[inptr], #0x80]\n"
1315 "fadd v17.8h, v17.8h, v3.8h\n"
1316 "add %[inptr], %[inptr], #0x180\n"
1317 "fmax v15.8h, v15.8h, v1.8h\n"
1318 "str q14, [%[outptr0], #0x10]\n"
1319 "fmax v16.8h, v16.8h, v1.8h\n"
1320 "fmin v17.8h, v17.8h, v0.8h\n"
1321 "fadd v18.8h, v18.8h, v4.8h\n"
1322 "str q15, [%[outptr0], #0x20]\n"
1323 "fadd v19.8h, v19.8h, v2.8h\n"
1324 "add %[outptr0], %[outptr0], #0x30\n"
1325 "fmax v17.8h, v17.8h, v1.8h\n"
1326 "str q16, [%[outptr1]]\n"
1327 "fmin v18.8h, v18.8h, v0.8h\n"
1328 "fmin v19.8h, v19.8h, v0.8h\n"
1329 "fadd v20.8h, v20.8h, v3.8h\n"
1330 "str q17, [%[outptr1], #0x10]\n"
1331 "fadd v13.8h, v13.8h, v4.8h\n"
1332 "fmax v18.8h, v18.8h, v1.8h\n"
1333 "fmax v19.8h, v19.8h, v1.8h\n"
1334 "fmin v20.8h, v20.8h, v0.8h\n"
1335 "fmin v13.8h, v13.8h, v0.8h\n"
1336 "str q18, [%[outptr1], #0x20]\n"
1337 "add %[outptr1], %[outptr1], #0x30\n"
1338 "fmax v20.8h, v20.8h, v1.8h\n"
1339 "str q19, [%[outptr2]]\n"
1340 "fmax v13.8h, v13.8h, v1.8h\n"
1341 "str q20, [%[outptr2], #0x10]\n"
1342 "str q13, [%[outptr2], #0x20]\n"
1343 "add %[outptr2], %[outptr2], #0x30\n"
1344 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1345 [inptr] "+r" (inptr)
1346 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1347 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1348 );
1349 }
1350 }
1351 break;
1352
1353 case 4:
1354 {
1355 if ((i+23) >= xmax)
1356 {
1357 for (int xi=0; xi<23; xi++)
1358 {
1359 if ((i+xi) < xmax)
1360 {
1361 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1362 outptr0++;
1363 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1364 outptr1++;
1365 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1366 outptr2++;
1367 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1368 outptr3++;
1369 }
1370 }
1371 inptr += 192;
1372 } else {
1373 /* Optimized routine to copy an entire block */
1374 __asm __volatile (
1375 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1376 ".arch armv8.2-a+fp16\n"
1377 #endif
1378 "dup v0.8h, %[maxval].h[0]\n"
1379 "ldr q2, [%[biasptr]]\n"
1380 "dup v1.8h, %[minval].h[0]\n"
1381 "ldr q3, [%[biasptr], #0x10]\n"
1382 "ldr q4, [%[biasptr], #0x20]\n"
1383 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1384 "ldr q13, [%[inptr]]\n"
1385 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1386 "ldr q14, [%[inptr], #0x10]\n"
1387 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1388 "fadd v13.8h, v13.8h, v2.8h\n"
1389 "ldr q15, [%[inptr], #0x20]\n"
1390 "ldr q16, [%[inptr], #0x30]\n"
1391 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1392 "fadd v14.8h, v14.8h, v3.8h\n"
1393 "ldr q17, [%[inptr], #0x40]\n"
1394 "fmin v13.8h, v13.8h, v0.8h\n"
1395 "ldr q18, [%[inptr], #0x50]\n"
1396 "fadd v15.8h, v15.8h, v4.8h\n"
1397 "ldr q19, [%[inptr], #0x60]\n"
1398 "fadd v16.8h, v16.8h, v2.8h\n"
1399 "ldr q20, [%[inptr], #0x70]\n"
1400 "fmin v14.8h, v14.8h, v0.8h\n"
1401 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1402 "fmax v13.8h, v13.8h, v1.8h\n"
1403 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1404 "fmax v14.8h, v14.8h, v1.8h\n"
1405 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1406 "fmin v15.8h, v15.8h, v0.8h\n"
1407 "str q13, [%[outptr0]]\n"
1408 "fmin v16.8h, v16.8h, v0.8h\n"
1409 "ldr q13, [%[inptr], #0x80]\n"
1410 "fadd v17.8h, v17.8h, v3.8h\n"
1411 "fadd v18.8h, v18.8h, v4.8h\n"
1412 "str q14, [%[outptr0], #0x10]\n"
1413 "fmax v15.8h, v15.8h, v1.8h\n"
1414 "ldr q14, [%[inptr], #0x90]\n"
1415 "fmax v16.8h, v16.8h, v1.8h\n"
1416 "fmin v17.8h, v17.8h, v0.8h\n"
1417 "fmin v18.8h, v18.8h, v0.8h\n"
1418 "str q15, [%[outptr0], #0x20]\n"
1419 "fadd v19.8h, v19.8h, v2.8h\n"
1420 "ldr q15, [%[inptr], #0xa0]\n"
1421 "fadd v20.8h, v20.8h, v3.8h\n"
1422 "add %[outptr0], %[outptr0], #0x30\n"
1423 "fmax v17.8h, v17.8h, v1.8h\n"
1424 "str q16, [%[outptr1]]\n"
1425 "fmax v18.8h, v18.8h, v1.8h\n"
1426 "ldr q16, [%[inptr], #0xb0]\n"
1427 "fmin v19.8h, v19.8h, v0.8h\n"
1428 "add %[inptr], %[inptr], #0x180\n"
1429 "fmin v20.8h, v20.8h, v0.8h\n"
1430 "str q17, [%[outptr1], #0x10]\n"
1431 "fadd v13.8h, v13.8h, v4.8h\n"
1432 "fmax v19.8h, v19.8h, v1.8h\n"
1433 "fadd v14.8h, v14.8h, v2.8h\n"
1434 "str q18, [%[outptr1], #0x20]\n"
1435 "fmax v20.8h, v20.8h, v1.8h\n"
1436 "add %[outptr1], %[outptr1], #0x30\n"
1437 "fmin v13.8h, v13.8h, v0.8h\n"
1438 "str q19, [%[outptr2]]\n"
1439 "fmin v14.8h, v14.8h, v0.8h\n"
1440 "fadd v15.8h, v15.8h, v3.8h\n"
1441 "fadd v16.8h, v16.8h, v4.8h\n"
1442 "str q20, [%[outptr2], #0x10]\n"
1443 "fmax v13.8h, v13.8h, v1.8h\n"
1444 "fmax v14.8h, v14.8h, v1.8h\n"
1445 "fmin v15.8h, v15.8h, v0.8h\n"
1446 "fmin v16.8h, v16.8h, v0.8h\n"
1447 "str q13, [%[outptr2], #0x20]\n"
1448 "add %[outptr2], %[outptr2], #0x30\n"
1449 "fmax v15.8h, v15.8h, v1.8h\n"
1450 "str q14, [%[outptr3]]\n"
1451 "fmax v16.8h, v16.8h, v1.8h\n"
1452 "str q15, [%[outptr3], #0x10]\n"
1453 "str q16, [%[outptr3], #0x20]\n"
1454 "add %[outptr3], %[outptr3], #0x30\n"
1455 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1456 [inptr] "+r" (inptr)
1457 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1458 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1459 );
1460 }
1461 }
1462 break;
1463
1464 case 5:
1465 {
1466 if ((i+23) >= xmax)
1467 {
1468 for (int xi=0; xi<23; xi++)
1469 {
1470 if ((i+xi) < xmax)
1471 {
1472 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1473 outptr0++;
1474 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1475 outptr1++;
1476 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1477 outptr2++;
1478 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1479 outptr3++;
1480 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1481 outptr4++;
1482 }
1483 }
1484 inptr += 192;
1485 } else {
1486 /* Optimized routine to copy an entire block */
1487 __asm __volatile (
1488 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1489 ".arch armv8.2-a+fp16\n"
1490 #endif
1491 "dup v0.8h, %[maxval].h[0]\n"
1492 "ldr q2, [%[biasptr]]\n"
1493 "dup v1.8h, %[minval].h[0]\n"
1494 "ldr q3, [%[biasptr], #0x10]\n"
1495 "ldr q4, [%[biasptr], #0x20]\n"
1496 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1497 "ldr q13, [%[inptr]]\n"
1498 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1499 "ldr q14, [%[inptr], #0x10]\n"
1500 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1501 "fadd v13.8h, v13.8h, v2.8h\n"
1502 "ldr q15, [%[inptr], #0x20]\n"
1503 "ldr q16, [%[inptr], #0x30]\n"
1504 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1505 "fadd v14.8h, v14.8h, v3.8h\n"
1506 "ldr q17, [%[inptr], #0x40]\n"
1507 "fmin v13.8h, v13.8h, v0.8h\n"
1508 "ldr q18, [%[inptr], #0x50]\n"
1509 "fadd v15.8h, v15.8h, v4.8h\n"
1510 "ldr q19, [%[inptr], #0x60]\n"
1511 "fadd v16.8h, v16.8h, v2.8h\n"
1512 "ldr q20, [%[inptr], #0x70]\n"
1513 "fmin v14.8h, v14.8h, v0.8h\n"
1514 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1515 "fmax v13.8h, v13.8h, v1.8h\n"
1516 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1517 "fmax v14.8h, v14.8h, v1.8h\n"
1518 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1519 "fmin v15.8h, v15.8h, v0.8h\n"
1520 "str q13, [%[outptr0]]\n"
1521 "fmin v16.8h, v16.8h, v0.8h\n"
1522 "ldr q13, [%[inptr], #0x80]\n"
1523 "fadd v17.8h, v17.8h, v3.8h\n"
1524 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1525 "fmax v15.8h, v15.8h, v1.8h\n"
1526 "str q14, [%[outptr0], #0x10]\n"
1527 "fmax v16.8h, v16.8h, v1.8h\n"
1528 "ldr q14, [%[inptr], #0x90]\n"
1529 "fmin v17.8h, v17.8h, v0.8h\n"
1530 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1531 "fadd v18.8h, v18.8h, v4.8h\n"
1532 "str q15, [%[outptr0], #0x20]\n"
1533 "fadd v19.8h, v19.8h, v2.8h\n"
1534 "ldr q15, [%[inptr], #0xa0]\n"
1535 "fmax v17.8h, v17.8h, v1.8h\n"
1536 "add %[outptr0], %[outptr0], #0x30\n"
1537 "fmin v18.8h, v18.8h, v0.8h\n"
1538 "str q16, [%[outptr1]]\n"
1539 "fmin v19.8h, v19.8h, v0.8h\n"
1540 "ldr q16, [%[inptr], #0xb0]\n"
1541 "fadd v20.8h, v20.8h, v3.8h\n"
1542 "fadd v13.8h, v13.8h, v4.8h\n"
1543 "str q17, [%[outptr1], #0x10]\n"
1544 "fmax v18.8h, v18.8h, v1.8h\n"
1545 "ldr q17, [%[inptr], #0xc0]\n"
1546 "fmax v19.8h, v19.8h, v1.8h\n"
1547 "fmin v20.8h, v20.8h, v0.8h\n"
1548 "fmin v13.8h, v13.8h, v0.8h\n"
1549 "str q18, [%[outptr1], #0x20]\n"
1550 "fadd v14.8h, v14.8h, v2.8h\n"
1551 "ldr q18, [%[inptr], #0xd0]\n"
1552 "fadd v15.8h, v15.8h, v3.8h\n"
1553 "add %[outptr1], %[outptr1], #0x30\n"
1554 "fmax v20.8h, v20.8h, v1.8h\n"
1555 "str q19, [%[outptr2]]\n"
1556 "fmax v13.8h, v13.8h, v1.8h\n"
1557 "ldr q19, [%[inptr], #0xe0]\n"
1558 "fmin v14.8h, v14.8h, v0.8h\n"
1559 "add %[inptr], %[inptr], #0x180\n"
1560 "fmin v15.8h, v15.8h, v0.8h\n"
1561 "str q20, [%[outptr2], #0x10]\n"
1562 "fadd v16.8h, v16.8h, v4.8h\n"
1563 "fmax v14.8h, v14.8h, v1.8h\n"
1564 "fadd v17.8h, v17.8h, v2.8h\n"
1565 "str q13, [%[outptr2], #0x20]\n"
1566 "fmax v15.8h, v15.8h, v1.8h\n"
1567 "add %[outptr2], %[outptr2], #0x30\n"
1568 "fmin v16.8h, v16.8h, v0.8h\n"
1569 "str q14, [%[outptr3]]\n"
1570 "fmin v17.8h, v17.8h, v0.8h\n"
1571 "fadd v18.8h, v18.8h, v3.8h\n"
1572 "fadd v19.8h, v19.8h, v4.8h\n"
1573 "str q15, [%[outptr3], #0x10]\n"
1574 "fmax v16.8h, v16.8h, v1.8h\n"
1575 "fmax v17.8h, v17.8h, v1.8h\n"
1576 "fmin v18.8h, v18.8h, v0.8h\n"
1577 "fmin v19.8h, v19.8h, v0.8h\n"
1578 "str q16, [%[outptr3], #0x20]\n"
1579 "add %[outptr3], %[outptr3], #0x30\n"
1580 "fmax v18.8h, v18.8h, v1.8h\n"
1581 "str q17, [%[outptr4]]\n"
1582 "fmax v19.8h, v19.8h, v1.8h\n"
1583 "str q18, [%[outptr4], #0x10]\n"
1584 "str q19, [%[outptr4], #0x20]\n"
1585 "add %[outptr4], %[outptr4], #0x30\n"
1586 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1587 [inptr] "+r" (inptr)
1588 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1589 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1590 );
1591 }
1592 }
1593 break;
1594
1595 case 6:
1596 {
1597 if ((i+23) >= xmax)
1598 {
1599 for (int xi=0; xi<23; xi++)
1600 {
1601 if ((i+xi) < xmax)
1602 {
1603 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1604 outptr0++;
1605 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1606 outptr1++;
1607 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1608 outptr2++;
1609 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1610 outptr3++;
1611 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1612 outptr4++;
1613 *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + biasptr[xi])), maxval);
1614 outptr5++;
1615 }
1616 }
1617 inptr += 192;
1618 } else {
1619 /* Optimized routine to copy an entire block */
1620 __asm __volatile (
1621 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1622 ".arch armv8.2-a+fp16\n"
1623 #endif
1624 "dup v0.8h, %[maxval].h[0]\n"
1625 "ldr q2, [%[biasptr]]\n"
1626 "dup v1.8h, %[minval].h[0]\n"
1627 "ldr q3, [%[biasptr], #0x10]\n"
1628 "ldr q4, [%[biasptr], #0x20]\n"
1629 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1630 "ldr q13, [%[inptr]]\n"
1631 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1632 "ldr q14, [%[inptr], #0x10]\n"
1633 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1634 "fadd v13.8h, v13.8h, v2.8h\n"
1635 "ldr q15, [%[inptr], #0x20]\n"
1636 "ldr q16, [%[inptr], #0x30]\n"
1637 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1638 "fadd v14.8h, v14.8h, v3.8h\n"
1639 "ldr q17, [%[inptr], #0x40]\n"
1640 "fmin v13.8h, v13.8h, v0.8h\n"
1641 "ldr q18, [%[inptr], #0x50]\n"
1642 "fadd v15.8h, v15.8h, v4.8h\n"
1643 "ldr q19, [%[inptr], #0x60]\n"
1644 "fadd v16.8h, v16.8h, v2.8h\n"
1645 "ldr q20, [%[inptr], #0x70]\n"
1646 "fmin v14.8h, v14.8h, v0.8h\n"
1647 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1648 "fmax v13.8h, v13.8h, v1.8h\n"
1649 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1650 "fmax v14.8h, v14.8h, v1.8h\n"
1651 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1652 "fmin v15.8h, v15.8h, v0.8h\n"
1653 "str q13, [%[outptr0]]\n"
1654 "fmin v16.8h, v16.8h, v0.8h\n"
1655 "ldr q13, [%[inptr], #0x80]\n"
1656 "fadd v17.8h, v17.8h, v3.8h\n"
1657 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1658 "fmax v15.8h, v15.8h, v1.8h\n"
1659 "str q14, [%[outptr0], #0x10]\n"
1660 "fmax v16.8h, v16.8h, v1.8h\n"
1661 "ldr q14, [%[inptr], #0x90]\n"
1662 "fmin v17.8h, v17.8h, v0.8h\n"
1663 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1664 "fadd v18.8h, v18.8h, v4.8h\n"
1665 "str q15, [%[outptr0], #0x20]\n"
1666 "fadd v19.8h, v19.8h, v2.8h\n"
1667 "ldr q15, [%[inptr], #0xa0]\n"
1668 "fmax v17.8h, v17.8h, v1.8h\n"
1669 "add %[outptr0], %[outptr0], #0x30\n"
1670 "fmin v18.8h, v18.8h, v0.8h\n"
1671 "str q16, [%[outptr1]]\n"
1672 "fmin v19.8h, v19.8h, v0.8h\n"
1673 "ldr q16, [%[inptr], #0xb0]\n"
1674 "fadd v20.8h, v20.8h, v3.8h\n"
1675 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1676 "fmax v18.8h, v18.8h, v1.8h\n"
1677 "str q17, [%[outptr1], #0x10]\n"
1678 "fmax v19.8h, v19.8h, v1.8h\n"
1679 "ldr q17, [%[inptr], #0xc0]\n"
1680 "fmin v20.8h, v20.8h, v0.8h\n"
1681 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1682 "fadd v13.8h, v13.8h, v4.8h\n"
1683 "str q18, [%[outptr1], #0x20]\n"
1684 "fadd v14.8h, v14.8h, v2.8h\n"
1685 "ldr q18, [%[inptr], #0xd0]\n"
1686 "fmax v20.8h, v20.8h, v1.8h\n"
1687 "add %[outptr1], %[outptr1], #0x30\n"
1688 "fmin v13.8h, v13.8h, v0.8h\n"
1689 "str q19, [%[outptr2]]\n"
1690 "fmin v14.8h, v14.8h, v0.8h\n"
1691 "ldr q19, [%[inptr], #0xe0]\n"
1692 "fadd v15.8h, v15.8h, v3.8h\n"
1693 "fadd v16.8h, v16.8h, v4.8h\n"
1694 "str q20, [%[outptr2], #0x10]\n"
1695 "fmax v13.8h, v13.8h, v1.8h\n"
1696 "ldr q20, [%[inptr], #0xf0]\n"
1697 "fmax v14.8h, v14.8h, v1.8h\n"
1698 "fmin v15.8h, v15.8h, v0.8h\n"
1699 "fmin v16.8h, v16.8h, v0.8h\n"
1700 "str q13, [%[outptr2], #0x20]\n"
1701 "fadd v17.8h, v17.8h, v2.8h\n"
1702 "ldr q13, [%[inptr], #0x100]\n"
1703 "fadd v18.8h, v18.8h, v3.8h\n"
1704 "add %[outptr2], %[outptr2], #0x30\n"
1705 "fmax v15.8h, v15.8h, v1.8h\n"
1706 "str q14, [%[outptr3]]\n"
1707 "fmax v16.8h, v16.8h, v1.8h\n"
1708 "ldr q14, [%[inptr], #0x110]\n"
1709 "fmin v17.8h, v17.8h, v0.8h\n"
1710 "add %[inptr], %[inptr], #0x180\n"
1711 "fmin v18.8h, v18.8h, v0.8h\n"
1712 "str q15, [%[outptr3], #0x10]\n"
1713 "fadd v19.8h, v19.8h, v4.8h\n"
1714 "fmax v17.8h, v17.8h, v1.8h\n"
1715 "fadd v20.8h, v20.8h, v2.8h\n"
1716 "str q16, [%[outptr3], #0x20]\n"
1717 "fmax v18.8h, v18.8h, v1.8h\n"
1718 "add %[outptr3], %[outptr3], #0x30\n"
1719 "fmin v19.8h, v19.8h, v0.8h\n"
1720 "str q17, [%[outptr4]]\n"
1721 "fmin v20.8h, v20.8h, v0.8h\n"
1722 "fadd v13.8h, v13.8h, v3.8h\n"
1723 "fadd v14.8h, v14.8h, v4.8h\n"
1724 "str q18, [%[outptr4], #0x10]\n"
1725 "fmax v19.8h, v19.8h, v1.8h\n"
1726 "fmax v20.8h, v20.8h, v1.8h\n"
1727 "fmin v13.8h, v13.8h, v0.8h\n"
1728 "fmin v14.8h, v14.8h, v0.8h\n"
1729 "str q19, [%[outptr4], #0x20]\n"
1730 "add %[outptr4], %[outptr4], #0x30\n"
1731 "fmax v13.8h, v13.8h, v1.8h\n"
1732 "str q20, [%[outptr5]]\n"
1733 "fmax v14.8h, v14.8h, v1.8h\n"
1734 "str q13, [%[outptr5], #0x10]\n"
1735 "str q14, [%[outptr5], #0x20]\n"
1736 "add %[outptr5], %[outptr5], #0x30\n"
1737 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1738 [inptr] "+r" (inptr)
1739 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1740 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1741 );
1742 }
1743 }
1744 break;
1745
1746 case 7:
// Seven-row case: adds the per-column bias to a 24-column-wide slice of the
// input, clamps the sum to [minval, maxval] and writes it through
// outptr0..outptr6 (outptr7 is never written in this case).
// NOTE(review): the switch selector is outside this view - presumably the
// number of valid output rows; confirm against the enclosing loop.
1747 {
// Column i+23 is not below xmax, so the 24-column block is not fully in
// range: fall back to an element-by-element loop.
1748 if ((i+23) >= xmax)
1749 {
1750 for (int xi=0; xi<23; xi++)
1751 {
// Columns at or beyond xmax are skipped entirely.
1752 if ((i+xi) < xmax)
1753 {
// Input rows are 24 elements apart (xi, xi + 24, ..., xi + 144); the same
// bias element biasptr[xi] is applied to every row of the column.
1754 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1755 outptr0++;
1756 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1757 outptr1++;
1758 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1759 outptr2++;
1760 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1761 outptr3++;
1762 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1763 outptr4++;
1764 *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + biasptr[xi])), maxval);
1765 outptr5++;
1766 *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + biasptr[xi])), maxval);
1767 outptr6++;
1768 }
1769 }
// 192 = 8 * 24: steps over eight 24-element input rows even though only
// seven of them were read above.
1770 inptr += 192;
1771 } else {
1772 /* Vector path for a block whose 24 columns are all in range: adds the bias,
   clamps and stores 7 rows of 24 half-precision values (three 16-byte q
   registers, i.e. 0x30 bytes, per row).
   NOTE(review): the scalar path applies max(minval, .) before min(., maxval),
   whereas this sequence issues fmin (against maxval) before fmax (against
   minval); the two agree for non-NaN data when minval <= maxval. */
1773 __asm __volatile (
// The .arch directive is emitted only when the compiler is not already
// targeting FP16 vector arithmetic - presumably so the assembler accepts the
// .8h instructions below.
1774 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1775 ".arch armv8.2-a+fp16\n"
1776 #endif
// Setup: v0 = maxval and v1 = minval broadcast to all eight lanes;
// q2, q3, q4 = the 24 bias values at biasptr (offsets 0x00, 0x10, 0x20).
// Below, v13..v20 rotate as load/compute/store buffers, and the bias register
// used for each 16-byte chunk follows its input offset modulo 0x30
// (v2, v3, v4, v2, ...). The prfm hints touch the input from 0x180 bytes
// ahead in 0x40 steps and each output row at +0x60.
1777 "dup v0.8h, %[maxval].h[0]\n"
1778 "ldr q2, [%[biasptr]]\n"
1779 "dup v1.8h, %[minval].h[0]\n"
1780 "ldr q3, [%[biasptr], #0x10]\n"
1781 "ldr q4, [%[biasptr], #0x20]\n"
1782 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1783 "ldr q13, [%[inptr]]\n"
1784 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1785 "ldr q14, [%[inptr], #0x10]\n"
1786 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1787 "fadd v13.8h, v13.8h, v2.8h\n"
1788 "ldr q15, [%[inptr], #0x20]\n"
1789 "ldr q16, [%[inptr], #0x30]\n"
1790 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1791 "fadd v14.8h, v14.8h, v3.8h\n"
1792 "ldr q17, [%[inptr], #0x40]\n"
1793 "fmin v13.8h, v13.8h, v0.8h\n"
1794 "ldr q18, [%[inptr], #0x50]\n"
1795 "fadd v15.8h, v15.8h, v4.8h\n"
1796 "ldr q19, [%[inptr], #0x60]\n"
1797 "fadd v16.8h, v16.8h, v2.8h\n"
1798 "ldr q20, [%[inptr], #0x70]\n"
1799 "fmin v14.8h, v14.8h, v0.8h\n"
1800 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1801 "fmax v13.8h, v13.8h, v1.8h\n"
1802 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1803 "fmax v14.8h, v14.8h, v1.8h\n"
1804 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1805 "fmin v15.8h, v15.8h, v0.8h\n"
1806 "str q13, [%[outptr0]]\n"
1807 "fmin v16.8h, v16.8h, v0.8h\n"
1808 "ldr q13, [%[inptr], #0x80]\n"
1809 "fadd v17.8h, v17.8h, v3.8h\n"
1810 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1811 "fmax v15.8h, v15.8h, v1.8h\n"
1812 "str q14, [%[outptr0], #0x10]\n"
1813 "fmax v16.8h, v16.8h, v1.8h\n"
1814 "ldr q14, [%[inptr], #0x90]\n"
1815 "fmin v17.8h, v17.8h, v0.8h\n"
1816 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1817 "fadd v18.8h, v18.8h, v4.8h\n"
1818 "str q15, [%[outptr0], #0x20]\n"
1819 "fadd v19.8h, v19.8h, v2.8h\n"
1820 "ldr q15, [%[inptr], #0xa0]\n"
1821 "fmax v17.8h, v17.8h, v1.8h\n"
1822 "add %[outptr0], %[outptr0], #0x30\n"
1823 "fmin v18.8h, v18.8h, v0.8h\n"
1824 "str q16, [%[outptr1]]\n"
1825 "fmin v19.8h, v19.8h, v0.8h\n"
1826 "ldr q16, [%[inptr], #0xb0]\n"
1827 "fadd v20.8h, v20.8h, v3.8h\n"
1828 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1829 "fmax v18.8h, v18.8h, v1.8h\n"
1830 "str q17, [%[outptr1], #0x10]\n"
1831 "fmax v19.8h, v19.8h, v1.8h\n"
1832 "ldr q17, [%[inptr], #0xc0]\n"
1833 "fmin v20.8h, v20.8h, v0.8h\n"
1834 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1835 "fadd v13.8h, v13.8h, v4.8h\n"
1836 "str q18, [%[outptr1], #0x20]\n"
1837 "fadd v14.8h, v14.8h, v2.8h\n"
1838 "ldr q18, [%[inptr], #0xd0]\n"
1839 "fmax v20.8h, v20.8h, v1.8h\n"
1840 "add %[outptr1], %[outptr1], #0x30\n"
1841 "fmin v13.8h, v13.8h, v0.8h\n"
1842 "str q19, [%[outptr2]]\n"
1843 "fmin v14.8h, v14.8h, v0.8h\n"
1844 "ldr q19, [%[inptr], #0xe0]\n"
1845 "fadd v15.8h, v15.8h, v3.8h\n"
1846 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1847 "fmax v13.8h, v13.8h, v1.8h\n"
1848 "str q20, [%[outptr2], #0x10]\n"
1849 "fmax v14.8h, v14.8h, v1.8h\n"
1850 "ldr q20, [%[inptr], #0xf0]\n"
1851 "fmin v15.8h, v15.8h, v0.8h\n"
1852 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1853 "fadd v16.8h, v16.8h, v4.8h\n"
1854 "str q13, [%[outptr2], #0x20]\n"
1855 "fadd v17.8h, v17.8h, v2.8h\n"
1856 "ldr q13, [%[inptr], #0x100]\n"
1857 "fmax v15.8h, v15.8h, v1.8h\n"
1858 "add %[outptr2], %[outptr2], #0x30\n"
1859 "fmin v16.8h, v16.8h, v0.8h\n"
1860 "str q14, [%[outptr3]]\n"
1861 "fmin v17.8h, v17.8h, v0.8h\n"
1862 "ldr q14, [%[inptr], #0x110]\n"
1863 "fadd v18.8h, v18.8h, v3.8h\n"
1864 "fadd v19.8h, v19.8h, v4.8h\n"
1865 "str q15, [%[outptr3], #0x10]\n"
1866 "fmax v16.8h, v16.8h, v1.8h\n"
1867 "ldr q15, [%[inptr], #0x120]\n"
1868 "fmax v17.8h, v17.8h, v1.8h\n"
1869 "fmin v18.8h, v18.8h, v0.8h\n"
1870 "fmin v19.8h, v19.8h, v0.8h\n"
1871 "str q16, [%[outptr3], #0x20]\n"
1872 "fadd v20.8h, v20.8h, v2.8h\n"
1873 "ldr q16, [%[inptr], #0x130]\n"
1874 "fadd v13.8h, v13.8h, v3.8h\n"
1875 "add %[outptr3], %[outptr3], #0x30\n"
1876 "fmax v18.8h, v18.8h, v1.8h\n"
1877 "str q17, [%[outptr4]]\n"
1878 "fmax v19.8h, v19.8h, v1.8h\n"
// Last input load for this case: 0x140 is the third chunk of the seventh row.
1879 "ldr q17, [%[inptr], #0x140]\n"
1880 "fmin v20.8h, v20.8h, v0.8h\n"
// 0x180 bytes = 192 half-precision values, the same step as the scalar path.
1881 "add %[inptr], %[inptr], #0x180\n"
1882 "fmin v13.8h, v13.8h, v0.8h\n"
1883 "str q18, [%[outptr4], #0x10]\n"
1884 "fadd v14.8h, v14.8h, v4.8h\n"
1885 "fmax v20.8h, v20.8h, v1.8h\n"
1886 "fadd v15.8h, v15.8h, v2.8h\n"
1887 "str q19, [%[outptr4], #0x20]\n"
1888 "fmax v13.8h, v13.8h, v1.8h\n"
1889 "add %[outptr4], %[outptr4], #0x30\n"
1890 "fmin v14.8h, v14.8h, v0.8h\n"
1891 "str q20, [%[outptr5]]\n"
1892 "fmin v15.8h, v15.8h, v0.8h\n"
1893 "fadd v16.8h, v16.8h, v3.8h\n"
1894 "fadd v17.8h, v17.8h, v4.8h\n"
1895 "str q13, [%[outptr5], #0x10]\n"
1896 "fmax v14.8h, v14.8h, v1.8h\n"
1897 "fmax v15.8h, v15.8h, v1.8h\n"
1898 "fmin v16.8h, v16.8h, v0.8h\n"
1899 "fmin v17.8h, v17.8h, v0.8h\n"
1900 "str q14, [%[outptr5], #0x20]\n"
1901 "add %[outptr5], %[outptr5], #0x30\n"
1902 "fmax v16.8h, v16.8h, v1.8h\n"
1903 "str q15, [%[outptr6]]\n"
1904 "fmax v17.8h, v17.8h, v1.8h\n"
1905 "str q16, [%[outptr6], #0x10]\n"
1906 "str q17, [%[outptr6], #0x20]\n"
1907 "add %[outptr6], %[outptr6], #0x30\n"
// Operands: every output pointer and inptr is declared read-write ("+r"),
// although this sequence only advances outptr0..outptr6 and inptr.
// The clobber list names v0-v20; the instructions above use v0-v4 and v13-v20.
1908 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1909 [inptr] "+r" (inptr)
1910 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1911 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1912 );
1913 }
1914 }
1915 break;
1916
1917 default:
1918 case 8:
// Eight-row case, also taken for any selector value without its own label
// (it shares the default label): adds the per-column bias to a
// 24-column-wide slice of the input, clamps the sum to [minval, maxval] and
// writes it through outptr0..outptr7.
// NOTE(review): the switch selector is outside this view - presumably the
// number of valid output rows; confirm against the enclosing loop.
1919 {
// Column i+23 is not below xmax, so the 24-column block is not fully in
// range: fall back to an element-by-element loop.
1920 if ((i+23) >= xmax)
1921 {
1922 for (int xi=0; xi<23; xi++)
1923 {
// Columns at or beyond xmax are skipped entirely.
1924 if ((i+xi) < xmax)
1925 {
// Input rows are 24 elements apart (xi, xi + 24, ..., xi + 168); the same
// bias element biasptr[xi] is applied to every row of the column.
1926 *outptr0 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi] + biasptr[xi])), maxval);
1927 outptr0++;
1928 *outptr1 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 24] + biasptr[xi])), maxval);
1929 outptr1++;
1930 *outptr2 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 48] + biasptr[xi])), maxval);
1931 outptr2++;
1932 *outptr3 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 72] + biasptr[xi])), maxval);
1933 outptr3++;
1934 *outptr4 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 96] + biasptr[xi])), maxval);
1935 outptr4++;
1936 *outptr5 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 120] + biasptr[xi])), maxval);
1937 outptr5++;
1938 *outptr6 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 144] + biasptr[xi])), maxval);
1939 outptr6++;
1940 *outptr7 = std::min(std::max(minval, static_cast<__fp16>(inptr[xi + 168] + biasptr[xi])), maxval);
1941 outptr7++;
1942 }
1943 }
// 192 = 8 * 24: steps over the eight 24-element input rows read above.
1944 inptr += 192;
1945 } else {
1946 /* Vector path for a block whose 24 columns are all in range: adds the bias,
   clamps and stores 8 rows of 24 half-precision values (three 16-byte q
   registers, i.e. 0x30 bytes, per row).
   NOTE(review): the scalar path applies max(minval, .) before min(., maxval),
   whereas this sequence issues fmin (against maxval) before fmax (against
   minval); the two agree for non-NaN data when minval <= maxval. */
1947 __asm __volatile (
// The .arch directive is emitted only when the compiler is not already
// targeting FP16 vector arithmetic - presumably so the assembler accepts the
// .8h instructions below.
1948 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1949 ".arch armv8.2-a+fp16\n"
1950 #endif
// Setup: v0 = maxval and v1 = minval broadcast to all eight lanes;
// q2, q3, q4 = the 24 bias values at biasptr (offsets 0x00, 0x10, 0x20).
// Below, v13..v20 rotate as load/compute/store buffers, and the bias register
// used for each 16-byte chunk follows its input offset modulo 0x30
// (v2, v3, v4, v2, ...). The prfm hints touch the input from 0x180 bytes
// ahead in 0x40 steps and each output row at +0x60.
1951 "dup v0.8h, %[maxval].h[0]\n"
1952 "ldr q2, [%[biasptr]]\n"
1953 "dup v1.8h, %[minval].h[0]\n"
1954 "ldr q3, [%[biasptr], #0x10]\n"
1955 "ldr q4, [%[biasptr], #0x20]\n"
1956 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1957 "ldr q13, [%[inptr]]\n"
1958 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1959 "ldr q14, [%[inptr], #0x10]\n"
1960 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1961 "fadd v13.8h, v13.8h, v2.8h\n"
1962 "ldr q15, [%[inptr], #0x20]\n"
1963 "ldr q16, [%[inptr], #0x30]\n"
1964 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1965 "fadd v14.8h, v14.8h, v3.8h\n"
1966 "ldr q17, [%[inptr], #0x40]\n"
1967 "fmin v13.8h, v13.8h, v0.8h\n"
1968 "ldr q18, [%[inptr], #0x50]\n"
1969 "fadd v15.8h, v15.8h, v4.8h\n"
1970 "ldr q19, [%[inptr], #0x60]\n"
1971 "fadd v16.8h, v16.8h, v2.8h\n"
1972 "ldr q20, [%[inptr], #0x70]\n"
1973 "fmin v14.8h, v14.8h, v0.8h\n"
1974 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1975 "fmax v13.8h, v13.8h, v1.8h\n"
1976 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1977 "fmax v14.8h, v14.8h, v1.8h\n"
1978 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1979 "fmin v15.8h, v15.8h, v0.8h\n"
1980 "str q13, [%[outptr0]]\n"
1981 "fmin v16.8h, v16.8h, v0.8h\n"
1982 "ldr q13, [%[inptr], #0x80]\n"
1983 "fadd v17.8h, v17.8h, v3.8h\n"
1984 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1985 "fmax v15.8h, v15.8h, v1.8h\n"
1986 "str q14, [%[outptr0], #0x10]\n"
1987 "fmax v16.8h, v16.8h, v1.8h\n"
1988 "ldr q14, [%[inptr], #0x90]\n"
1989 "fmin v17.8h, v17.8h, v0.8h\n"
1990 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1991 "fadd v18.8h, v18.8h, v4.8h\n"
1992 "str q15, [%[outptr0], #0x20]\n"
1993 "fadd v19.8h, v19.8h, v2.8h\n"
1994 "ldr q15, [%[inptr], #0xa0]\n"
1995 "fmax v17.8h, v17.8h, v1.8h\n"
1996 "add %[outptr0], %[outptr0], #0x30\n"
1997 "fmin v18.8h, v18.8h, v0.8h\n"
1998 "str q16, [%[outptr1]]\n"
1999 "fmin v19.8h, v19.8h, v0.8h\n"
2000 "ldr q16, [%[inptr], #0xb0]\n"
2001 "fadd v20.8h, v20.8h, v3.8h\n"
2002 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
2003 "fmax v18.8h, v18.8h, v1.8h\n"
2004 "str q17, [%[outptr1], #0x10]\n"
2005 "fmax v19.8h, v19.8h, v1.8h\n"
2006 "ldr q17, [%[inptr], #0xc0]\n"
2007 "fmin v20.8h, v20.8h, v0.8h\n"
2008 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
2009 "fadd v13.8h, v13.8h, v4.8h\n"
2010 "str q18, [%[outptr1], #0x20]\n"
2011 "fadd v14.8h, v14.8h, v2.8h\n"
2012 "ldr q18, [%[inptr], #0xd0]\n"
2013 "fmax v20.8h, v20.8h, v1.8h\n"
2014 "add %[outptr1], %[outptr1], #0x30\n"
2015 "fmin v13.8h, v13.8h, v0.8h\n"
2016 "str q19, [%[outptr2]]\n"
2017 "fmin v14.8h, v14.8h, v0.8h\n"
2018 "ldr q19, [%[inptr], #0xe0]\n"
2019 "fadd v15.8h, v15.8h, v3.8h\n"
2020 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
2021 "fmax v13.8h, v13.8h, v1.8h\n"
2022 "str q20, [%[outptr2], #0x10]\n"
2023 "fmax v14.8h, v14.8h, v1.8h\n"
2024 "ldr q20, [%[inptr], #0xf0]\n"
2025 "fmin v15.8h, v15.8h, v0.8h\n"
2026 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
2027 "fadd v16.8h, v16.8h, v4.8h\n"
2028 "str q13, [%[outptr2], #0x20]\n"
2029 "fadd v17.8h, v17.8h, v2.8h\n"
2030 "ldr q13, [%[inptr], #0x100]\n"
2031 "fmax v15.8h, v15.8h, v1.8h\n"
2032 "add %[outptr2], %[outptr2], #0x30\n"
2033 "fmin v16.8h, v16.8h, v0.8h\n"
2034 "str q14, [%[outptr3]]\n"
2035 "fmin v17.8h, v17.8h, v0.8h\n"
2036 "ldr q14, [%[inptr], #0x110]\n"
2037 "fadd v18.8h, v18.8h, v3.8h\n"
2038 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
2039 "fmax v16.8h, v16.8h, v1.8h\n"
2040 "str q15, [%[outptr3], #0x10]\n"
2041 "fmax v17.8h, v17.8h, v1.8h\n"
2042 "ldr q15, [%[inptr], #0x120]\n"
2043 "fmin v18.8h, v18.8h, v0.8h\n"
2044 "fadd v19.8h, v19.8h, v4.8h\n"
2045 "str q16, [%[outptr3], #0x20]\n"
2046 "fadd v20.8h, v20.8h, v2.8h\n"
2047 "ldr q16, [%[inptr], #0x130]\n"
2048 "fadd v13.8h, v13.8h, v3.8h\n"
2049 "add %[outptr3], %[outptr3], #0x30\n"
2050 "fmax v18.8h, v18.8h, v1.8h\n"
2051 "str q17, [%[outptr4]]\n"
2052 "fmin v19.8h, v19.8h, v0.8h\n"
2053 "ldr q17, [%[inptr], #0x140]\n"
2054 "fmin v20.8h, v20.8h, v0.8h\n"
2055 "fmin v13.8h, v13.8h, v0.8h\n"
2056 "str q18, [%[outptr4], #0x10]\n"
2057 "fadd v14.8h, v14.8h, v4.8h\n"
2058 "ldr q18, [%[inptr], #0x150]\n"
2059 "fmax v19.8h, v19.8h, v1.8h\n"
2060 "fmax v20.8h, v20.8h, v1.8h\n"
2061 "fmax v13.8h, v13.8h, v1.8h\n"
2062 "fmin v14.8h, v14.8h, v0.8h\n"
2063 "str q19, [%[outptr4], #0x20]\n"
2064 "fadd v15.8h, v15.8h, v2.8h\n"
2065 "ldr q19, [%[inptr], #0x160]\n"
2066 "fadd v16.8h, v16.8h, v3.8h\n"
2067 "add %[outptr4], %[outptr4], #0x30\n"
2068 "fmax v14.8h, v14.8h, v1.8h\n"
2069 "str q20, [%[outptr5]]\n"
2070 "fmin v15.8h, v15.8h, v0.8h\n"
// Last input load: 0x170 is the third chunk of the eighth row.
2071 "ldr q20, [%[inptr], #0x170]\n"
2072 "fmin v16.8h, v16.8h, v0.8h\n"
// 0x180 bytes = 192 half-precision values, the same step as the scalar path.
2073 "add %[inptr], %[inptr], #0x180\n"
2074 "fadd v17.8h, v17.8h, v4.8h\n"
2075 "str q13, [%[outptr5], #0x10]\n"
2076 "fmax v15.8h, v15.8h, v1.8h\n"
2077 "fmax v16.8h, v16.8h, v1.8h\n"
2078 "fadd v18.8h, v18.8h, v2.8h\n"
2079 "str q14, [%[outptr5], #0x20]\n"
2080 "fmin v17.8h, v17.8h, v0.8h\n"
2081 "add %[outptr5], %[outptr5], #0x30\n"
2082 "fadd v19.8h, v19.8h, v3.8h\n"
2083 "str q15, [%[outptr6]]\n"
2084 "fmin v18.8h, v18.8h, v0.8h\n"
2085 "fmax v17.8h, v17.8h, v1.8h\n"
2086 "fadd v20.8h, v20.8h, v4.8h\n"
2087 "str q16, [%[outptr6], #0x10]\n"
2088 "fmin v19.8h, v19.8h, v0.8h\n"
2089 "fmax v18.8h, v18.8h, v1.8h\n"
2090 "fmin v20.8h, v20.8h, v0.8h\n"
2091 "str q17, [%[outptr6], #0x20]\n"
2092 "fmax v19.8h, v19.8h, v1.8h\n"
2093 "add %[outptr6], %[outptr6], #0x30\n"
2094 "fmax v20.8h, v20.8h, v1.8h\n"
2095 "str q18, [%[outptr7]]\n"
2096 "str q19, [%[outptr7], #0x10]\n"
2097 "str q20, [%[outptr7], #0x20]\n"
2098 "add %[outptr7], %[outptr7], #0x30\n"
// Operands: all eight output pointers and inptr are read-write ("+r") and are
// each advanced by the sequence above.
// The clobber list names v0-v20; the instructions above use v0-v4 and v13-v20.
2099 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
2100 [inptr] "+r" (inptr)
2101 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
2102 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
2103 );
2104 }
2105 }
2106 break;
2107
2108
2109 }
2110 }
2111 }
2112 }
2113 }
2114
2115 #endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
2116