1 /*
2 * Copyright (c) 2019-2020 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #pragma once
25
26 #ifdef __aarch64__
27
28 template<>
MergeResults(float * out,const float * in,const int ldout,const int y0,const int ymax,const int x0,const int xmax,const float * bias,Activation act,bool append)29 void MergeResults<12, 8, false>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float *bias, Activation act, bool append)
30 {
31 const float *inptr = in;
32 float nullbias[12];
33 float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
34 float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
35
36 switch(act.type)
37 {
38 default:
39 case Activation::Type::None:
40 break;
41 case Activation::Type::BoundedReLU:
42 maxval = static_cast<float>(act.param1);
43 /* fall through */
44 case Activation::Type::ReLU:
45 minval = 0.0f;
46 break;
47 }
48
49 if (!append && !bias)
50 {
51 memset(nullbias, 0, (12 * sizeof(float)));
52 }
53
54 for (int y=y0; y<ymax; y+=8)
55 {
56 float *outptr0 = out + (y * ldout) + x0;
57 float *outptr1 = outptr0 + ldout;
58 float *outptr2 = outptr1 + ldout;
59 float *outptr3 = outptr2 + ldout;
60 float *outptr4 = outptr3 + ldout;
61 float *outptr5 = outptr4 + ldout;
62 float *outptr6 = outptr5 + ldout;
63 float *outptr7 = outptr6 + ldout;
64
65 const int height = ymax - y;
66
67 for (int i=x0; i<xmax; i+=12)
68 {
69 if (append)
70 {
71 switch(height)
72 {
73 case 1:
74 {
75 if ((i+11) >= xmax)
76 {
77 for (int xi=0; xi<11; xi++)
78 {
79 if ((i+xi) < xmax)
80 {
81 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
82 outptr0++;
83 }
84 }
85 inptr += 96;
86 } else {
87 /* Optimized routine to copy an entire block */
88 __asm __volatile (
89 "dup v0.4s, %[maxval].s[0]\n"
90 "ldr q2, [%[outptr0]]\n"
91 "dup v1.4s, %[minval].s[0]\n"
92 "ldr q10, [%[inptr]]\n"
93 "ldr q3, [%[outptr0], #0x10]\n"
94 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
95 "ldr q11, [%[inptr], #0x10]\n"
96 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
97 "fadd v10.4s, v10.4s, v2.4s\n"
98 "ldr q4, [%[outptr0], #0x20]\n"
99 "ldr q12, [%[inptr], #0x20]\n"
100 "add %[inptr], %[inptr], #0x180\n"
101 "fadd v11.4s, v11.4s, v3.4s\n"
102 "fmin v10.4s, v10.4s, v0.4s\n"
103 "fadd v12.4s, v12.4s, v4.4s\n"
104 "fmin v11.4s, v11.4s, v0.4s\n"
105 "fmax v10.4s, v10.4s, v1.4s\n"
106 "fmin v12.4s, v12.4s, v0.4s\n"
107 "fmax v11.4s, v11.4s, v1.4s\n"
108 "str q10, [%[outptr0]]\n"
109 "fmax v12.4s, v12.4s, v1.4s\n"
110 "str q11, [%[outptr0], #0x10]\n"
111 "str q12, [%[outptr0], #0x20]\n"
112 "add %[outptr0], %[outptr0], #0x30\n"
113 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
114 [inptr] "+r" (inptr)
115 : [minval] "w" (minval), [maxval] "w" (maxval)
116 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
117 );
118 }
119 }
120 break;
121
122 case 2:
123 {
124 if ((i+11) >= xmax)
125 {
126 for (int xi=0; xi<11; xi++)
127 {
128 if ((i+xi) < xmax)
129 {
130 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
131 outptr0++;
132 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
133 outptr1++;
134 }
135 }
136 inptr += 96;
137 } else {
138 /* Optimized routine to copy an entire block */
139 __asm __volatile (
140 "dup v0.4s, %[maxval].s[0]\n"
141 "ldr q2, [%[outptr0]]\n"
142 "dup v1.4s, %[minval].s[0]\n"
143 "ldr q10, [%[inptr]]\n"
144 "ldr q3, [%[outptr0], #0x10]\n"
145 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
146 "ldr q11, [%[inptr], #0x10]\n"
147 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
148 "fadd v10.4s, v10.4s, v2.4s\n"
149 "ldr q4, [%[outptr0], #0x20]\n"
150 "ldr q12, [%[inptr], #0x20]\n"
151 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
152 "fadd v11.4s, v11.4s, v3.4s\n"
153 "ldr q5, [%[outptr1]]\n"
154 "fmin v10.4s, v10.4s, v0.4s\n"
155 "ldr q13, [%[inptr], #0x30]\n"
156 "fadd v12.4s, v12.4s, v4.4s\n"
157 "ldr q6, [%[outptr1], #0x10]\n"
158 "ldr q14, [%[inptr], #0x40]\n"
159 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
160 "fmax v10.4s, v10.4s, v1.4s\n"
161 "ldr q7, [%[outptr1], #0x20]\n"
162 "fmin v11.4s, v11.4s, v0.4s\n"
163 "ldr q15, [%[inptr], #0x50]\n"
164 "fmin v12.4s, v12.4s, v0.4s\n"
165 "add %[inptr], %[inptr], #0x180\n"
166 "fadd v13.4s, v13.4s, v5.4s\n"
167 "str q10, [%[outptr0]]\n"
168 "fmax v11.4s, v11.4s, v1.4s\n"
169 "fmax v12.4s, v12.4s, v1.4s\n"
170 "fadd v14.4s, v14.4s, v6.4s\n"
171 "fmin v13.4s, v13.4s, v0.4s\n"
172 "str q11, [%[outptr0], #0x10]\n"
173 "fadd v15.4s, v15.4s, v7.4s\n"
174 "fmin v14.4s, v14.4s, v0.4s\n"
175 "str q12, [%[outptr0], #0x20]\n"
176 "fmax v13.4s, v13.4s, v1.4s\n"
177 "add %[outptr0], %[outptr0], #0x30\n"
178 "fmin v15.4s, v15.4s, v0.4s\n"
179 "fmax v14.4s, v14.4s, v1.4s\n"
180 "str q13, [%[outptr1]]\n"
181 "fmax v15.4s, v15.4s, v1.4s\n"
182 "str q14, [%[outptr1], #0x10]\n"
183 "str q15, [%[outptr1], #0x20]\n"
184 "add %[outptr1], %[outptr1], #0x30\n"
185 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
186 [inptr] "+r" (inptr)
187 : [minval] "w" (minval), [maxval] "w" (maxval)
188 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
189 );
190 }
191 }
192 break;
193
194 case 3:
195 {
196 if ((i+11) >= xmax)
197 {
198 for (int xi=0; xi<11; xi++)
199 {
200 if ((i+xi) < xmax)
201 {
202 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
203 outptr0++;
204 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
205 outptr1++;
206 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
207 outptr2++;
208 }
209 }
210 inptr += 96;
211 } else {
212 /* Optimized routine to copy an entire block */
213 __asm __volatile (
214 "dup v0.4s, %[maxval].s[0]\n"
215 "ldr q2, [%[outptr0]]\n"
216 "dup v1.4s, %[minval].s[0]\n"
217 "ldr q10, [%[inptr]]\n"
218 "ldr q3, [%[outptr0], #0x10]\n"
219 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
220 "ldr q11, [%[inptr], #0x10]\n"
221 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
222 "fadd v10.4s, v10.4s, v2.4s\n"
223 "ldr q4, [%[outptr0], #0x20]\n"
224 "ldr q12, [%[inptr], #0x20]\n"
225 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
226 "fadd v11.4s, v11.4s, v3.4s\n"
227 "ldr q5, [%[outptr1]]\n"
228 "fmin v10.4s, v10.4s, v0.4s\n"
229 "ldr q13, [%[inptr], #0x30]\n"
230 "fadd v12.4s, v12.4s, v4.4s\n"
231 "ldr q6, [%[outptr1], #0x10]\n"
232 "ldr q14, [%[inptr], #0x40]\n"
233 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
234 "fmax v10.4s, v10.4s, v1.4s\n"
235 "ldr q7, [%[outptr1], #0x20]\n"
236 "fmin v11.4s, v11.4s, v0.4s\n"
237 "ldr q15, [%[inptr], #0x50]\n"
238 "fmin v12.4s, v12.4s, v0.4s\n"
239 "ldr q8, [%[outptr2]]\n"
240 "fadd v13.4s, v13.4s, v5.4s\n"
241 "str q10, [%[outptr0]]\n"
242 "fadd v14.4s, v14.4s, v6.4s\n"
243 "ldr q16, [%[inptr], #0x60]\n"
244 "fmax v11.4s, v11.4s, v1.4s\n"
245 "ldr q9, [%[outptr2], #0x10]\n"
246 "fmax v12.4s, v12.4s, v1.4s\n"
247 "ldr q17, [%[inptr], #0x70]\n"
248 "fmin v13.4s, v13.4s, v0.4s\n"
249 "ldr q2, [%[outptr2], #0x20]\n"
250 "fmin v14.4s, v14.4s, v0.4s\n"
251 "str q11, [%[outptr0], #0x10]\n"
252 "fadd v15.4s, v15.4s, v7.4s\n"
253 "ldr q10, [%[inptr], #0x80]\n"
254 "fadd v16.4s, v16.4s, v8.4s\n"
255 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
256 "fmax v13.4s, v13.4s, v1.4s\n"
257 "str q12, [%[outptr0], #0x20]\n"
258 "fmax v14.4s, v14.4s, v1.4s\n"
259 "add %[outptr0], %[outptr0], #0x30\n"
260 "fmin v15.4s, v15.4s, v0.4s\n"
261 "str q13, [%[outptr1]]\n"
262 "fmin v16.4s, v16.4s, v0.4s\n"
263 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
264 "fadd v17.4s, v17.4s, v9.4s\n"
265 "str q14, [%[outptr1], #0x10]\n"
266 "fmax v15.4s, v15.4s, v1.4s\n"
267 "add %[inptr], %[inptr], #0x180\n"
268 "fmax v16.4s, v16.4s, v1.4s\n"
269 "fmin v17.4s, v17.4s, v0.4s\n"
270 "str q15, [%[outptr1], #0x20]\n"
271 "fadd v10.4s, v10.4s, v2.4s\n"
272 "add %[outptr1], %[outptr1], #0x30\n"
273 "fmax v17.4s, v17.4s, v1.4s\n"
274 "str q16, [%[outptr2]]\n"
275 "fmin v10.4s, v10.4s, v0.4s\n"
276 "str q17, [%[outptr2], #0x10]\n"
277 "fmax v10.4s, v10.4s, v1.4s\n"
278 "str q10, [%[outptr2], #0x20]\n"
279 "add %[outptr2], %[outptr2], #0x30\n"
280 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
281 [inptr] "+r" (inptr)
282 : [minval] "w" (minval), [maxval] "w" (maxval)
283 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
284 );
285 }
286 }
287 break;
288
289 case 4:
290 {
291 if ((i+11) >= xmax)
292 {
293 for (int xi=0; xi<11; xi++)
294 {
295 if ((i+xi) < xmax)
296 {
297 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
298 outptr0++;
299 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
300 outptr1++;
301 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
302 outptr2++;
303 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
304 outptr3++;
305 }
306 }
307 inptr += 96;
308 } else {
309 /* Optimized routine to copy an entire block */
310 __asm __volatile (
311 "dup v0.4s, %[maxval].s[0]\n"
312 "ldr q2, [%[outptr0]]\n"
313 "dup v1.4s, %[minval].s[0]\n"
314 "ldr q10, [%[inptr]]\n"
315 "ldr q3, [%[outptr0], #0x10]\n"
316 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
317 "ldr q11, [%[inptr], #0x10]\n"
318 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
319 "fadd v10.4s, v10.4s, v2.4s\n"
320 "ldr q4, [%[outptr0], #0x20]\n"
321 "ldr q12, [%[inptr], #0x20]\n"
322 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
323 "fadd v11.4s, v11.4s, v3.4s\n"
324 "ldr q5, [%[outptr1]]\n"
325 "fmin v10.4s, v10.4s, v0.4s\n"
326 "ldr q13, [%[inptr], #0x30]\n"
327 "fadd v12.4s, v12.4s, v4.4s\n"
328 "ldr q6, [%[outptr1], #0x10]\n"
329 "ldr q14, [%[inptr], #0x40]\n"
330 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
331 "fmax v10.4s, v10.4s, v1.4s\n"
332 "ldr q7, [%[outptr1], #0x20]\n"
333 "fmin v11.4s, v11.4s, v0.4s\n"
334 "ldr q15, [%[inptr], #0x50]\n"
335 "fmin v12.4s, v12.4s, v0.4s\n"
336 "ldr q8, [%[outptr2]]\n"
337 "fadd v13.4s, v13.4s, v5.4s\n"
338 "str q10, [%[outptr0]]\n"
339 "fadd v14.4s, v14.4s, v6.4s\n"
340 "ldr q16, [%[inptr], #0x60]\n"
341 "fmax v11.4s, v11.4s, v1.4s\n"
342 "ldr q9, [%[outptr2], #0x10]\n"
343 "fmax v12.4s, v12.4s, v1.4s\n"
344 "ldr q17, [%[inptr], #0x70]\n"
345 "fmin v13.4s, v13.4s, v0.4s\n"
346 "ldr q2, [%[outptr2], #0x20]\n"
347 "fmin v14.4s, v14.4s, v0.4s\n"
348 "str q11, [%[outptr0], #0x10]\n"
349 "fadd v15.4s, v15.4s, v7.4s\n"
350 "ldr q10, [%[inptr], #0x80]\n"
351 "fadd v16.4s, v16.4s, v8.4s\n"
352 "ldr q3, [%[outptr3]]\n"
353 "fmax v13.4s, v13.4s, v1.4s\n"
354 "str q12, [%[outptr0], #0x20]\n"
355 "fmax v14.4s, v14.4s, v1.4s\n"
356 "ldr q11, [%[inptr], #0x90]\n"
357 "fmin v15.4s, v15.4s, v0.4s\n"
358 "ldr q4, [%[outptr3], #0x10]\n"
359 "fmin v16.4s, v16.4s, v0.4s\n"
360 "str q13, [%[outptr1]]\n"
361 "fadd v17.4s, v17.4s, v9.4s\n"
362 "ldr q12, [%[inptr], #0xa0]\n"
363 "fadd v10.4s, v10.4s, v2.4s\n"
364 "ldr q5, [%[outptr3], #0x20]\n"
365 "fmax v15.4s, v15.4s, v1.4s\n"
366 "str q14, [%[outptr1], #0x10]\n"
367 "fmax v16.4s, v16.4s, v1.4s\n"
368 "ldr q13, [%[inptr], #0xb0]\n"
369 "fmin v17.4s, v17.4s, v0.4s\n"
370 "add %[outptr0], %[outptr0], #0x30\n"
371 "fmin v10.4s, v10.4s, v0.4s\n"
372 "str q15, [%[outptr1], #0x20]\n"
373 "fadd v11.4s, v11.4s, v3.4s\n"
374 "add %[outptr1], %[outptr1], #0x30\n"
375 "fmax v17.4s, v17.4s, v1.4s\n"
376 "str q16, [%[outptr2]]\n"
377 "fmax v10.4s, v10.4s, v1.4s\n"
378 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
379 "fmin v11.4s, v11.4s, v0.4s\n"
380 "str q17, [%[outptr2], #0x10]\n"
381 "fadd v12.4s, v12.4s, v4.4s\n"
382 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
383 "fadd v13.4s, v13.4s, v5.4s\n"
384 "str q10, [%[outptr2], #0x20]\n"
385 "fmax v11.4s, v11.4s, v1.4s\n"
386 "add %[outptr2], %[outptr2], #0x30\n"
387 "fmin v12.4s, v12.4s, v0.4s\n"
388 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
389 "fmin v13.4s, v13.4s, v0.4s\n"
390 "str q11, [%[outptr3]]\n"
391 "add %[inptr], %[inptr], #0x180\n"
392 "fmax v12.4s, v12.4s, v1.4s\n"
393 "fmax v13.4s, v13.4s, v1.4s\n"
394 "str q12, [%[outptr3], #0x10]\n"
395 "str q13, [%[outptr3], #0x20]\n"
396 "add %[outptr3], %[outptr3], #0x30\n"
397 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
398 [inptr] "+r" (inptr)
399 : [minval] "w" (minval), [maxval] "w" (maxval)
400 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
401 );
402 }
403 }
404 break;
405
406 case 5:
407 {
408 if ((i+11) >= xmax)
409 {
410 for (int xi=0; xi<11; xi++)
411 {
412 if ((i+xi) < xmax)
413 {
414 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
415 outptr0++;
416 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
417 outptr1++;
418 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
419 outptr2++;
420 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
421 outptr3++;
422 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
423 outptr4++;
424 }
425 }
426 inptr += 96;
427 } else {
428 /* Optimized routine to copy an entire block */
429 __asm __volatile (
430 "dup v0.4s, %[maxval].s[0]\n"
431 "ldr q2, [%[outptr0]]\n"
432 "dup v1.4s, %[minval].s[0]\n"
433 "ldr q10, [%[inptr]]\n"
434 "ldr q3, [%[outptr0], #0x10]\n"
435 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
436 "ldr q11, [%[inptr], #0x10]\n"
437 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
438 "fadd v10.4s, v10.4s, v2.4s\n"
439 "ldr q4, [%[outptr0], #0x20]\n"
440 "ldr q12, [%[inptr], #0x20]\n"
441 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
442 "fadd v11.4s, v11.4s, v3.4s\n"
443 "ldr q5, [%[outptr1]]\n"
444 "fmin v10.4s, v10.4s, v0.4s\n"
445 "ldr q13, [%[inptr], #0x30]\n"
446 "fadd v12.4s, v12.4s, v4.4s\n"
447 "ldr q6, [%[outptr1], #0x10]\n"
448 "ldr q14, [%[inptr], #0x40]\n"
449 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
450 "fmax v10.4s, v10.4s, v1.4s\n"
451 "ldr q7, [%[outptr1], #0x20]\n"
452 "fmin v11.4s, v11.4s, v0.4s\n"
453 "ldr q15, [%[inptr], #0x50]\n"
454 "fmin v12.4s, v12.4s, v0.4s\n"
455 "ldr q8, [%[outptr2]]\n"
456 "fadd v13.4s, v13.4s, v5.4s\n"
457 "str q10, [%[outptr0]]\n"
458 "fadd v14.4s, v14.4s, v6.4s\n"
459 "ldr q16, [%[inptr], #0x60]\n"
460 "fmax v11.4s, v11.4s, v1.4s\n"
461 "ldr q9, [%[outptr2], #0x10]\n"
462 "fmax v12.4s, v12.4s, v1.4s\n"
463 "ldr q17, [%[inptr], #0x70]\n"
464 "fmin v13.4s, v13.4s, v0.4s\n"
465 "ldr q2, [%[outptr2], #0x20]\n"
466 "fmin v14.4s, v14.4s, v0.4s\n"
467 "str q11, [%[outptr0], #0x10]\n"
468 "fadd v15.4s, v15.4s, v7.4s\n"
469 "ldr q10, [%[inptr], #0x80]\n"
470 "fadd v16.4s, v16.4s, v8.4s\n"
471 "ldr q3, [%[outptr3]]\n"
472 "fmax v13.4s, v13.4s, v1.4s\n"
473 "str q12, [%[outptr0], #0x20]\n"
474 "fmax v14.4s, v14.4s, v1.4s\n"
475 "ldr q11, [%[inptr], #0x90]\n"
476 "fmin v15.4s, v15.4s, v0.4s\n"
477 "ldr q4, [%[outptr3], #0x10]\n"
478 "fmin v16.4s, v16.4s, v0.4s\n"
479 "str q13, [%[outptr1]]\n"
480 "fadd v17.4s, v17.4s, v9.4s\n"
481 "ldr q12, [%[inptr], #0xa0]\n"
482 "fadd v10.4s, v10.4s, v2.4s\n"
483 "ldr q5, [%[outptr3], #0x20]\n"
484 "fmax v15.4s, v15.4s, v1.4s\n"
485 "str q14, [%[outptr1], #0x10]\n"
486 "fmax v16.4s, v16.4s, v1.4s\n"
487 "ldr q13, [%[inptr], #0xb0]\n"
488 "fmin v17.4s, v17.4s, v0.4s\n"
489 "ldr q6, [%[outptr4]]\n"
490 "fmin v10.4s, v10.4s, v0.4s\n"
491 "str q15, [%[outptr1], #0x20]\n"
492 "fadd v11.4s, v11.4s, v3.4s\n"
493 "ldr q14, [%[inptr], #0xc0]\n"
494 "fadd v12.4s, v12.4s, v4.4s\n"
495 "ldr q7, [%[outptr4], #0x10]\n"
496 "fmax v17.4s, v17.4s, v1.4s\n"
497 "str q16, [%[outptr2]]\n"
498 "fmax v10.4s, v10.4s, v1.4s\n"
499 "ldr q15, [%[inptr], #0xd0]\n"
500 "fmin v11.4s, v11.4s, v0.4s\n"
501 "ldr q8, [%[outptr4], #0x20]\n"
502 "fmin v12.4s, v12.4s, v0.4s\n"
503 "str q17, [%[outptr2], #0x10]\n"
504 "fadd v13.4s, v13.4s, v5.4s\n"
505 "ldr q16, [%[inptr], #0xe0]\n"
506 "fadd v14.4s, v14.4s, v6.4s\n"
507 "add %[outptr0], %[outptr0], #0x30\n"
508 "fmax v11.4s, v11.4s, v1.4s\n"
509 "str q10, [%[outptr2], #0x20]\n"
510 "fmax v12.4s, v12.4s, v1.4s\n"
511 "add %[outptr1], %[outptr1], #0x30\n"
512 "fmin v13.4s, v13.4s, v0.4s\n"
513 "str q11, [%[outptr3]]\n"
514 "fmin v14.4s, v14.4s, v0.4s\n"
515 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
516 "fadd v15.4s, v15.4s, v7.4s\n"
517 "str q12, [%[outptr3], #0x10]\n"
518 "fmax v13.4s, v13.4s, v1.4s\n"
519 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
520 "fmax v14.4s, v14.4s, v1.4s\n"
521 "add %[outptr2], %[outptr2], #0x30\n"
522 "fmin v15.4s, v15.4s, v0.4s\n"
523 "str q13, [%[outptr3], #0x20]\n"
524 "fadd v16.4s, v16.4s, v8.4s\n"
525 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
526 "add %[outptr3], %[outptr3], #0x30\n"
527 "fmax v15.4s, v15.4s, v1.4s\n"
528 "str q14, [%[outptr4]]\n"
529 "fmin v16.4s, v16.4s, v0.4s\n"
530 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
531 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
532 "str q15, [%[outptr4], #0x10]\n"
533 "add %[inptr], %[inptr], #0x180\n"
534 "fmax v16.4s, v16.4s, v1.4s\n"
535 "str q16, [%[outptr4], #0x20]\n"
536 "add %[outptr4], %[outptr4], #0x30\n"
537 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
538 [inptr] "+r" (inptr)
539 : [minval] "w" (minval), [maxval] "w" (maxval)
540 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
541 );
542 }
543 }
544 break;
545
546 case 6:
547 {
548 if ((i+11) >= xmax)
549 {
550 for (int xi=0; xi<11; xi++)
551 {
552 if ((i+xi) < xmax)
553 {
554 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
555 outptr0++;
556 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
557 outptr1++;
558 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
559 outptr2++;
560 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
561 outptr3++;
562 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
563 outptr4++;
564 *outptr5 = std::min(std::max(minval, inptr[xi + 60] + *outptr5), maxval);
565 outptr5++;
566 }
567 }
568 inptr += 96;
569 } else {
570 /* Optimized routine to copy an entire block */
571 __asm __volatile (
572 "dup v0.4s, %[maxval].s[0]\n"
573 "ldr q2, [%[outptr0]]\n"
574 "dup v1.4s, %[minval].s[0]\n"
575 "ldr q10, [%[inptr]]\n"
576 "ldr q3, [%[outptr0], #0x10]\n"
577 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
578 "ldr q11, [%[inptr], #0x10]\n"
579 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
580 "fadd v10.4s, v10.4s, v2.4s\n"
581 "ldr q4, [%[outptr0], #0x20]\n"
582 "ldr q12, [%[inptr], #0x20]\n"
583 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
584 "fadd v11.4s, v11.4s, v3.4s\n"
585 "ldr q5, [%[outptr1]]\n"
586 "fmin v10.4s, v10.4s, v0.4s\n"
587 "ldr q13, [%[inptr], #0x30]\n"
588 "fadd v12.4s, v12.4s, v4.4s\n"
589 "ldr q6, [%[outptr1], #0x10]\n"
590 "ldr q14, [%[inptr], #0x40]\n"
591 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
592 "fmax v10.4s, v10.4s, v1.4s\n"
593 "ldr q7, [%[outptr1], #0x20]\n"
594 "fmin v11.4s, v11.4s, v0.4s\n"
595 "ldr q15, [%[inptr], #0x50]\n"
596 "fmin v12.4s, v12.4s, v0.4s\n"
597 "ldr q8, [%[outptr2]]\n"
598 "fadd v13.4s, v13.4s, v5.4s\n"
599 "str q10, [%[outptr0]]\n"
600 "fadd v14.4s, v14.4s, v6.4s\n"
601 "ldr q16, [%[inptr], #0x60]\n"
602 "fmax v11.4s, v11.4s, v1.4s\n"
603 "ldr q9, [%[outptr2], #0x10]\n"
604 "fmax v12.4s, v12.4s, v1.4s\n"
605 "ldr q17, [%[inptr], #0x70]\n"
606 "fmin v13.4s, v13.4s, v0.4s\n"
607 "ldr q2, [%[outptr2], #0x20]\n"
608 "fmin v14.4s, v14.4s, v0.4s\n"
609 "str q11, [%[outptr0], #0x10]\n"
610 "fadd v15.4s, v15.4s, v7.4s\n"
611 "ldr q10, [%[inptr], #0x80]\n"
612 "fadd v16.4s, v16.4s, v8.4s\n"
613 "ldr q3, [%[outptr3]]\n"
614 "fmax v13.4s, v13.4s, v1.4s\n"
615 "str q12, [%[outptr0], #0x20]\n"
616 "fmax v14.4s, v14.4s, v1.4s\n"
617 "ldr q11, [%[inptr], #0x90]\n"
618 "fmin v15.4s, v15.4s, v0.4s\n"
619 "ldr q4, [%[outptr3], #0x10]\n"
620 "fmin v16.4s, v16.4s, v0.4s\n"
621 "str q13, [%[outptr1]]\n"
622 "fadd v17.4s, v17.4s, v9.4s\n"
623 "ldr q12, [%[inptr], #0xa0]\n"
624 "fadd v10.4s, v10.4s, v2.4s\n"
625 "ldr q5, [%[outptr3], #0x20]\n"
626 "fmax v15.4s, v15.4s, v1.4s\n"
627 "str q14, [%[outptr1], #0x10]\n"
628 "fmax v16.4s, v16.4s, v1.4s\n"
629 "ldr q13, [%[inptr], #0xb0]\n"
630 "fmin v17.4s, v17.4s, v0.4s\n"
631 "ldr q6, [%[outptr4]]\n"
632 "fmin v10.4s, v10.4s, v0.4s\n"
633 "str q15, [%[outptr1], #0x20]\n"
634 "fadd v11.4s, v11.4s, v3.4s\n"
635 "ldr q14, [%[inptr], #0xc0]\n"
636 "fadd v12.4s, v12.4s, v4.4s\n"
637 "ldr q7, [%[outptr4], #0x10]\n"
638 "fmax v17.4s, v17.4s, v1.4s\n"
639 "str q16, [%[outptr2]]\n"
640 "fmax v10.4s, v10.4s, v1.4s\n"
641 "ldr q15, [%[inptr], #0xd0]\n"
642 "fmin v11.4s, v11.4s, v0.4s\n"
643 "ldr q8, [%[outptr4], #0x20]\n"
644 "fmin v12.4s, v12.4s, v0.4s\n"
645 "str q17, [%[outptr2], #0x10]\n"
646 "fadd v13.4s, v13.4s, v5.4s\n"
647 "ldr q16, [%[inptr], #0xe0]\n"
648 "fadd v14.4s, v14.4s, v6.4s\n"
649 "ldr q9, [%[outptr5]]\n"
650 "fmax v11.4s, v11.4s, v1.4s\n"
651 "str q10, [%[outptr2], #0x20]\n"
652 "fmax v12.4s, v12.4s, v1.4s\n"
653 "ldr q17, [%[inptr], #0xf0]\n"
654 "fmin v13.4s, v13.4s, v0.4s\n"
655 "ldr q2, [%[outptr5], #0x10]\n"
656 "fmin v14.4s, v14.4s, v0.4s\n"
657 "str q11, [%[outptr3]]\n"
658 "fadd v15.4s, v15.4s, v7.4s\n"
659 "ldr q10, [%[inptr], #0x100]\n"
660 "fadd v16.4s, v16.4s, v8.4s\n"
661 "ldr q3, [%[outptr5], #0x20]\n"
662 "fmax v13.4s, v13.4s, v1.4s\n"
663 "str q12, [%[outptr3], #0x10]\n"
664 "fmax v14.4s, v14.4s, v1.4s\n"
665 "ldr q11, [%[inptr], #0x110]\n"
666 "fmin v15.4s, v15.4s, v0.4s\n"
667 "add %[outptr0], %[outptr0], #0x30\n"
668 "fmin v16.4s, v16.4s, v0.4s\n"
669 "str q13, [%[outptr3], #0x20]\n"
670 "fadd v17.4s, v17.4s, v9.4s\n"
671 "add %[outptr1], %[outptr1], #0x30\n"
672 "fmax v15.4s, v15.4s, v1.4s\n"
673 "str q14, [%[outptr4]]\n"
674 "fmax v16.4s, v16.4s, v1.4s\n"
675 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
676 "fmin v17.4s, v17.4s, v0.4s\n"
677 "str q15, [%[outptr4], #0x10]\n"
678 "fadd v10.4s, v10.4s, v2.4s\n"
679 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
680 "fadd v11.4s, v11.4s, v3.4s\n"
681 "str q16, [%[outptr4], #0x20]\n"
682 "fmax v17.4s, v17.4s, v1.4s\n"
683 "add %[outptr2], %[outptr2], #0x30\n"
684 "fmin v10.4s, v10.4s, v0.4s\n"
685 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
686 "fmin v11.4s, v11.4s, v0.4s\n"
687 "str q17, [%[outptr5]]\n"
688 "add %[outptr3], %[outptr3], #0x30\n"
689 "fmax v10.4s, v10.4s, v1.4s\n"
690 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
691 "fmax v11.4s, v11.4s, v1.4s\n"
692 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
693 "str q10, [%[outptr5], #0x10]\n"
694 "add %[outptr4], %[outptr4], #0x30\n"
695 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
696 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
697 "str q11, [%[outptr5], #0x20]\n"
698 "add %[outptr5], %[outptr5], #0x30\n"
699 "add %[inptr], %[inptr], #0x180\n"
700 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
701 [inptr] "+r" (inptr)
702 : [minval] "w" (minval), [maxval] "w" (maxval)
703 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
704 );
705 }
706 }
707 break;
708
709 case 7:
710 {
711 if ((i+11) >= xmax)
712 {
713 for (int xi=0; xi<11; xi++)
714 {
715 if ((i+xi) < xmax)
716 {
717 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
718 outptr0++;
719 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
720 outptr1++;
721 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
722 outptr2++;
723 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
724 outptr3++;
725 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
726 outptr4++;
727 *outptr5 = std::min(std::max(minval, inptr[xi + 60] + *outptr5), maxval);
728 outptr5++;
729 *outptr6 = std::min(std::max(minval, inptr[xi + 72] + *outptr6), maxval);
730 outptr6++;
731 }
732 }
733 inptr += 96;
734 } else {
735 /* Optimized routine to copy an entire block */
736 __asm __volatile (
737 "dup v0.4s, %[maxval].s[0]\n"
738 "ldr q2, [%[outptr0]]\n"
739 "dup v1.4s, %[minval].s[0]\n"
740 "ldr q10, [%[inptr]]\n"
741 "ldr q3, [%[outptr0], #0x10]\n"
742 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
743 "ldr q11, [%[inptr], #0x10]\n"
744 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
745 "fadd v10.4s, v10.4s, v2.4s\n"
746 "ldr q4, [%[outptr0], #0x20]\n"
747 "ldr q12, [%[inptr], #0x20]\n"
748 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
749 "fadd v11.4s, v11.4s, v3.4s\n"
750 "ldr q5, [%[outptr1]]\n"
751 "fmin v10.4s, v10.4s, v0.4s\n"
752 "ldr q13, [%[inptr], #0x30]\n"
753 "fadd v12.4s, v12.4s, v4.4s\n"
754 "ldr q6, [%[outptr1], #0x10]\n"
755 "ldr q14, [%[inptr], #0x40]\n"
756 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
757 "fmax v10.4s, v10.4s, v1.4s\n"
758 "ldr q7, [%[outptr1], #0x20]\n"
759 "fmin v11.4s, v11.4s, v0.4s\n"
760 "ldr q15, [%[inptr], #0x50]\n"
761 "fmin v12.4s, v12.4s, v0.4s\n"
762 "ldr q8, [%[outptr2]]\n"
763 "fadd v13.4s, v13.4s, v5.4s\n"
764 "str q10, [%[outptr0]]\n"
765 "fadd v14.4s, v14.4s, v6.4s\n"
766 "ldr q16, [%[inptr], #0x60]\n"
767 "fmax v11.4s, v11.4s, v1.4s\n"
768 "ldr q9, [%[outptr2], #0x10]\n"
769 "fmax v12.4s, v12.4s, v1.4s\n"
770 "ldr q17, [%[inptr], #0x70]\n"
771 "fmin v13.4s, v13.4s, v0.4s\n"
772 "ldr q2, [%[outptr2], #0x20]\n"
773 "fmin v14.4s, v14.4s, v0.4s\n"
774 "str q11, [%[outptr0], #0x10]\n"
775 "fadd v15.4s, v15.4s, v7.4s\n"
776 "ldr q10, [%[inptr], #0x80]\n"
777 "fadd v16.4s, v16.4s, v8.4s\n"
778 "ldr q3, [%[outptr3]]\n"
779 "fmax v13.4s, v13.4s, v1.4s\n"
780 "str q12, [%[outptr0], #0x20]\n"
781 "fmax v14.4s, v14.4s, v1.4s\n"
782 "ldr q11, [%[inptr], #0x90]\n"
783 "fmin v15.4s, v15.4s, v0.4s\n"
784 "ldr q4, [%[outptr3], #0x10]\n"
785 "fmin v16.4s, v16.4s, v0.4s\n"
786 "str q13, [%[outptr1]]\n"
787 "fadd v17.4s, v17.4s, v9.4s\n"
788 "ldr q12, [%[inptr], #0xa0]\n"
789 "fadd v10.4s, v10.4s, v2.4s\n"
790 "ldr q5, [%[outptr3], #0x20]\n"
791 "fmax v15.4s, v15.4s, v1.4s\n"
792 "str q14, [%[outptr1], #0x10]\n"
793 "fmax v16.4s, v16.4s, v1.4s\n"
794 "ldr q13, [%[inptr], #0xb0]\n"
795 "fmin v17.4s, v17.4s, v0.4s\n"
796 "ldr q6, [%[outptr4]]\n"
797 "fmin v10.4s, v10.4s, v0.4s\n"
798 "str q15, [%[outptr1], #0x20]\n"
799 "fadd v11.4s, v11.4s, v3.4s\n"
800 "ldr q14, [%[inptr], #0xc0]\n"
801 "fadd v12.4s, v12.4s, v4.4s\n"
802 "ldr q7, [%[outptr4], #0x10]\n"
803 "fmax v17.4s, v17.4s, v1.4s\n"
804 "str q16, [%[outptr2]]\n"
805 "fmax v10.4s, v10.4s, v1.4s\n"
806 "ldr q15, [%[inptr], #0xd0]\n"
807 "fmin v11.4s, v11.4s, v0.4s\n"
808 "ldr q8, [%[outptr4], #0x20]\n"
809 "fmin v12.4s, v12.4s, v0.4s\n"
810 "str q17, [%[outptr2], #0x10]\n"
811 "fadd v13.4s, v13.4s, v5.4s\n"
812 "ldr q16, [%[inptr], #0xe0]\n"
813 "fadd v14.4s, v14.4s, v6.4s\n"
814 "ldr q9, [%[outptr5]]\n"
815 "fmax v11.4s, v11.4s, v1.4s\n"
816 "str q10, [%[outptr2], #0x20]\n"
817 "fmax v12.4s, v12.4s, v1.4s\n"
818 "ldr q17, [%[inptr], #0xf0]\n"
819 "fmin v13.4s, v13.4s, v0.4s\n"
820 "ldr q2, [%[outptr5], #0x10]\n"
821 "fmin v14.4s, v14.4s, v0.4s\n"
822 "str q11, [%[outptr3]]\n"
823 "fadd v15.4s, v15.4s, v7.4s\n"
824 "ldr q10, [%[inptr], #0x100]\n"
825 "fadd v16.4s, v16.4s, v8.4s\n"
826 "ldr q3, [%[outptr5], #0x20]\n"
827 "fmax v13.4s, v13.4s, v1.4s\n"
828 "str q12, [%[outptr3], #0x10]\n"
829 "fmax v14.4s, v14.4s, v1.4s\n"
830 "ldr q11, [%[inptr], #0x110]\n"
831 "fmin v15.4s, v15.4s, v0.4s\n"
832 "ldr q4, [%[outptr6]]\n"
833 "fmin v16.4s, v16.4s, v0.4s\n"
834 "str q13, [%[outptr3], #0x20]\n"
835 "fadd v17.4s, v17.4s, v9.4s\n"
836 "ldr q12, [%[inptr], #0x120]\n"
837 "fadd v10.4s, v10.4s, v2.4s\n"
838 "ldr q5, [%[outptr6], #0x10]\n"
839 "fmax v15.4s, v15.4s, v1.4s\n"
840 "str q14, [%[outptr4]]\n"
841 "fmax v16.4s, v16.4s, v1.4s\n"
842 "ldr q13, [%[inptr], #0x130]\n"
843 "fmin v17.4s, v17.4s, v0.4s\n"
844 "ldr q6, [%[outptr6], #0x20]\n"
845 "fmin v10.4s, v10.4s, v0.4s\n"
846 "str q15, [%[outptr4], #0x10]\n"
847 "fadd v11.4s, v11.4s, v3.4s\n"
848 "ldr q14, [%[inptr], #0x140]\n"
849 "fadd v12.4s, v12.4s, v4.4s\n"
850 "add %[outptr0], %[outptr0], #0x30\n"
851 "fmax v17.4s, v17.4s, v1.4s\n"
852 "str q16, [%[outptr4], #0x20]\n"
853 "fmax v10.4s, v10.4s, v1.4s\n"
854 "add %[outptr1], %[outptr1], #0x30\n"
855 "fmin v11.4s, v11.4s, v0.4s\n"
856 "str q17, [%[outptr5]]\n"
857 "fmin v12.4s, v12.4s, v0.4s\n"
858 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
859 "fadd v13.4s, v13.4s, v5.4s\n"
860 "str q10, [%[outptr5], #0x10]\n"
861 "fmax v11.4s, v11.4s, v1.4s\n"
862 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
863 "fmax v12.4s, v12.4s, v1.4s\n"
864 "add %[outptr2], %[outptr2], #0x30\n"
865 "fmin v13.4s, v13.4s, v0.4s\n"
866 "str q11, [%[outptr5], #0x20]\n"
867 "fadd v14.4s, v14.4s, v6.4s\n"
868 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
869 "add %[outptr3], %[outptr3], #0x30\n"
870 "fmax v13.4s, v13.4s, v1.4s\n"
871 "str q12, [%[outptr6]]\n"
872 "fmin v14.4s, v14.4s, v0.4s\n"
873 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
874 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
875 "str q13, [%[outptr6], #0x10]\n"
876 "add %[outptr4], %[outptr4], #0x30\n"
877 "fmax v14.4s, v14.4s, v1.4s\n"
878 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
879 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
880 "add %[outptr5], %[outptr5], #0x30\n"
881 "str q14, [%[outptr6], #0x20]\n"
882 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
883 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
884 "add %[outptr6], %[outptr6], #0x30\n"
885 "add %[inptr], %[inptr], #0x180\n"
886 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
887 [inptr] "+r" (inptr)
888 : [minval] "w" (minval), [maxval] "w" (maxval)
889 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
890 );
891 }
892 }
893 break;
894
895 default:
896 case 8:
897 {
898 if ((i+11) >= xmax)
899 {
900 for (int xi=0; xi<11; xi++)
901 {
902 if ((i+xi) < xmax)
903 {
904 *outptr0 = std::min(std::max(minval, inptr[xi] + *outptr0), maxval);
905 outptr0++;
906 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + *outptr1), maxval);
907 outptr1++;
908 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + *outptr2), maxval);
909 outptr2++;
910 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + *outptr3), maxval);
911 outptr3++;
912 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + *outptr4), maxval);
913 outptr4++;
914 *outptr5 = std::min(std::max(minval, inptr[xi + 60] + *outptr5), maxval);
915 outptr5++;
916 *outptr6 = std::min(std::max(minval, inptr[xi + 72] + *outptr6), maxval);
917 outptr6++;
918 *outptr7 = std::min(std::max(minval, inptr[xi + 84] + *outptr7), maxval);
919 outptr7++;
920 }
921 }
922 inptr += 96;
923 } else {
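924 /* Vectorized path for a full 12-column block: add the input to the existing contents of all 8 output rows, clamp to [minval, maxval] and store back */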
925 __asm __volatile (
926 "dup v0.4s, %[maxval].s[0]\n"
927 "ldr q2, [%[outptr0]]\n"
928 "dup v1.4s, %[minval].s[0]\n"
929 "ldr q10, [%[inptr]]\n"
930 "ldr q3, [%[outptr0], #0x10]\n"
931 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
932 "ldr q11, [%[inptr], #0x10]\n"
933 "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
934 "fadd v10.4s, v10.4s, v2.4s\n"
935 "ldr q4, [%[outptr0], #0x20]\n"
936 "ldr q12, [%[inptr], #0x20]\n"
937 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
938 "fadd v11.4s, v11.4s, v3.4s\n"
939 "ldr q5, [%[outptr1]]\n"
940 "fmin v10.4s, v10.4s, v0.4s\n"
941 "ldr q13, [%[inptr], #0x30]\n"
942 "fadd v12.4s, v12.4s, v4.4s\n"
943 "ldr q6, [%[outptr1], #0x10]\n"
944 "ldr q14, [%[inptr], #0x40]\n"
945 "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
946 "fmax v10.4s, v10.4s, v1.4s\n"
947 "ldr q7, [%[outptr1], #0x20]\n"
948 "fmin v11.4s, v11.4s, v0.4s\n"
949 "ldr q15, [%[inptr], #0x50]\n"
950 "fmin v12.4s, v12.4s, v0.4s\n"
951 "ldr q8, [%[outptr2]]\n"
952 "fadd v13.4s, v13.4s, v5.4s\n"
953 "str q10, [%[outptr0]]\n"
954 "fadd v14.4s, v14.4s, v6.4s\n"
955 "ldr q16, [%[inptr], #0x60]\n"
956 "fmax v11.4s, v11.4s, v1.4s\n"
957 "ldr q9, [%[outptr2], #0x10]\n"
958 "fmax v12.4s, v12.4s, v1.4s\n"
959 "ldr q17, [%[inptr], #0x70]\n"
960 "fmin v13.4s, v13.4s, v0.4s\n"
961 "ldr q2, [%[outptr2], #0x20]\n"
962 "fmin v14.4s, v14.4s, v0.4s\n"
963 "str q11, [%[outptr0], #0x10]\n"
964 "fadd v15.4s, v15.4s, v7.4s\n"
965 "ldr q10, [%[inptr], #0x80]\n"
966 "fadd v16.4s, v16.4s, v8.4s\n"
967 "ldr q3, [%[outptr3]]\n"
968 "fmax v13.4s, v13.4s, v1.4s\n"
969 "str q12, [%[outptr0], #0x20]\n"
970 "fmax v14.4s, v14.4s, v1.4s\n"
971 "ldr q11, [%[inptr], #0x90]\n"
972 "fmin v15.4s, v15.4s, v0.4s\n"
973 "ldr q4, [%[outptr3], #0x10]\n"
974 "fmin v16.4s, v16.4s, v0.4s\n"
975 "str q13, [%[outptr1]]\n"
976 "fadd v17.4s, v17.4s, v9.4s\n"
977 "ldr q12, [%[inptr], #0xa0]\n"
978 "fadd v10.4s, v10.4s, v2.4s\n"
979 "ldr q5, [%[outptr3], #0x20]\n"
980 "fmax v15.4s, v15.4s, v1.4s\n"
981 "str q14, [%[outptr1], #0x10]\n"
982 "fmax v16.4s, v16.4s, v1.4s\n"
983 "ldr q13, [%[inptr], #0xb0]\n"
984 "fmin v17.4s, v17.4s, v0.4s\n"
985 "ldr q6, [%[outptr4]]\n"
986 "fmin v10.4s, v10.4s, v0.4s\n"
987 "str q15, [%[outptr1], #0x20]\n"
988 "fadd v11.4s, v11.4s, v3.4s\n"
989 "ldr q14, [%[inptr], #0xc0]\n"
990 "fadd v12.4s, v12.4s, v4.4s\n"
991 "ldr q7, [%[outptr4], #0x10]\n"
992 "fmax v17.4s, v17.4s, v1.4s\n"
993 "str q16, [%[outptr2]]\n"
994 "fmax v10.4s, v10.4s, v1.4s\n"
995 "ldr q15, [%[inptr], #0xd0]\n"
996 "fmin v11.4s, v11.4s, v0.4s\n"
997 "ldr q8, [%[outptr4], #0x20]\n"
998 "fmin v12.4s, v12.4s, v0.4s\n"
999 "str q17, [%[outptr2], #0x10]\n"
1000 "fadd v13.4s, v13.4s, v5.4s\n"
1001 "ldr q16, [%[inptr], #0xe0]\n"
1002 "fadd v14.4s, v14.4s, v6.4s\n"
1003 "ldr q9, [%[outptr5]]\n"
1004 "fmax v11.4s, v11.4s, v1.4s\n"
1005 "str q10, [%[outptr2], #0x20]\n"
1006 "fmax v12.4s, v12.4s, v1.4s\n"
1007 "ldr q17, [%[inptr], #0xf0]\n"
1008 "fmin v13.4s, v13.4s, v0.4s\n"
1009 "ldr q2, [%[outptr5], #0x10]\n"
1010 "fmin v14.4s, v14.4s, v0.4s\n"
1011 "str q11, [%[outptr3]]\n"
1012 "fadd v15.4s, v15.4s, v7.4s\n"
1013 "ldr q10, [%[inptr], #0x100]\n"
1014 "fadd v16.4s, v16.4s, v8.4s\n"
1015 "ldr q3, [%[outptr5], #0x20]\n"
1016 "fmax v13.4s, v13.4s, v1.4s\n"
1017 "str q12, [%[outptr3], #0x10]\n"
1018 "fmax v14.4s, v14.4s, v1.4s\n"
1019 "ldr q11, [%[inptr], #0x110]\n"
1020 "fmin v15.4s, v15.4s, v0.4s\n"
1021 "ldr q4, [%[outptr6]]\n"
1022 "fmin v16.4s, v16.4s, v0.4s\n"
1023 "str q13, [%[outptr3], #0x20]\n"
1024 "fadd v17.4s, v17.4s, v9.4s\n"
1025 "ldr q12, [%[inptr], #0x120]\n"
1026 "fadd v10.4s, v10.4s, v2.4s\n"
1027 "ldr q5, [%[outptr6], #0x10]\n"
1028 "fmax v15.4s, v15.4s, v1.4s\n"
1029 "str q14, [%[outptr4]]\n"
1030 "fmax v16.4s, v16.4s, v1.4s\n"
1031 "ldr q13, [%[inptr], #0x130]\n"
1032 "fmin v17.4s, v17.4s, v0.4s\n"
1033 "ldr q6, [%[outptr6], #0x20]\n"
1034 "fmin v10.4s, v10.4s, v0.4s\n"
1035 "str q15, [%[outptr4], #0x10]\n"
1036 "fadd v11.4s, v11.4s, v3.4s\n"
1037 "ldr q14, [%[inptr], #0x140]\n"
1038 "fadd v12.4s, v12.4s, v4.4s\n"
1039 "ldr q7, [%[outptr7]]\n"
1040 "fmax v17.4s, v17.4s, v1.4s\n"
1041 "str q16, [%[outptr4], #0x20]\n"
1042 "fmax v10.4s, v10.4s, v1.4s\n"
1043 "ldr q15, [%[inptr], #0x150]\n"
1044 "fmin v11.4s, v11.4s, v0.4s\n"
1045 "ldr q8, [%[outptr7], #0x10]\n"
1046 "fmin v12.4s, v12.4s, v0.4s\n"
1047 "str q17, [%[outptr5]]\n"
1048 "fadd v13.4s, v13.4s, v5.4s\n"
1049 "ldr q16, [%[inptr], #0x160]\n"
1050 "fadd v14.4s, v14.4s, v6.4s\n"
1051 "ldr q9, [%[outptr7], #0x20]\n"
1052 "fmax v11.4s, v11.4s, v1.4s\n"
1053 "str q10, [%[outptr5], #0x10]\n"
1054 "fmax v12.4s, v12.4s, v1.4s\n"
1055 "ldr q17, [%[inptr], #0x170]\n"
1056 "fmin v13.4s, v13.4s, v0.4s\n"
1057 "add %[outptr0], %[outptr0], #0x30\n"
1058 "fmin v14.4s, v14.4s, v0.4s\n"
1059 "str q11, [%[outptr5], #0x20]\n"
1060 "fadd v15.4s, v15.4s, v7.4s\n"
1061 "add %[outptr1], %[outptr1], #0x30\n"
1062 "fmax v13.4s, v13.4s, v1.4s\n"
1063 "str q12, [%[outptr6]]\n"
1064 "fmax v14.4s, v14.4s, v1.4s\n"
1065 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1066 "fmin v15.4s, v15.4s, v0.4s\n"
1067 "str q13, [%[outptr6], #0x10]\n"
1068 "fadd v16.4s, v16.4s, v8.4s\n"
1069 "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
1070 "fadd v17.4s, v17.4s, v9.4s\n"
1071 "str q14, [%[outptr6], #0x20]\n"
1072 "fmax v15.4s, v15.4s, v1.4s\n"
1073 "add %[outptr2], %[outptr2], #0x30\n"
1074 "fmin v16.4s, v16.4s, v0.4s\n"
1075 "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
1076 "fmin v17.4s, v17.4s, v0.4s\n"
1077 "str q15, [%[outptr7]]\n"
1078 "add %[outptr3], %[outptr3], #0x30\n"
1079 "fmax v16.4s, v16.4s, v1.4s\n"
1080 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1081 "fmax v17.4s, v17.4s, v1.4s\n"
1082 "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
1083 "str q16, [%[outptr7], #0x10]\n"
1084 "add %[outptr4], %[outptr4], #0x30\n"
1085 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1086 "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
1087 "str q17, [%[outptr7], #0x20]\n"
1088 "add %[outptr5], %[outptr5], #0x30\n"
1089 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1090 "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
1091 "add %[outptr6], %[outptr6], #0x30\n"
1092 "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
1093 "add %[outptr7], %[outptr7], #0x30\n"
1094 "add %[inptr], %[inptr], #0x180\n"
1095 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1096 [inptr] "+r" (inptr)
1097 : [minval] "w" (minval), [maxval] "w" (maxval)
1098 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1099 );
1100 }
1101 }
1102 break;
1103
1104
1105 }
1106 }
1107 else
1108 {
1109 const float *biasptr = bias ? bias + i : nullbias;
1110
1111 switch(height)
1112 {
1113 case 1:
1114 {
1115 if ((i+11) >= xmax)
1116 {
1117 for (int xi=0; xi<11; xi++)
1118 {
1119 if ((i+xi) < xmax)
1120 {
1121 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1122 outptr0++;
1123 }
1124 }
1125 inptr += 96;
1126 } else {
1127 /* Vectorized path for a full 12-column block: add the bias to one row of input, clamp to [minval, maxval] and store to outptr0 */
1128 __asm __volatile (
1129 "dup v0.4s, %[maxval].s[0]\n"
1130 "ldr q2, [%[biasptr]]\n"
1131 "dup v1.4s, %[minval].s[0]\n"
1132 "ldr q3, [%[biasptr], #0x10]\n"
1133 "ldr q4, [%[biasptr], #0x20]\n"
1134 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1135 "ldr q13, [%[inptr]]\n"
1136 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1137 "ldr q14, [%[inptr], #0x10]\n"
1138 "ldr q15, [%[inptr], #0x20]\n"
1139 "add %[inptr], %[inptr], #0x180\n"
1140 "fadd v13.4s, v13.4s, v2.4s\n"
1141 "fadd v14.4s, v14.4s, v3.4s\n"
1142 "fadd v15.4s, v15.4s, v4.4s\n"
1143 "fmin v13.4s, v13.4s, v0.4s\n"
1144 "fmin v14.4s, v14.4s, v0.4s\n"
1145 "fmin v15.4s, v15.4s, v0.4s\n"
1146 "fmax v13.4s, v13.4s, v1.4s\n"
1147 "fmax v14.4s, v14.4s, v1.4s\n"
1148 "fmax v15.4s, v15.4s, v1.4s\n"
1149 "str q13, [%[outptr0]]\n"
1150 "str q14, [%[outptr0], #0x10]\n"
1151 "str q15, [%[outptr0], #0x20]\n"
1152 "add %[outptr0], %[outptr0], #0x30\n"
1153 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1154 [inptr] "+r" (inptr)
1155 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1156 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1157 );
1158 }
1159 }
1160 break;
1161
1162 case 2:
1163 {
1164 if ((i+11) >= xmax)
1165 {
1166 for (int xi=0; xi<11; xi++)
1167 {
1168 if ((i+xi) < xmax)
1169 {
1170 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1171 outptr0++;
1172 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1173 outptr1++;
1174 }
1175 }
1176 inptr += 96;
1177 } else {
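1178 /* Vectorized path for a full 12-column block: add the same 12 bias values to each of 2 input rows, clamp to [minval, maxval] and store to outptr0..outptr1 */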
1179 __asm __volatile (
1180 "dup v0.4s, %[maxval].s[0]\n"
1181 "ldr q2, [%[biasptr]]\n"
1182 "dup v1.4s, %[minval].s[0]\n"
1183 "ldr q3, [%[biasptr], #0x10]\n"
1184 "ldr q4, [%[biasptr], #0x20]\n"
1185 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1186 "ldr q13, [%[inptr]]\n"
1187 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1188 "ldr q14, [%[inptr], #0x10]\n"
1189 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1190 "fadd v13.4s, v13.4s, v2.4s\n"
1191 "ldr q15, [%[inptr], #0x20]\n"
1192 "ldr q16, [%[inptr], #0x30]\n"
1193 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1194 "fadd v14.4s, v14.4s, v3.4s\n"
1195 "ldr q17, [%[inptr], #0x40]\n"
1196 "fmin v13.4s, v13.4s, v0.4s\n"
1197 "ldr q18, [%[inptr], #0x50]\n"
1198 "fadd v15.4s, v15.4s, v4.4s\n"
1199 "add %[inptr], %[inptr], #0x180\n"
1200 "fmin v14.4s, v14.4s, v0.4s\n"
1201 "fmax v13.4s, v13.4s, v1.4s\n"
1202 "fmin v15.4s, v15.4s, v0.4s\n"
1203 "fadd v16.4s, v16.4s, v2.4s\n"
1204 "fmax v14.4s, v14.4s, v1.4s\n"
1205 "str q13, [%[outptr0]]\n"
1206 "fadd v17.4s, v17.4s, v3.4s\n"
1207 "fmax v15.4s, v15.4s, v1.4s\n"
1208 "fmin v16.4s, v16.4s, v0.4s\n"
1209 "str q14, [%[outptr0], #0x10]\n"
1210 "fadd v18.4s, v18.4s, v4.4s\n"
1211 "fmin v17.4s, v17.4s, v0.4s\n"
1212 "fmax v16.4s, v16.4s, v1.4s\n"
1213 "str q15, [%[outptr0], #0x20]\n"
1214 "fmin v18.4s, v18.4s, v0.4s\n"
1215 "add %[outptr0], %[outptr0], #0x30\n"
1216 "fmax v17.4s, v17.4s, v1.4s\n"
1217 "str q16, [%[outptr1]]\n"
1218 "fmax v18.4s, v18.4s, v1.4s\n"
1219 "str q17, [%[outptr1], #0x10]\n"
1220 "str q18, [%[outptr1], #0x20]\n"
1221 "add %[outptr1], %[outptr1], #0x30\n"
1222 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1223 [inptr] "+r" (inptr)
1224 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1225 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1226 );
1227 }
1228 }
1229 break;
1230
1231 case 3:
1232 {
1233 if ((i+11) >= xmax)
1234 {
1235 for (int xi=0; xi<11; xi++)
1236 {
1237 if ((i+xi) < xmax)
1238 {
1239 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1240 outptr0++;
1241 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1242 outptr1++;
1243 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1244 outptr2++;
1245 }
1246 }
1247 inptr += 96;
1248 } else {
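1249 /* Vectorized path for a full 12-column block: add the same 12 bias values to each of 3 input rows, clamp to [minval, maxval] and store to outptr0..outptr2 */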
1250 __asm __volatile (
1251 "dup v0.4s, %[maxval].s[0]\n"
1252 "ldr q2, [%[biasptr]]\n"
1253 "dup v1.4s, %[minval].s[0]\n"
1254 "ldr q3, [%[biasptr], #0x10]\n"
1255 "ldr q4, [%[biasptr], #0x20]\n"
1256 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1257 "ldr q13, [%[inptr]]\n"
1258 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1259 "ldr q14, [%[inptr], #0x10]\n"
1260 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1261 "fadd v13.4s, v13.4s, v2.4s\n"
1262 "ldr q15, [%[inptr], #0x20]\n"
1263 "ldr q16, [%[inptr], #0x30]\n"
1264 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1265 "fadd v14.4s, v14.4s, v3.4s\n"
1266 "ldr q17, [%[inptr], #0x40]\n"
1267 "fmin v13.4s, v13.4s, v0.4s\n"
1268 "ldr q18, [%[inptr], #0x50]\n"
1269 "fadd v15.4s, v15.4s, v4.4s\n"
1270 "ldr q19, [%[inptr], #0x60]\n"
1271 "fadd v16.4s, v16.4s, v2.4s\n"
1272 "ldr q20, [%[inptr], #0x70]\n"
1273 "fmin v14.4s, v14.4s, v0.4s\n"
1274 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1275 "fmax v13.4s, v13.4s, v1.4s\n"
1276 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1277 "fmax v14.4s, v14.4s, v1.4s\n"
1278 "fmin v15.4s, v15.4s, v0.4s\n"
1279 "str q13, [%[outptr0]]\n"
1280 "fmin v16.4s, v16.4s, v0.4s\n"
1281 "ldr q13, [%[inptr], #0x80]\n"
1282 "fadd v17.4s, v17.4s, v3.4s\n"
1283 "add %[inptr], %[inptr], #0x180\n"
1284 "fmax v15.4s, v15.4s, v1.4s\n"
1285 "str q14, [%[outptr0], #0x10]\n"
1286 "fmax v16.4s, v16.4s, v1.4s\n"
1287 "fmin v17.4s, v17.4s, v0.4s\n"
1288 "fadd v18.4s, v18.4s, v4.4s\n"
1289 "str q15, [%[outptr0], #0x20]\n"
1290 "fadd v19.4s, v19.4s, v2.4s\n"
1291 "add %[outptr0], %[outptr0], #0x30\n"
1292 "fmax v17.4s, v17.4s, v1.4s\n"
1293 "str q16, [%[outptr1]]\n"
1294 "fmin v18.4s, v18.4s, v0.4s\n"
1295 "fmin v19.4s, v19.4s, v0.4s\n"
1296 "fadd v20.4s, v20.4s, v3.4s\n"
1297 "str q17, [%[outptr1], #0x10]\n"
1298 "fadd v13.4s, v13.4s, v4.4s\n"
1299 "fmax v18.4s, v18.4s, v1.4s\n"
1300 "fmax v19.4s, v19.4s, v1.4s\n"
1301 "fmin v20.4s, v20.4s, v0.4s\n"
1302 "fmin v13.4s, v13.4s, v0.4s\n"
1303 "str q18, [%[outptr1], #0x20]\n"
1304 "add %[outptr1], %[outptr1], #0x30\n"
1305 "fmax v20.4s, v20.4s, v1.4s\n"
1306 "str q19, [%[outptr2]]\n"
1307 "fmax v13.4s, v13.4s, v1.4s\n"
1308 "str q20, [%[outptr2], #0x10]\n"
1309 "str q13, [%[outptr2], #0x20]\n"
1310 "add %[outptr2], %[outptr2], #0x30\n"
1311 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1312 [inptr] "+r" (inptr)
1313 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1314 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1315 );
1316 }
1317 }
1318 break;
1319
1320 case 4:
1321 {
1322 if ((i+11) >= xmax)
1323 {
1324 for (int xi=0; xi<11; xi++)
1325 {
1326 if ((i+xi) < xmax)
1327 {
1328 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1329 outptr0++;
1330 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1331 outptr1++;
1332 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1333 outptr2++;
1334 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1335 outptr3++;
1336 }
1337 }
1338 inptr += 96;
1339 } else {
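1340 /* Vectorized path for a full 12-column block: add the same 12 bias values to each of 4 input rows, clamp to [minval, maxval] and store to outptr0..outptr3 */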
1341 __asm __volatile (
1342 "dup v0.4s, %[maxval].s[0]\n"
1343 "ldr q2, [%[biasptr]]\n"
1344 "dup v1.4s, %[minval].s[0]\n"
1345 "ldr q3, [%[biasptr], #0x10]\n"
1346 "ldr q4, [%[biasptr], #0x20]\n"
1347 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1348 "ldr q13, [%[inptr]]\n"
1349 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1350 "ldr q14, [%[inptr], #0x10]\n"
1351 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1352 "fadd v13.4s, v13.4s, v2.4s\n"
1353 "ldr q15, [%[inptr], #0x20]\n"
1354 "ldr q16, [%[inptr], #0x30]\n"
1355 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1356 "fadd v14.4s, v14.4s, v3.4s\n"
1357 "ldr q17, [%[inptr], #0x40]\n"
1358 "fmin v13.4s, v13.4s, v0.4s\n"
1359 "ldr q18, [%[inptr], #0x50]\n"
1360 "fadd v15.4s, v15.4s, v4.4s\n"
1361 "ldr q19, [%[inptr], #0x60]\n"
1362 "fadd v16.4s, v16.4s, v2.4s\n"
1363 "ldr q20, [%[inptr], #0x70]\n"
1364 "fmin v14.4s, v14.4s, v0.4s\n"
1365 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1366 "fmax v13.4s, v13.4s, v1.4s\n"
1367 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1368 "fmax v14.4s, v14.4s, v1.4s\n"
1369 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1370 "fmin v15.4s, v15.4s, v0.4s\n"
1371 "str q13, [%[outptr0]]\n"
1372 "fmin v16.4s, v16.4s, v0.4s\n"
1373 "ldr q13, [%[inptr], #0x80]\n"
1374 "fadd v17.4s, v17.4s, v3.4s\n"
1375 "fadd v18.4s, v18.4s, v4.4s\n"
1376 "str q14, [%[outptr0], #0x10]\n"
1377 "fmax v15.4s, v15.4s, v1.4s\n"
1378 "ldr q14, [%[inptr], #0x90]\n"
1379 "fmax v16.4s, v16.4s, v1.4s\n"
1380 "fmin v17.4s, v17.4s, v0.4s\n"
1381 "fmin v18.4s, v18.4s, v0.4s\n"
1382 "str q15, [%[outptr0], #0x20]\n"
1383 "fadd v19.4s, v19.4s, v2.4s\n"
1384 "ldr q15, [%[inptr], #0xa0]\n"
1385 "fadd v20.4s, v20.4s, v3.4s\n"
1386 "add %[outptr0], %[outptr0], #0x30\n"
1387 "fmax v17.4s, v17.4s, v1.4s\n"
1388 "str q16, [%[outptr1]]\n"
1389 "fmax v18.4s, v18.4s, v1.4s\n"
1390 "ldr q16, [%[inptr], #0xb0]\n"
1391 "fmin v19.4s, v19.4s, v0.4s\n"
1392 "add %[inptr], %[inptr], #0x180\n"
1393 "fmin v20.4s, v20.4s, v0.4s\n"
1394 "str q17, [%[outptr1], #0x10]\n"
1395 "fadd v13.4s, v13.4s, v4.4s\n"
1396 "fmax v19.4s, v19.4s, v1.4s\n"
1397 "fadd v14.4s, v14.4s, v2.4s\n"
1398 "str q18, [%[outptr1], #0x20]\n"
1399 "fmax v20.4s, v20.4s, v1.4s\n"
1400 "add %[outptr1], %[outptr1], #0x30\n"
1401 "fmin v13.4s, v13.4s, v0.4s\n"
1402 "str q19, [%[outptr2]]\n"
1403 "fmin v14.4s, v14.4s, v0.4s\n"
1404 "fadd v15.4s, v15.4s, v3.4s\n"
1405 "fadd v16.4s, v16.4s, v4.4s\n"
1406 "str q20, [%[outptr2], #0x10]\n"
1407 "fmax v13.4s, v13.4s, v1.4s\n"
1408 "fmax v14.4s, v14.4s, v1.4s\n"
1409 "fmin v15.4s, v15.4s, v0.4s\n"
1410 "fmin v16.4s, v16.4s, v0.4s\n"
1411 "str q13, [%[outptr2], #0x20]\n"
1412 "add %[outptr2], %[outptr2], #0x30\n"
1413 "fmax v15.4s, v15.4s, v1.4s\n"
1414 "str q14, [%[outptr3]]\n"
1415 "fmax v16.4s, v16.4s, v1.4s\n"
1416 "str q15, [%[outptr3], #0x10]\n"
1417 "str q16, [%[outptr3], #0x20]\n"
1418 "add %[outptr3], %[outptr3], #0x30\n"
1419 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1420 [inptr] "+r" (inptr)
1421 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1422 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1423 );
1424 }
1425 }
1426 break;
1427
1428 case 5:
1429 {
1430 if ((i+11) >= xmax)
1431 {
1432 for (int xi=0; xi<11; xi++)
1433 {
1434 if ((i+xi) < xmax)
1435 {
1436 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1437 outptr0++;
1438 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1439 outptr1++;
1440 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1441 outptr2++;
1442 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1443 outptr3++;
1444 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1445 outptr4++;
1446 }
1447 }
1448 inptr += 96;
1449 } else {
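1450 /* Vectorized path for a full 12-column block: add the same 12 bias values to each of 5 input rows, clamp to [minval, maxval] and store to outptr0..outptr4 */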
1451 __asm __volatile (
1452 "dup v0.4s, %[maxval].s[0]\n"
1453 "ldr q2, [%[biasptr]]\n"
1454 "dup v1.4s, %[minval].s[0]\n"
1455 "ldr q3, [%[biasptr], #0x10]\n"
1456 "ldr q4, [%[biasptr], #0x20]\n"
1457 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1458 "ldr q13, [%[inptr]]\n"
1459 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1460 "ldr q14, [%[inptr], #0x10]\n"
1461 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1462 "fadd v13.4s, v13.4s, v2.4s\n"
1463 "ldr q15, [%[inptr], #0x20]\n"
1464 "ldr q16, [%[inptr], #0x30]\n"
1465 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1466 "fadd v14.4s, v14.4s, v3.4s\n"
1467 "ldr q17, [%[inptr], #0x40]\n"
1468 "fmin v13.4s, v13.4s, v0.4s\n"
1469 "ldr q18, [%[inptr], #0x50]\n"
1470 "fadd v15.4s, v15.4s, v4.4s\n"
1471 "ldr q19, [%[inptr], #0x60]\n"
1472 "fadd v16.4s, v16.4s, v2.4s\n"
1473 "ldr q20, [%[inptr], #0x70]\n"
1474 "fmin v14.4s, v14.4s, v0.4s\n"
1475 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1476 "fmax v13.4s, v13.4s, v1.4s\n"
1477 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1478 "fmax v14.4s, v14.4s, v1.4s\n"
1479 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1480 "fmin v15.4s, v15.4s, v0.4s\n"
1481 "str q13, [%[outptr0]]\n"
1482 "fmin v16.4s, v16.4s, v0.4s\n"
1483 "ldr q13, [%[inptr], #0x80]\n"
1484 "fadd v17.4s, v17.4s, v3.4s\n"
1485 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1486 "fmax v15.4s, v15.4s, v1.4s\n"
1487 "str q14, [%[outptr0], #0x10]\n"
1488 "fmax v16.4s, v16.4s, v1.4s\n"
1489 "ldr q14, [%[inptr], #0x90]\n"
1490 "fmin v17.4s, v17.4s, v0.4s\n"
1491 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1492 "fadd v18.4s, v18.4s, v4.4s\n"
1493 "str q15, [%[outptr0], #0x20]\n"
1494 "fadd v19.4s, v19.4s, v2.4s\n"
1495 "ldr q15, [%[inptr], #0xa0]\n"
1496 "fmax v17.4s, v17.4s, v1.4s\n"
1497 "add %[outptr0], %[outptr0], #0x30\n"
1498 "fmin v18.4s, v18.4s, v0.4s\n"
1499 "str q16, [%[outptr1]]\n"
1500 "fmin v19.4s, v19.4s, v0.4s\n"
1501 "ldr q16, [%[inptr], #0xb0]\n"
1502 "fadd v20.4s, v20.4s, v3.4s\n"
1503 "fadd v13.4s, v13.4s, v4.4s\n"
1504 "str q17, [%[outptr1], #0x10]\n"
1505 "fmax v18.4s, v18.4s, v1.4s\n"
1506 "ldr q17, [%[inptr], #0xc0]\n"
1507 "fmax v19.4s, v19.4s, v1.4s\n"
1508 "fmin v20.4s, v20.4s, v0.4s\n"
1509 "fmin v13.4s, v13.4s, v0.4s\n"
1510 "str q18, [%[outptr1], #0x20]\n"
1511 "fadd v14.4s, v14.4s, v2.4s\n"
1512 "ldr q18, [%[inptr], #0xd0]\n"
1513 "fadd v15.4s, v15.4s, v3.4s\n"
1514 "add %[outptr1], %[outptr1], #0x30\n"
1515 "fmax v20.4s, v20.4s, v1.4s\n"
1516 "str q19, [%[outptr2]]\n"
1517 "fmax v13.4s, v13.4s, v1.4s\n"
1518 "ldr q19, [%[inptr], #0xe0]\n"
1519 "fmin v14.4s, v14.4s, v0.4s\n"
1520 "add %[inptr], %[inptr], #0x180\n"
1521 "fmin v15.4s, v15.4s, v0.4s\n"
1522 "str q20, [%[outptr2], #0x10]\n"
1523 "fadd v16.4s, v16.4s, v4.4s\n"
1524 "fmax v14.4s, v14.4s, v1.4s\n"
1525 "fadd v17.4s, v17.4s, v2.4s\n"
1526 "str q13, [%[outptr2], #0x20]\n"
1527 "fmax v15.4s, v15.4s, v1.4s\n"
1528 "add %[outptr2], %[outptr2], #0x30\n"
1529 "fmin v16.4s, v16.4s, v0.4s\n"
1530 "str q14, [%[outptr3]]\n"
1531 "fmin v17.4s, v17.4s, v0.4s\n"
1532 "fadd v18.4s, v18.4s, v3.4s\n"
1533 "fadd v19.4s, v19.4s, v4.4s\n"
1534 "str q15, [%[outptr3], #0x10]\n"
1535 "fmax v16.4s, v16.4s, v1.4s\n"
1536 "fmax v17.4s, v17.4s, v1.4s\n"
1537 "fmin v18.4s, v18.4s, v0.4s\n"
1538 "fmin v19.4s, v19.4s, v0.4s\n"
1539 "str q16, [%[outptr3], #0x20]\n"
1540 "add %[outptr3], %[outptr3], #0x30\n"
1541 "fmax v18.4s, v18.4s, v1.4s\n"
1542 "str q17, [%[outptr4]]\n"
1543 "fmax v19.4s, v19.4s, v1.4s\n"
1544 "str q18, [%[outptr4], #0x10]\n"
1545 "str q19, [%[outptr4], #0x20]\n"
1546 "add %[outptr4], %[outptr4], #0x30\n"
1547 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1548 [inptr] "+r" (inptr)
1549 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1550 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1551 );
1552 }
1553 }
1554 break;
1555
1556 case 6:
1557 {
1558 if ((i+11) >= xmax)
1559 {
1560 for (int xi=0; xi<11; xi++)
1561 {
1562 if ((i+xi) < xmax)
1563 {
1564 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1565 outptr0++;
1566 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1567 outptr1++;
1568 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1569 outptr2++;
1570 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1571 outptr3++;
1572 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1573 outptr4++;
1574 *outptr5 = std::min(std::max(minval, inptr[xi + 60] + biasptr[xi]), maxval);
1575 outptr5++;
1576 }
1577 }
1578 inptr += 96;
1579 } else {
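1580 /* Vectorized path for a full 12-column block: add the same 12 bias values to each of 6 input rows, clamp to [minval, maxval] and store to outptr0..outptr5 */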
1581 __asm __volatile (
1582 "dup v0.4s, %[maxval].s[0]\n"
1583 "ldr q2, [%[biasptr]]\n"
1584 "dup v1.4s, %[minval].s[0]\n"
1585 "ldr q3, [%[biasptr], #0x10]\n"
1586 "ldr q4, [%[biasptr], #0x20]\n"
1587 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1588 "ldr q13, [%[inptr]]\n"
1589 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1590 "ldr q14, [%[inptr], #0x10]\n"
1591 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1592 "fadd v13.4s, v13.4s, v2.4s\n"
1593 "ldr q15, [%[inptr], #0x20]\n"
1594 "ldr q16, [%[inptr], #0x30]\n"
1595 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1596 "fadd v14.4s, v14.4s, v3.4s\n"
1597 "ldr q17, [%[inptr], #0x40]\n"
1598 "fmin v13.4s, v13.4s, v0.4s\n"
1599 "ldr q18, [%[inptr], #0x50]\n"
1600 "fadd v15.4s, v15.4s, v4.4s\n"
1601 "ldr q19, [%[inptr], #0x60]\n"
1602 "fadd v16.4s, v16.4s, v2.4s\n"
1603 "ldr q20, [%[inptr], #0x70]\n"
1604 "fmin v14.4s, v14.4s, v0.4s\n"
1605 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1606 "fmax v13.4s, v13.4s, v1.4s\n"
1607 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1608 "fmax v14.4s, v14.4s, v1.4s\n"
1609 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1610 "fmin v15.4s, v15.4s, v0.4s\n"
1611 "str q13, [%[outptr0]]\n"
1612 "fmin v16.4s, v16.4s, v0.4s\n"
1613 "ldr q13, [%[inptr], #0x80]\n"
1614 "fadd v17.4s, v17.4s, v3.4s\n"
1615 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1616 "fmax v15.4s, v15.4s, v1.4s\n"
1617 "str q14, [%[outptr0], #0x10]\n"
1618 "fmax v16.4s, v16.4s, v1.4s\n"
1619 "ldr q14, [%[inptr], #0x90]\n"
1620 "fmin v17.4s, v17.4s, v0.4s\n"
1621 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1622 "fadd v18.4s, v18.4s, v4.4s\n"
1623 "str q15, [%[outptr0], #0x20]\n"
1624 "fadd v19.4s, v19.4s, v2.4s\n"
1625 "ldr q15, [%[inptr], #0xa0]\n"
1626 "fmax v17.4s, v17.4s, v1.4s\n"
1627 "add %[outptr0], %[outptr0], #0x30\n"
1628 "fmin v18.4s, v18.4s, v0.4s\n"
1629 "str q16, [%[outptr1]]\n"
1630 "fmin v19.4s, v19.4s, v0.4s\n"
1631 "ldr q16, [%[inptr], #0xb0]\n"
1632 "fadd v20.4s, v20.4s, v3.4s\n"
1633 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1634 "fmax v18.4s, v18.4s, v1.4s\n"
1635 "str q17, [%[outptr1], #0x10]\n"
1636 "fmax v19.4s, v19.4s, v1.4s\n"
1637 "ldr q17, [%[inptr], #0xc0]\n"
1638 "fmin v20.4s, v20.4s, v0.4s\n"
1639 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1640 "fadd v13.4s, v13.4s, v4.4s\n"
1641 "str q18, [%[outptr1], #0x20]\n"
1642 "fadd v14.4s, v14.4s, v2.4s\n"
1643 "ldr q18, [%[inptr], #0xd0]\n"
1644 "fmax v20.4s, v20.4s, v1.4s\n"
1645 "add %[outptr1], %[outptr1], #0x30\n"
1646 "fmin v13.4s, v13.4s, v0.4s\n"
1647 "str q19, [%[outptr2]]\n"
1648 "fmin v14.4s, v14.4s, v0.4s\n"
1649 "ldr q19, [%[inptr], #0xe0]\n"
1650 "fadd v15.4s, v15.4s, v3.4s\n"
1651 "fadd v16.4s, v16.4s, v4.4s\n"
1652 "str q20, [%[outptr2], #0x10]\n"
1653 "fmax v13.4s, v13.4s, v1.4s\n"
1654 "ldr q20, [%[inptr], #0xf0]\n"
1655 "fmax v14.4s, v14.4s, v1.4s\n"
1656 "fmin v15.4s, v15.4s, v0.4s\n"
1657 "fmin v16.4s, v16.4s, v0.4s\n"
1658 "str q13, [%[outptr2], #0x20]\n"
1659 "fadd v17.4s, v17.4s, v2.4s\n"
1660 "ldr q13, [%[inptr], #0x100]\n"
1661 "fadd v18.4s, v18.4s, v3.4s\n"
1662 "add %[outptr2], %[outptr2], #0x30\n"
1663 "fmax v15.4s, v15.4s, v1.4s\n"
1664 "str q14, [%[outptr3]]\n"
1665 "fmax v16.4s, v16.4s, v1.4s\n"
1666 "ldr q14, [%[inptr], #0x110]\n"
1667 "fmin v17.4s, v17.4s, v0.4s\n"
1668 "add %[inptr], %[inptr], #0x180\n"
1669 "fmin v18.4s, v18.4s, v0.4s\n"
1670 "str q15, [%[outptr3], #0x10]\n"
1671 "fadd v19.4s, v19.4s, v4.4s\n"
1672 "fmax v17.4s, v17.4s, v1.4s\n"
1673 "fadd v20.4s, v20.4s, v2.4s\n"
1674 "str q16, [%[outptr3], #0x20]\n"
1675 "fmax v18.4s, v18.4s, v1.4s\n"
1676 "add %[outptr3], %[outptr3], #0x30\n"
1677 "fmin v19.4s, v19.4s, v0.4s\n"
1678 "str q17, [%[outptr4]]\n"
1679 "fmin v20.4s, v20.4s, v0.4s\n"
1680 "fadd v13.4s, v13.4s, v3.4s\n"
1681 "fadd v14.4s, v14.4s, v4.4s\n"
1682 "str q18, [%[outptr4], #0x10]\n"
1683 "fmax v19.4s, v19.4s, v1.4s\n"
1684 "fmax v20.4s, v20.4s, v1.4s\n"
1685 "fmin v13.4s, v13.4s, v0.4s\n"
1686 "fmin v14.4s, v14.4s, v0.4s\n"
1687 "str q19, [%[outptr4], #0x20]\n"
1688 "add %[outptr4], %[outptr4], #0x30\n"
1689 "fmax v13.4s, v13.4s, v1.4s\n"
1690 "str q20, [%[outptr5]]\n"
1691 "fmax v14.4s, v14.4s, v1.4s\n"
1692 "str q13, [%[outptr5], #0x10]\n"
1693 "str q14, [%[outptr5], #0x20]\n"
1694 "add %[outptr5], %[outptr5], #0x30\n"
1695 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1696 [inptr] "+r" (inptr)
1697 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1698 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1699 );
1700 }
1701 }
1702 break;
1703
1704 case 7:
1705 {
1706 if ((i+11) >= xmax)
1707 {
1708 for (int xi=0; xi<11; xi++)
1709 {
1710 if ((i+xi) < xmax)
1711 {
1712 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1713 outptr0++;
1714 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1715 outptr1++;
1716 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1717 outptr2++;
1718 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1719 outptr3++;
1720 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1721 outptr4++;
1722 *outptr5 = std::min(std::max(minval, inptr[xi + 60] + biasptr[xi]), maxval);
1723 outptr5++;
1724 *outptr6 = std::min(std::max(minval, inptr[xi + 72] + biasptr[xi]), maxval);
1725 outptr6++;
1726 }
1727 }
1728 inptr += 96;
1729 } else {
1730 /* Optimized routine to copy an entire block */
1731 __asm __volatile (
1732 "dup v0.4s, %[maxval].s[0]\n"
1733 "ldr q2, [%[biasptr]]\n"
1734 "dup v1.4s, %[minval].s[0]\n"
1735 "ldr q3, [%[biasptr], #0x10]\n"
1736 "ldr q4, [%[biasptr], #0x20]\n"
1737 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1738 "ldr q13, [%[inptr]]\n"
1739 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1740 "ldr q14, [%[inptr], #0x10]\n"
1741 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1742 "fadd v13.4s, v13.4s, v2.4s\n"
1743 "ldr q15, [%[inptr], #0x20]\n"
1744 "ldr q16, [%[inptr], #0x30]\n"
1745 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1746 "fadd v14.4s, v14.4s, v3.4s\n"
1747 "ldr q17, [%[inptr], #0x40]\n"
1748 "fmin v13.4s, v13.4s, v0.4s\n"
1749 "ldr q18, [%[inptr], #0x50]\n"
1750 "fadd v15.4s, v15.4s, v4.4s\n"
1751 "ldr q19, [%[inptr], #0x60]\n"
1752 "fadd v16.4s, v16.4s, v2.4s\n"
1753 "ldr q20, [%[inptr], #0x70]\n"
1754 "fmin v14.4s, v14.4s, v0.4s\n"
1755 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1756 "fmax v13.4s, v13.4s, v1.4s\n"
1757 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1758 "fmax v14.4s, v14.4s, v1.4s\n"
1759 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1760 "fmin v15.4s, v15.4s, v0.4s\n"
1761 "str q13, [%[outptr0]]\n"
1762 "fmin v16.4s, v16.4s, v0.4s\n"
1763 "ldr q13, [%[inptr], #0x80]\n"
1764 "fadd v17.4s, v17.4s, v3.4s\n"
1765 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1766 "fmax v15.4s, v15.4s, v1.4s\n"
1767 "str q14, [%[outptr0], #0x10]\n"
1768 "fmax v16.4s, v16.4s, v1.4s\n"
1769 "ldr q14, [%[inptr], #0x90]\n"
1770 "fmin v17.4s, v17.4s, v0.4s\n"
1771 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1772 "fadd v18.4s, v18.4s, v4.4s\n"
1773 "str q15, [%[outptr0], #0x20]\n"
1774 "fadd v19.4s, v19.4s, v2.4s\n"
1775 "ldr q15, [%[inptr], #0xa0]\n"
1776 "fmax v17.4s, v17.4s, v1.4s\n"
1777 "add %[outptr0], %[outptr0], #0x30\n"
1778 "fmin v18.4s, v18.4s, v0.4s\n"
1779 "str q16, [%[outptr1]]\n"
1780 "fmin v19.4s, v19.4s, v0.4s\n"
1781 "ldr q16, [%[inptr], #0xb0]\n"
1782 "fadd v20.4s, v20.4s, v3.4s\n"
1783 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1784 "fmax v18.4s, v18.4s, v1.4s\n"
1785 "str q17, [%[outptr1], #0x10]\n"
1786 "fmax v19.4s, v19.4s, v1.4s\n"
1787 "ldr q17, [%[inptr], #0xc0]\n"
1788 "fmin v20.4s, v20.4s, v0.4s\n"
1789 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1790 "fadd v13.4s, v13.4s, v4.4s\n"
1791 "str q18, [%[outptr1], #0x20]\n"
1792 "fadd v14.4s, v14.4s, v2.4s\n"
1793 "ldr q18, [%[inptr], #0xd0]\n"
1794 "fmax v20.4s, v20.4s, v1.4s\n"
1795 "add %[outptr1], %[outptr1], #0x30\n"
1796 "fmin v13.4s, v13.4s, v0.4s\n"
1797 "str q19, [%[outptr2]]\n"
1798 "fmin v14.4s, v14.4s, v0.4s\n"
1799 "ldr q19, [%[inptr], #0xe0]\n"
1800 "fadd v15.4s, v15.4s, v3.4s\n"
1801 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1802 "fmax v13.4s, v13.4s, v1.4s\n"
1803 "str q20, [%[outptr2], #0x10]\n"
1804 "fmax v14.4s, v14.4s, v1.4s\n"
1805 "ldr q20, [%[inptr], #0xf0]\n"
1806 "fmin v15.4s, v15.4s, v0.4s\n"
1807 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1808 "fadd v16.4s, v16.4s, v4.4s\n"
1809 "str q13, [%[outptr2], #0x20]\n"
1810 "fadd v17.4s, v17.4s, v2.4s\n"
1811 "ldr q13, [%[inptr], #0x100]\n"
1812 "fmax v15.4s, v15.4s, v1.4s\n"
1813 "add %[outptr2], %[outptr2], #0x30\n"
1814 "fmin v16.4s, v16.4s, v0.4s\n"
1815 "str q14, [%[outptr3]]\n"
1816 "fmin v17.4s, v17.4s, v0.4s\n"
1817 "ldr q14, [%[inptr], #0x110]\n"
1818 "fadd v18.4s, v18.4s, v3.4s\n"
1819 "fadd v19.4s, v19.4s, v4.4s\n"
1820 "str q15, [%[outptr3], #0x10]\n"
1821 "fmax v16.4s, v16.4s, v1.4s\n"
1822 "ldr q15, [%[inptr], #0x120]\n"
1823 "fmax v17.4s, v17.4s, v1.4s\n"
1824 "fmin v18.4s, v18.4s, v0.4s\n"
1825 "fmin v19.4s, v19.4s, v0.4s\n"
1826 "str q16, [%[outptr3], #0x20]\n"
1827 "fadd v20.4s, v20.4s, v2.4s\n"
1828 "ldr q16, [%[inptr], #0x130]\n"
1829 "fadd v13.4s, v13.4s, v3.4s\n"
1830 "add %[outptr3], %[outptr3], #0x30\n"
1831 "fmax v18.4s, v18.4s, v1.4s\n"
1832 "str q17, [%[outptr4]]\n"
1833 "fmax v19.4s, v19.4s, v1.4s\n"
1834 "ldr q17, [%[inptr], #0x140]\n"
1835 "fmin v20.4s, v20.4s, v0.4s\n"
1836 "add %[inptr], %[inptr], #0x180\n"
1837 "fmin v13.4s, v13.4s, v0.4s\n"
1838 "str q18, [%[outptr4], #0x10]\n"
1839 "fadd v14.4s, v14.4s, v4.4s\n"
1840 "fmax v20.4s, v20.4s, v1.4s\n"
1841 "fadd v15.4s, v15.4s, v2.4s\n"
1842 "str q19, [%[outptr4], #0x20]\n"
1843 "fmax v13.4s, v13.4s, v1.4s\n"
1844 "add %[outptr4], %[outptr4], #0x30\n"
1845 "fmin v14.4s, v14.4s, v0.4s\n"
1846 "str q20, [%[outptr5]]\n"
1847 "fmin v15.4s, v15.4s, v0.4s\n"
1848 "fadd v16.4s, v16.4s, v3.4s\n"
1849 "fadd v17.4s, v17.4s, v4.4s\n"
1850 "str q13, [%[outptr5], #0x10]\n"
1851 "fmax v14.4s, v14.4s, v1.4s\n"
1852 "fmax v15.4s, v15.4s, v1.4s\n"
1853 "fmin v16.4s, v16.4s, v0.4s\n"
1854 "fmin v17.4s, v17.4s, v0.4s\n"
1855 "str q14, [%[outptr5], #0x20]\n"
1856 "add %[outptr5], %[outptr5], #0x30\n"
1857 "fmax v16.4s, v16.4s, v1.4s\n"
1858 "str q15, [%[outptr6]]\n"
1859 "fmax v17.4s, v17.4s, v1.4s\n"
1860 "str q16, [%[outptr6], #0x10]\n"
1861 "str q17, [%[outptr6], #0x20]\n"
1862 "add %[outptr6], %[outptr6], #0x30\n"
1863 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
1864 [inptr] "+r" (inptr)
1865 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
1866 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
1867 );
1868 }
1869 }
1870 break;
1871
1872 default:
1873 case 8:
1874 {
1875 if ((i+11) >= xmax)
1876 {
1877 for (int xi=0; xi<11; xi++)
1878 {
1879 if ((i+xi) < xmax)
1880 {
1881 *outptr0 = std::min(std::max(minval, inptr[xi] + biasptr[xi]), maxval);
1882 outptr0++;
1883 *outptr1 = std::min(std::max(minval, inptr[xi + 12] + biasptr[xi]), maxval);
1884 outptr1++;
1885 *outptr2 = std::min(std::max(minval, inptr[xi + 24] + biasptr[xi]), maxval);
1886 outptr2++;
1887 *outptr3 = std::min(std::max(minval, inptr[xi + 36] + biasptr[xi]), maxval);
1888 outptr3++;
1889 *outptr4 = std::min(std::max(minval, inptr[xi + 48] + biasptr[xi]), maxval);
1890 outptr4++;
1891 *outptr5 = std::min(std::max(minval, inptr[xi + 60] + biasptr[xi]), maxval);
1892 outptr5++;
1893 *outptr6 = std::min(std::max(minval, inptr[xi + 72] + biasptr[xi]), maxval);
1894 outptr6++;
1895 *outptr7 = std::min(std::max(minval, inptr[xi + 84] + biasptr[xi]), maxval);
1896 outptr7++;
1897 }
1898 }
1899 inptr += 96;
1900 } else {
1901 /* Optimized routine to copy an entire block */
1902 __asm __volatile (
1903 "dup v0.4s, %[maxval].s[0]\n"
1904 "ldr q2, [%[biasptr]]\n"
1905 "dup v1.4s, %[minval].s[0]\n"
1906 "ldr q3, [%[biasptr], #0x10]\n"
1907 "ldr q4, [%[biasptr], #0x20]\n"
1908 "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
1909 "ldr q13, [%[inptr]]\n"
1910 "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
1911 "ldr q14, [%[inptr], #0x10]\n"
1912 "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
1913 "fadd v13.4s, v13.4s, v2.4s\n"
1914 "ldr q15, [%[inptr], #0x20]\n"
1915 "ldr q16, [%[inptr], #0x30]\n"
1916 "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
1917 "fadd v14.4s, v14.4s, v3.4s\n"
1918 "ldr q17, [%[inptr], #0x40]\n"
1919 "fmin v13.4s, v13.4s, v0.4s\n"
1920 "ldr q18, [%[inptr], #0x50]\n"
1921 "fadd v15.4s, v15.4s, v4.4s\n"
1922 "ldr q19, [%[inptr], #0x60]\n"
1923 "fadd v16.4s, v16.4s, v2.4s\n"
1924 "ldr q20, [%[inptr], #0x70]\n"
1925 "fmin v14.4s, v14.4s, v0.4s\n"
1926 "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
1927 "fmax v13.4s, v13.4s, v1.4s\n"
1928 "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
1929 "fmax v14.4s, v14.4s, v1.4s\n"
1930 "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
1931 "fmin v15.4s, v15.4s, v0.4s\n"
1932 "str q13, [%[outptr0]]\n"
1933 "fmin v16.4s, v16.4s, v0.4s\n"
1934 "ldr q13, [%[inptr], #0x80]\n"
1935 "fadd v17.4s, v17.4s, v3.4s\n"
1936 "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
1937 "fmax v15.4s, v15.4s, v1.4s\n"
1938 "str q14, [%[outptr0], #0x10]\n"
1939 "fmax v16.4s, v16.4s, v1.4s\n"
1940 "ldr q14, [%[inptr], #0x90]\n"
1941 "fmin v17.4s, v17.4s, v0.4s\n"
1942 "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
1943 "fadd v18.4s, v18.4s, v4.4s\n"
1944 "str q15, [%[outptr0], #0x20]\n"
1945 "fadd v19.4s, v19.4s, v2.4s\n"
1946 "ldr q15, [%[inptr], #0xa0]\n"
1947 "fmax v17.4s, v17.4s, v1.4s\n"
1948 "add %[outptr0], %[outptr0], #0x30\n"
1949 "fmin v18.4s, v18.4s, v0.4s\n"
1950 "str q16, [%[outptr1]]\n"
1951 "fmin v19.4s, v19.4s, v0.4s\n"
1952 "ldr q16, [%[inptr], #0xb0]\n"
1953 "fadd v20.4s, v20.4s, v3.4s\n"
1954 "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
1955 "fmax v18.4s, v18.4s, v1.4s\n"
1956 "str q17, [%[outptr1], #0x10]\n"
1957 "fmax v19.4s, v19.4s, v1.4s\n"
1958 "ldr q17, [%[inptr], #0xc0]\n"
1959 "fmin v20.4s, v20.4s, v0.4s\n"
1960 "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
1961 "fadd v13.4s, v13.4s, v4.4s\n"
1962 "str q18, [%[outptr1], #0x20]\n"
1963 "fadd v14.4s, v14.4s, v2.4s\n"
1964 "ldr q18, [%[inptr], #0xd0]\n"
1965 "fmax v20.4s, v20.4s, v1.4s\n"
1966 "add %[outptr1], %[outptr1], #0x30\n"
1967 "fmin v13.4s, v13.4s, v0.4s\n"
1968 "str q19, [%[outptr2]]\n"
1969 "fmin v14.4s, v14.4s, v0.4s\n"
1970 "ldr q19, [%[inptr], #0xe0]\n"
1971 "fadd v15.4s, v15.4s, v3.4s\n"
1972 "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
1973 "fmax v13.4s, v13.4s, v1.4s\n"
1974 "str q20, [%[outptr2], #0x10]\n"
1975 "fmax v14.4s, v14.4s, v1.4s\n"
1976 "ldr q20, [%[inptr], #0xf0]\n"
1977 "fmin v15.4s, v15.4s, v0.4s\n"
1978 "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
1979 "fadd v16.4s, v16.4s, v4.4s\n"
1980 "str q13, [%[outptr2], #0x20]\n"
1981 "fadd v17.4s, v17.4s, v2.4s\n"
1982 "ldr q13, [%[inptr], #0x100]\n"
1983 "fmax v15.4s, v15.4s, v1.4s\n"
1984 "add %[outptr2], %[outptr2], #0x30\n"
1985 "fmin v16.4s, v16.4s, v0.4s\n"
1986 "str q14, [%[outptr3]]\n"
1987 "fmin v17.4s, v17.4s, v0.4s\n"
1988 "ldr q14, [%[inptr], #0x110]\n"
1989 "fadd v18.4s, v18.4s, v3.4s\n"
1990 "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
1991 "fmax v16.4s, v16.4s, v1.4s\n"
1992 "str q15, [%[outptr3], #0x10]\n"
1993 "fmax v17.4s, v17.4s, v1.4s\n"
1994 "ldr q15, [%[inptr], #0x120]\n"
1995 "fmin v18.4s, v18.4s, v0.4s\n"
1996 "fadd v19.4s, v19.4s, v4.4s\n"
1997 "str q16, [%[outptr3], #0x20]\n"
1998 "fadd v20.4s, v20.4s, v2.4s\n"
1999 "ldr q16, [%[inptr], #0x130]\n"
2000 "fadd v13.4s, v13.4s, v3.4s\n"
2001 "add %[outptr3], %[outptr3], #0x30\n"
2002 "fmax v18.4s, v18.4s, v1.4s\n"
2003 "str q17, [%[outptr4]]\n"
2004 "fmin v19.4s, v19.4s, v0.4s\n"
2005 "ldr q17, [%[inptr], #0x140]\n"
2006 "fmin v20.4s, v20.4s, v0.4s\n"
2007 "fmin v13.4s, v13.4s, v0.4s\n"
2008 "str q18, [%[outptr4], #0x10]\n"
2009 "fadd v14.4s, v14.4s, v4.4s\n"
2010 "ldr q18, [%[inptr], #0x150]\n"
2011 "fmax v19.4s, v19.4s, v1.4s\n"
2012 "fmax v20.4s, v20.4s, v1.4s\n"
2013 "fmax v13.4s, v13.4s, v1.4s\n"
2014 "fmin v14.4s, v14.4s, v0.4s\n"
2015 "str q19, [%[outptr4], #0x20]\n"
2016 "fadd v15.4s, v15.4s, v2.4s\n"
2017 "ldr q19, [%[inptr], #0x160]\n"
2018 "fadd v16.4s, v16.4s, v3.4s\n"
2019 "add %[outptr4], %[outptr4], #0x30\n"
2020 "fmax v14.4s, v14.4s, v1.4s\n"
2021 "str q20, [%[outptr5]]\n"
2022 "fmin v15.4s, v15.4s, v0.4s\n"
2023 "ldr q20, [%[inptr], #0x170]\n"
2024 "fmin v16.4s, v16.4s, v0.4s\n"
2025 "add %[inptr], %[inptr], #0x180\n"
2026 "fadd v17.4s, v17.4s, v4.4s\n"
2027 "str q13, [%[outptr5], #0x10]\n"
2028 "fmax v15.4s, v15.4s, v1.4s\n"
2029 "fmax v16.4s, v16.4s, v1.4s\n"
2030 "fadd v18.4s, v18.4s, v2.4s\n"
2031 "str q14, [%[outptr5], #0x20]\n"
2032 "fmin v17.4s, v17.4s, v0.4s\n"
2033 "add %[outptr5], %[outptr5], #0x30\n"
2034 "fadd v19.4s, v19.4s, v3.4s\n"
2035 "str q15, [%[outptr6]]\n"
2036 "fmin v18.4s, v18.4s, v0.4s\n"
2037 "fmax v17.4s, v17.4s, v1.4s\n"
2038 "fadd v20.4s, v20.4s, v4.4s\n"
2039 "str q16, [%[outptr6], #0x10]\n"
2040 "fmin v19.4s, v19.4s, v0.4s\n"
2041 "fmax v18.4s, v18.4s, v1.4s\n"
2042 "fmin v20.4s, v20.4s, v0.4s\n"
2043 "str q17, [%[outptr6], #0x20]\n"
2044 "fmax v19.4s, v19.4s, v1.4s\n"
2045 "add %[outptr6], %[outptr6], #0x30\n"
2046 "fmax v20.4s, v20.4s, v1.4s\n"
2047 "str q18, [%[outptr7]]\n"
2048 "str q19, [%[outptr7], #0x10]\n"
2049 "str q20, [%[outptr7], #0x20]\n"
2050 "add %[outptr7], %[outptr7], #0x30\n"
2051 : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
2052 [inptr] "+r" (inptr)
2053 : [biasptr] "r" (biasptr), [minval] "w" (minval), [maxval] "w" (maxval)
2054 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "memory"
2055 );
2056 }
2057 }
2058 break;
2059
2060
2061 }
2062 }
2063 }
2064 }
2065 }
2066
2067 #endif // __aarch64__
2068