xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2023 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_compute/core/Helpers.h"
26 #include "arm_compute/core/ITensor.h"
27 #include "arm_compute/core/Types.h"
28 #include "arm_compute/core/Window.h"
29 
30 #include <cstddef>
31 #include <cstdint>
32 #include <limits>
33 
34 #if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
35 namespace
36 {
37 void a64_add_bn_clamp_direct_fp16_2x32(
38     float16_t *out, size_t out_stride,
39     float16_t *out_direct, size_t out_direct_stride,
40     const float16_t *in0, size_t in0_stride,
41     const float16_t *in1, size_t in1_stride,
42     const float16_t *bn_mul,
43     const float16_t *bn_add,
44     const float16_t  minval,
45     const float16_t  maxval,
46     size_t width, size_t height)
47 {
48     struct KernelArgs
49     {
50         float16_t minval;
51         float16_t maxval;
52     } ka;
53     ka.minval = minval;
54     ka.maxval = maxval;
55 
56     __asm__ __volatile__(
57         "ldr w21, [%x[args_ptr], %[offsetof_minval]]\n"
58         "ldr w20, [%x[args_ptr], %[offsetof_maxval]]\n"
59         "cmp %x[width], #0x20\n"
60         "dup v13.8h, w21\n"
61         "dup v12.8h, w20\n"
62         "blt 7f\n"
63         "1:" // Column loop
64         "ldr q24, [%x[bn_mul], #0x0]\n"
65         "ldr q25, [%x[bn_mul], #0x10]\n"
66         "mov x12, %x[in0]\n"
67         "mov x11, %x[in1]\n"
68         "ldr q26, [%x[bn_mul], #0x20]\n"
69         "ldr q27, [%x[bn_mul], #0x30]\n"
70         "mov x10, %x[out]\n"
71         "mov x9, %x[out_direct]\n"
72         "ldr q28, [%x[bn_add], #0x0]\n"
73         "ldr q29, [%x[bn_add], #0x10]\n"
74         "mov x20, %x[height]\n"
75         "mov x28, x12\n"
76         "ldr q30, [%x[bn_add], #0x20]\n"
77         "ldr q31, [%x[bn_add], #0x30]\n"
78         "mov x27, x11\n"
79         "mov x26, x10\n"
80         "ldr q11, [x28, #0x0]\n"
81         "ldr q10, [x27, #0x0]\n"
82         "mov x25, x9\n"
83         "add x24, x28, %x[in0_stride]\n"
84         "ldr q9, [x28, #0x10]\n"
85         "ldr q8, [x27, #0x10]\n"
86         "add x23, x27, %x[in1_stride]\n"
87         "add x22, x26, %x[out_stride]\n"
88         "ldr q7, [x28, #0x20]\n"
89         "ldr q6, [x27, #0x20]\n"
90         "add x21, x25, %x[out_direct_stride]\n"
91         "cmp x20, #0x2\n"
92         "ldr q5, [x28, #0x30]\n"
93         "ldr q4, [x27, #0x30]\n"
94         "add x12, x24, %x[in0_stride]\n"
95         "add x11, x23, %x[in1_stride]\n"
96         "add x10, x22, %x[out_stride]\n"
97         "add x9, x21, %x[out_direct_stride]\n"
98         "csel x24, x24, x28, GE\n"
99         "csel x23, x23, x27, GE\n"
100         "csel x22, x22, x26, GE\n"
101         "csel x21, x21, x25, GE\n"
102         "subs x20, x20, #0x2\n"
103         "add %x[bn_mul], %x[bn_mul], #0x40\n"
104         "add %x[bn_add], %x[bn_add], #0x40\n"
105         "add x28, x28, #0x40\n"
106         "add x27, x27, #0x40\n"
107         "ble 4f\n"
108         "2:" // Row loop
109         "ldr q3, [x24, #0x0]\n"
110         "ldr q22, [x23, #0x0]\n"
111         "fadd v2.8h, v11.8h, v10.8h\n"
112         "fadd v1.8h, v9.8h, v8.8h\n"
113         "ldr q21, [x24, #0x10]\n"
114         "ldr q20, [x23, #0x10]\n"
115         "fadd v0.8h, v7.8h, v6.8h\n"
116         "fadd v23.8h, v5.8h, v4.8h\n"
117         "ldr q19, [x24, #0x20]\n"
118         "ldr q18, [x23, #0x20]\n"
119         "fadd v22.8h, v3.8h, v22.8h\n"
120         "fadd v21.8h, v21.8h, v20.8h\n"
121         "ldr q17, [x24, #0x30]\n"
122         "ldr q16, [x23, #0x30]\n"
123         "fadd v20.8h, v19.8h, v18.8h\n"
124         "fadd v19.8h, v17.8h, v16.8h\n"
125         "add x24, x24, #0x40\n"
126         "add x23, x23, #0x40\n"
127         "cbz %x[out_direct], 3f\n"
128         "str q2, [x25, #0x0]\n"
129         "str q1, [x25, #0x10]\n"
130         "str q0, [x25, #0x20]\n"
131         "str q23, [x25, #0x30]\n"
132         "add x25, x25, #0x40\n"
133         "str q22, [x21, #0x0]\n"
134         "str q21, [x21, #0x10]\n"
135         "str q20, [x21, #0x20]\n"
136         "str q19, [x21, #0x30]\n"
137         "add x21, x21, #0x40\n"
138         "3:" // Main loop: No direct output
139         "mov v16.16b, v2.16b\n"
140         "mov v2.16b, v28.16b\n"
141         "fmla v2.8h, v16.8h, v24.8h\n"
142         "mov x28, x12\n"
143         "ldr q11, [x28, #0x0]\n"
144         "ldr q9, [x28, #0x10]\n"
145         "mov v18.16b, v1.16b\n"
146         "mov v1.16b, v29.16b\n"
147         "ldr q7, [x28, #0x20]\n"
148         "ldr q5, [x28, #0x30]\n"
149         "mov v17.16b, v0.16b\n"
150         "mov v0.16b, v30.16b\n"
151         "mov v16.16b, v23.16b\n"
152         "mov v23.16b, v31.16b\n"
153         "fmla v1.8h, v18.8h, v25.8h\n"
154         "mov x27, x11\n"
155         "ldr q10, [x27, #0x0]\n"
156         "ldr q8, [x27, #0x10]\n"
157         "fmla v0.8h, v17.8h, v26.8h\n"
158         "fmla v23.8h, v16.8h, v27.8h\n"
159         "ldr q6, [x27, #0x20]\n"
160         "ldr q4, [x27, #0x30]\n"
161         "mov v17.16b, v22.16b\n"
162         "mov v22.16b, v28.16b\n"
163         "mov v16.16b, v21.16b\n"
164         "mov v21.16b, v29.16b\n"
165         "fmla v22.8h, v17.8h, v24.8h\n"
166         "mov x25, x9\n"
167         "mov v17.16b, v20.16b\n"
168         "mov v20.16b, v30.16b\n"
169         "fmla v21.8h, v16.8h, v25.8h\n"
170         "add x24, x28, %x[in0_stride]\n"
171         "mov v16.16b, v19.16b\n"
172         "mov v19.16b, v31.16b\n"
173         "fmla v20.8h, v17.8h, v26.8h\n"
174         "add x23, x27, %x[in1_stride]\n"
175         "fmla v19.8h, v16.8h, v27.8h\n"
176         "fmin v2.8h, v2.8h, v12.8h\n"
177         "add x21, x25, %x[out_direct_stride]\n"
178         "cmp x20, #0x2\n"
179         "fmin v1.8h, v1.8h, v12.8h\n"
180         "fmin v0.8h, v0.8h, v12.8h\n"
181         "add x12, x24, %x[in0_stride]\n"
182         "add x11, x23, %x[in1_stride]\n"
183         "fmin v23.8h, v23.8h, v12.8h\n"
184         "fmax v2.8h, v2.8h, v13.8h\n"
185         "str q2, [x26, #0x0]\n"
186         "add x9, x21, %x[out_direct_stride]\n"
187         "fmax v1.8h, v1.8h, v13.8h\n"
188         "fmax v0.8h, v0.8h, v13.8h\n"
189         "str q1, [x26, #0x10]\n"
190         "csel x24, x24, x28, GE\n"
191         "fmax v23.8h, v23.8h, v13.8h\n"
192         "fmin v22.8h, v22.8h, v12.8h\n"
193         "str q0, [x26, #0x20]\n"
194         "csel x23, x23, x27, GE\n"
195         "fmin v21.8h, v21.8h, v12.8h\n"
196         "fmin v20.8h, v20.8h, v12.8h\n"
197         "str q23, [x26, #0x30]\n"
198         "mov x26, x10\n"
199         "fmin v19.8h, v19.8h, v12.8h\n"
200         "fmax v22.8h, v22.8h, v13.8h\n"
201         "str q22, [x22, #0x0]\n"
202         "csel x21, x21, x25, GE\n"
203         "fmax v21.8h, v21.8h, v13.8h\n"
204         "fmax v20.8h, v20.8h, v13.8h\n"
205         "str q21, [x22, #0x10]\n"
206         "add x28, x28, #0x40\n"
207         "fmax v19.8h, v19.8h, v13.8h\n"
208         "str q20, [x22, #0x20]\n"
209         "add x27, x27, #0x40\n"
210         "str q19, [x22, #0x30]\n"
211         "add x22, x26, %x[out_stride]\n"
212         "add x10, x22, %x[out_stride]\n"
213         "csel x22, x22, x26, GE\n"
214         "subs x20, x20, #0x2\n"
215         "bgt 2b\n"
216         "4:" // Row loop skip
217         "ldr q3, [x24, #0x0]\n"
218         "ldr q22, [x23, #0x0]\n"
219         "fadd v2.8h, v11.8h, v10.8h\n"
220         "fadd v1.8h, v9.8h, v8.8h\n"
221         "ldr q21, [x24, #0x10]\n"
222         "ldr q20, [x23, #0x10]\n"
223         "fadd v0.8h, v7.8h, v6.8h\n"
224         "fadd v23.8h, v5.8h, v4.8h\n"
225         "ldr q19, [x24, #0x20]\n"
226         "ldr q18, [x23, #0x20]\n"
227         "fadd v22.8h, v3.8h, v22.8h\n"
228         "fadd v21.8h, v21.8h, v20.8h\n"
229         "ldr q17, [x24, #0x30]\n"
230         "ldr q16, [x23, #0x30]\n"
231         "fadd v20.8h, v19.8h, v18.8h\n"
232         "fadd v19.8h, v17.8h, v16.8h\n"
233         "add x24, x24, #0x40\n"
234         "add x23, x23, #0x40\n"
235         "cbz %x[out_direct], 5f\n"
236         "str q2, [x25, #0x0]\n"
237         "str q1, [x25, #0x10]\n"
238         "str q0, [x25, #0x20]\n"
239         "str q23, [x25, #0x30]\n"
240         "add x25, x25, #0x40\n"
241         "str q22, [x21, #0x0]\n"
242         "str q21, [x21, #0x10]\n"
243         "str q20, [x21, #0x20]\n"
244         "str q19, [x21, #0x30]\n"
245         "add x21, x21, #0x40\n"
246         "5:" // Tail loop: No direct output
247         "mov v16.16b, v2.16b\n"
248         "mov v2.16b, v28.16b\n"
249         "fmla v2.8h, v16.8h, v24.8h\n"
250         "add %x[in0], %x[in0], #0x40\n"
251         "mov v16.16b, v1.16b\n"
252         "mov v1.16b, v29.16b\n"
253         "fmla v1.8h, v16.8h, v25.8h\n"
254         "add %x[in1], %x[in1], #0x40\n"
255         "mov v16.16b, v0.16b\n"
256         "mov v0.16b, v30.16b\n"
257         "fmla v0.8h, v16.8h, v26.8h\n"
258         "add %x[out], %x[out], #0x40\n"
259         "mov v16.16b, v23.16b\n"
260         "mov v23.16b, v31.16b\n"
261         "fmla v23.8h, v16.8h, v27.8h\n"
262         "mov v16.16b, v22.16b\n"
263         "mov v22.16b, v28.16b\n"
264         "fmla v22.8h, v16.8h, v24.8h\n"
265         "mov v16.16b, v21.16b\n"
266         "mov v21.16b, v29.16b\n"
267         "fmla v21.8h, v16.8h, v25.8h\n"
268         "mov v16.16b, v20.16b\n"
269         "mov v20.16b, v30.16b\n"
270         "fmla v20.8h, v16.8h, v26.8h\n"
271         "mov v16.16b, v19.16b\n"
272         "mov v19.16b, v31.16b\n"
273         "fmla v19.8h, v16.8h, v27.8h\n"
274         "fmin v2.8h, v2.8h, v12.8h\n"
275         "fmin v1.8h, v1.8h, v12.8h\n"
276         "fmin v0.8h, v0.8h, v12.8h\n"
277         "fmin v23.8h, v23.8h, v12.8h\n"
278         "fmin v22.8h, v22.8h, v12.8h\n"
279         "fmin v21.8h, v21.8h, v12.8h\n"
280         "fmin v20.8h, v20.8h, v12.8h\n"
281         "fmin v19.8h, v19.8h, v12.8h\n"
282         "fmax v2.8h, v2.8h, v13.8h\n"
283         "fmax v1.8h, v1.8h, v13.8h\n"
284         "str q2, [x26, #0x0]\n"
285         "fmax v0.8h, v0.8h, v13.8h\n"
286         "fmax v23.8h, v23.8h, v13.8h\n"
287         "str q1, [x26, #0x10]\n"
288         "fmax v22.8h, v22.8h, v13.8h\n"
289         "fmax v21.8h, v21.8h, v13.8h\n"
290         "str q0, [x26, #0x20]\n"
291         "fmax v20.8h, v20.8h, v13.8h\n"
292         "fmax v19.8h, v19.8h, v13.8h\n"
293         "str q23, [x26, #0x30]\n"
294         "add x26, x26, #0x40\n"
295         "str q22, [x22, #0x0]\n"
296         "str q21, [x22, #0x10]\n"
297         "str q20, [x22, #0x20]\n"
298         "str q19, [x22, #0x30]\n"
299         "add x22, x22, #0x40\n"
300         "cbz %x[out_direct], 6f\n"
301         "add %x[out_direct], %x[out_direct], #0x40\n"
302         "6:" // No direct pointer update
303         "sub %x[width], %x[width], #0x20\n"
304         "cmp %x[width], #0x20\n"
305         "bge 1b\n"
306         "cbz %x[width], 58f\n"
307         "7:" // main loop skip
308         "ldr q24, [%x[bn_mul], #0x0]\n"
309         "ldr q25, [%x[bn_mul], #0x10]\n"
310         "mov x20, %x[height]\n"
311         "mov x12, %x[in0]\n"
312         "ldr q26, [%x[bn_mul], #0x20]\n"
313         "ldr q27, [%x[bn_mul], #0x30]\n"
314         "mov x11, %x[in1]\n"
315         "mov x10, %x[out]\n"
316         "ldr q28, [%x[bn_add], #0x0]\n"
317         "ldr q29, [%x[bn_add], #0x10]\n"
318         "mov x9, %x[out_direct]\n"
319         "add %x[bn_mul], %x[bn_mul], #0x40\n"
320         "ldr q30, [%x[bn_add], #0x20]\n"
321         "ldr q31, [%x[bn_add], #0x30]\n"
322         "add %x[bn_add], %x[bn_add], #0x40\n"
323         "8:" // tail loop: Row loop
324         "mov x28, x12\n"
325         "mov x27, x11\n"
326         "mov x26, x10\n"
327         "mov x25, x9\n"
328         "add x24, x28, %x[in0_stride]\n"
329         "add x23, x27, %x[in1_stride]\n"
330         "add x22, x26, %x[out_stride]\n"
331         "add x21, x25, %x[out_direct_stride]\n"
332         "cmp x20, #0x2\n"
333         "add x12, x24, %x[in0_stride]\n"
334         "add x11, x23, %x[in1_stride]\n"
335         "add x10, x22, %x[out_stride]\n"
336         "add x9, x21, %x[out_direct_stride]\n"
337         "csel x24, x24, x28, GE\n"
338         "csel x23, x23, x27, GE\n"
339         "csel x22, x22, x26, GE\n"
340         "csel x21, x21, x25, GE\n"
341         "tbz %x[width], #4, 16f\n"
342         "ldr q11, [x28, #0x0]\n"
343         "ldr q10, [x27, #0x0]\n"
344         "ldr q9, [x28, #0x10]\n"
345         "ldr q8, [x27, #0x10]\n"
346         "add x28, x28, #0x20\n"
347         "add x27, x27, #0x20\n"
348         "ldr q3, [x24, #0x0]\n"
349         "ldr q22, [x23, #0x0]\n"
350         "ldr q21, [x24, #0x10]\n"
351         "ldr q20, [x23, #0x10]\n"
352         "add x24, x24, #0x20\n"
353         "add x23, x23, #0x20\n"
354         "tbz %x[width], #3, 12f\n"
355         "ldr q7, [x28, #0x0]\n"
356         "ldr q6, [x27, #0x0]\n"
357         "add x28, x28, #0x10\n"
358         "add x27, x27, #0x10\n"
359         "ldr q19, [x24, #0x0]\n"
360         "ldr q18, [x23, #0x0]\n"
361         "add x24, x24, #0x10\n"
362         "add x23, x23, #0x10\n"
363         "tbz %x[width], #2, 10f\n"
364         "ldr d5, [x28], #0x8\n"
365         "ldr d4, [x27], #0x8\n"
366         "ldr d17, [x24], #0x8\n"
367         "ldr d16, [x23], #0x8\n"
368         "tbz %x[width], #1, 9f\n"
369         "ld1 { v5.s }[2], [x28], #0x4\n"
370         "ld1 { v4.s }[2], [x27], #0x4\n"
371         "ld1 { v17.s }[2], [x24], #0x4\n"
372         "ld1 { v16.s }[2], [x23], #0x4\n"
373         "tbz %x[width], #0, 24f\n"
374         "ld1 { v5.h }[6], [x28], #0x2\n"
375         "ld1 { v4.h }[6], [x27], #0x2\n"
376         "ld1 { v17.h }[6], [x24], #0x2\n"
377         "ld1 { v16.h }[6], [x23], #0x2\n"
378         "b 24f\n"
379         "9:" // tail loop: unique 1: partial_0_28
380         "tbz %x[width], #0, 24f\n"
381         "ld1 { v5.h }[4], [x28], #0x2\n"
382         "ld1 { v4.h }[4], [x27], #0x2\n"
383         "ld1 { v17.h }[4], [x24], #0x2\n"
384         "ld1 { v16.h }[4], [x23], #0x2\n"
385         "b 24f\n"
386         "10:" // tail loop: unique 1: partial_1_24
387         "tbz %x[width], #1, 11f\n"
388         "ldr s5, [x28], #0x4\n"
389         "ldr s4, [x27], #0x4\n"
390         "ldr s17, [x24], #0x4\n"
391         "ldr s16, [x23], #0x4\n"
392         "tbz %x[width], #0, 24f\n"
393         "ld1 { v5.h }[2], [x28], #0x2\n"
394         "ld1 { v4.h }[2], [x27], #0x2\n"
395         "ld1 { v17.h }[2], [x24], #0x2\n"
396         "ld1 { v16.h }[2], [x23], #0x2\n"
397         "b 24f\n"
398         "11:" // tail loop: unique 1: partial_0_24
399         "tbz %x[width], #0, 24f\n"
400         "ldr h5, [x28], #0x2\n"
401         "ldr h4, [x27], #0x2\n"
402         "ldr h17, [x24], #0x2\n"
403         "ldr h16, [x23], #0x2\n"
404         "b 24f\n"
405         "12:" // tail loop: unique 1: partial_2_16
406         "tbz %x[width], #2, 14f\n"
407         "ldr d7, [x28], #0x8\n"
408         "ldr d6, [x27], #0x8\n"
409         "ldr d19, [x24], #0x8\n"
410         "ldr d18, [x23], #0x8\n"
411         "tbz %x[width], #1, 13f\n"
412         "ld1 { v7.s }[2], [x28], #0x4\n"
413         "ld1 { v6.s }[2], [x27], #0x4\n"
414         "ld1 { v19.s }[2], [x24], #0x4\n"
415         "ld1 { v18.s }[2], [x23], #0x4\n"
416         "tbz %x[width], #0, 24f\n"
417         "ld1 { v7.h }[6], [x28], #0x2\n"
418         "ld1 { v6.h }[6], [x27], #0x2\n"
419         "ld1 { v19.h }[6], [x24], #0x2\n"
420         "ld1 { v18.h }[6], [x23], #0x2\n"
421         "b 24f\n"
422         "13:" // tail loop: unique 1: partial_0_20
423         "tbz %x[width], #0, 24f\n"
424         "ld1 { v7.h }[4], [x28], #0x2\n"
425         "ld1 { v6.h }[4], [x27], #0x2\n"
426         "ld1 { v19.h }[4], [x24], #0x2\n"
427         "ld1 { v18.h }[4], [x23], #0x2\n"
428         "b 24f\n"
429         "14:" // tail loop: unique 1: partial_1_16
430         "tbz %x[width], #1, 15f\n"
431         "ldr s7, [x28], #0x4\n"
432         "ldr s6, [x27], #0x4\n"
433         "ldr s19, [x24], #0x4\n"
434         "ldr s18, [x23], #0x4\n"
435         "tbz %x[width], #0, 24f\n"
436         "ld1 { v7.h }[2], [x28], #0x2\n"
437         "ld1 { v6.h }[2], [x27], #0x2\n"
438         "ld1 { v19.h }[2], [x24], #0x2\n"
439         "ld1 { v18.h }[2], [x23], #0x2\n"
440         "b 24f\n"
441         "15:" // tail loop: unique 1: partial_0_16
442         "tbz %x[width], #0, 24f\n"
443         "ldr h7, [x28], #0x2\n"
444         "ldr h6, [x27], #0x2\n"
445         "ldr h19, [x24], #0x2\n"
446         "ldr h18, [x23], #0x2\n"
447         "b 24f\n"
448         "16:" // tail loop: unique 1: partial_3_0
449         "tbz %x[width], #3, 20f\n"
450         "ldr q11, [x28, #0x0]\n"
451         "ldr q10, [x27, #0x0]\n"
452         "add x28, x28, #0x10\n"
453         "add x27, x27, #0x10\n"
454         "ldr q3, [x24, #0x0]\n"
455         "ldr q22, [x23, #0x0]\n"
456         "add x24, x24, #0x10\n"
457         "add x23, x23, #0x10\n"
458         "tbz %x[width], #2, 18f\n"
459         "ldr d9, [x28], #0x8\n"
460         "ldr d8, [x27], #0x8\n"
461         "ldr d21, [x24], #0x8\n"
462         "ldr d20, [x23], #0x8\n"
463         "tbz %x[width], #1, 17f\n"
464         "ld1 { v9.s }[2], [x28], #0x4\n"
465         "ld1 { v8.s }[2], [x27], #0x4\n"
466         "ld1 { v21.s }[2], [x24], #0x4\n"
467         "ld1 { v20.s }[2], [x23], #0x4\n"
468         "tbz %x[width], #0, 24f\n"
469         "ld1 { v9.h }[6], [x28], #0x2\n"
470         "ld1 { v8.h }[6], [x27], #0x2\n"
471         "ld1 { v21.h }[6], [x24], #0x2\n"
472         "ld1 { v20.h }[6], [x23], #0x2\n"
473         "b 24f\n"
474         "17:" // tail loop: unique 1: partial_0_12
475         "tbz %x[width], #0, 24f\n"
476         "ld1 { v9.h }[4], [x28], #0x2\n"
477         "ld1 { v8.h }[4], [x27], #0x2\n"
478         "ld1 { v21.h }[4], [x24], #0x2\n"
479         "ld1 { v20.h }[4], [x23], #0x2\n"
480         "b 24f\n"
481         "18:" // tail loop: unique 1: partial_1_8
482         "tbz %x[width], #1, 19f\n"
483         "ldr s9, [x28], #0x4\n"
484         "ldr s8, [x27], #0x4\n"
485         "ldr s21, [x24], #0x4\n"
486         "ldr s20, [x23], #0x4\n"
487         "tbz %x[width], #0, 24f\n"
488         "ld1 { v9.h }[2], [x28], #0x2\n"
489         "ld1 { v8.h }[2], [x27], #0x2\n"
490         "ld1 { v21.h }[2], [x24], #0x2\n"
491         "ld1 { v20.h }[2], [x23], #0x2\n"
492         "b 24f\n"
493         "19:" // tail loop: unique 1: partial_0_8
494         "tbz %x[width], #0, 24f\n"
495         "ldr h9, [x28], #0x2\n"
496         "ldr h8, [x27], #0x2\n"
497         "ldr h21, [x24], #0x2\n"
498         "ldr h20, [x23], #0x2\n"
499         "b 24f\n"
500         "20:" // tail loop: unique 1: partial_2_0
501         "tbz %x[width], #2, 22f\n"
502         "ldr d11, [x28], #0x8\n"
503         "ldr d10, [x27], #0x8\n"
504         "ldr d3, [x24], #0x8\n"
505         "ldr d22, [x23], #0x8\n"
506         "tbz %x[width], #1, 21f\n"
507         "ld1 { v11.s }[2], [x28], #0x4\n"
508         "ld1 { v10.s }[2], [x27], #0x4\n"
509         "ld1 { v3.s }[2], [x24], #0x4\n"
510         "ld1 { v22.s }[2], [x23], #0x4\n"
511         "tbz %x[width], #0, 24f\n"
512         "ld1 { v11.h }[6], [x28], #0x2\n"
513         "ld1 { v10.h }[6], [x27], #0x2\n"
514         "ld1 { v3.h }[6], [x24], #0x2\n"
515         "ld1 { v22.h }[6], [x23], #0x2\n"
516         "b 24f\n"
517         "21:" // tail loop: unique 1: partial_0_4
518         "tbz %x[width], #0, 24f\n"
519         "ld1 { v11.h }[4], [x28], #0x2\n"
520         "ld1 { v10.h }[4], [x27], #0x2\n"
521         "ld1 { v3.h }[4], [x24], #0x2\n"
522         "ld1 { v22.h }[4], [x23], #0x2\n"
523         "b 24f\n"
524         "22:" // tail loop: unique 1: partial_1_0
525         "tbz %x[width], #1, 23f\n"
526         "ldr s11, [x28], #0x4\n"
527         "ldr s10, [x27], #0x4\n"
528         "ldr s3, [x24], #0x4\n"
529         "ldr s22, [x23], #0x4\n"
530         "tbz %x[width], #0, 24f\n"
531         "ld1 { v11.h }[2], [x28], #0x2\n"
532         "ld1 { v10.h }[2], [x27], #0x2\n"
533         "ld1 { v3.h }[2], [x24], #0x2\n"
534         "ld1 { v22.h }[2], [x23], #0x2\n"
535         "b 24f\n"
536         "23:" // tail loop: unique 1: partial_0_0
537         "ldr h11, [x28], #0x2\n"
538         "ldr h10, [x27], #0x2\n"
539         "ldr h3, [x24], #0x2\n"
540         "ldr h22, [x23], #0x2\n"
541         "24:" // tail loop: unique 1: Done
542         "fadd v2.8h, v11.8h, v10.8h\n"
543         "fadd v1.8h, v9.8h, v8.8h\n"
544         "fadd v0.8h, v7.8h, v6.8h\n"
545         "fadd v23.8h, v5.8h, v4.8h\n"
546         "fadd v22.8h, v3.8h, v22.8h\n"
547         "fadd v21.8h, v21.8h, v20.8h\n"
548         "fadd v20.8h, v19.8h, v18.8h\n"
549         "fadd v19.8h, v17.8h, v16.8h\n"
550         "cbz %x[out_direct], 41f\n"
551         "tbz %x[width], #4, 32f\n"
552         "str q2, [x25, #0x0]\n"
553         "str q1, [x25, #0x10]\n"
554         "add x25, x25, #0x20\n"
555         "str q22, [x21, #0x0]\n"
556         "str q21, [x21, #0x10]\n"
557         "add x21, x21, #0x20\n"
558         "tbz %x[width], #3, 28f\n"
559         "str q0, [x25, #0x0]\n"
560         "add x25, x25, #0x10\n"
561         "str q20, [x21, #0x0]\n"
562         "add x21, x21, #0x10\n"
563         "tbz %x[width], #2, 26f\n"
564         "str d23, [x25], #0x8\n"
565         "str d19, [x21], #0x8\n"
566         "tbz %x[width], #1, 25f\n"
567         "st1 { v23.s }[2], [x25], #0x4\n"
568         "st1 { v19.s }[2], [x21], #0x4\n"
569         "tbz %x[width], #0, 40f\n"
570         "st1 { v23.h }[6], [x25], #0x2\n"
571         "st1 { v19.h }[6], [x21], #0x2\n"
572         "b 40f\n"
573         "25:" // tail loop: Main loop: unique 2: partial_0_28
574         "tbz %x[width], #0, 40f\n"
575         "st1 { v23.h }[4], [x25], #0x2\n"
576         "st1 { v19.h }[4], [x21], #0x2\n"
577         "b 40f\n"
578         "26:" // tail loop: Main loop: unique 2: partial_1_24
579         "tbz %x[width], #1, 27f\n"
580         "str s23, [x25], #0x4\n"
581         "str s19, [x21], #0x4\n"
582         "tbz %x[width], #0, 40f\n"
583         "st1 { v23.h }[2], [x25], #0x2\n"
584         "st1 { v19.h }[2], [x21], #0x2\n"
585         "b 40f\n"
586         "27:" // tail loop: Main loop: unique 2: partial_0_24
587         "tbz %x[width], #0, 40f\n"
588         "str h23, [x25], #0x2\n"
589         "str h19, [x21], #0x2\n"
590         "b 40f\n"
591         "28:" // tail loop: Main loop: unique 2: partial_2_16
592         "tbz %x[width], #2, 30f\n"
593         "str d0, [x25], #0x8\n"
594         "str d20, [x21], #0x8\n"
595         "tbz %x[width], #1, 29f\n"
596         "st1 { v0.s }[2], [x25], #0x4\n"
597         "st1 { v20.s }[2], [x21], #0x4\n"
598         "tbz %x[width], #0, 40f\n"
599         "st1 { v0.h }[6], [x25], #0x2\n"
600         "st1 { v20.h }[6], [x21], #0x2\n"
601         "b 40f\n"
602         "29:" // tail loop: Main loop: unique 2: partial_0_20
603         "tbz %x[width], #0, 40f\n"
604         "st1 { v0.h }[4], [x25], #0x2\n"
605         "st1 { v20.h }[4], [x21], #0x2\n"
606         "b 40f\n"
607         "30:" // tail loop: Main loop: unique 2: partial_1_16
608         "tbz %x[width], #1, 31f\n"
609         "str s0, [x25], #0x4\n"
610         "str s20, [x21], #0x4\n"
611         "tbz %x[width], #0, 40f\n"
612         "st1 { v0.h }[2], [x25], #0x2\n"
613         "st1 { v20.h }[2], [x21], #0x2\n"
614         "b 40f\n"
615         "31:" // tail loop: Main loop: unique 2: partial_0_16
616         "tbz %x[width], #0, 40f\n"
617         "str h0, [x25], #0x2\n"
618         "str h20, [x21], #0x2\n"
619         "b 40f\n"
620         "32:" // tail loop: Main loop: unique 2: partial_3_0
621         "tbz %x[width], #3, 36f\n"
622         "str q2, [x25, #0x0]\n"
623         "add x25, x25, #0x10\n"
624         "str q22, [x21, #0x0]\n"
625         "add x21, x21, #0x10\n"
626         "tbz %x[width], #2, 34f\n"
627         "str d1, [x25], #0x8\n"
628         "str d21, [x21], #0x8\n"
629         "tbz %x[width], #1, 33f\n"
630         "st1 { v1.s }[2], [x25], #0x4\n"
631         "st1 { v21.s }[2], [x21], #0x4\n"
632         "tbz %x[width], #0, 40f\n"
633         "st1 { v1.h }[6], [x25], #0x2\n"
634         "st1 { v21.h }[6], [x21], #0x2\n"
635         "b 40f\n"
636         "33:" // tail loop: Main loop: unique 2: partial_0_12
637         "tbz %x[width], #0, 40f\n"
638         "st1 { v1.h }[4], [x25], #0x2\n"
639         "st1 { v21.h }[4], [x21], #0x2\n"
640         "b 40f\n"
641         "34:" // tail loop: Main loop: unique 2: partial_1_8
642         "tbz %x[width], #1, 35f\n"
643         "str s1, [x25], #0x4\n"
644         "str s21, [x21], #0x4\n"
645         "tbz %x[width], #0, 40f\n"
646         "st1 { v1.h }[2], [x25], #0x2\n"
647         "st1 { v21.h }[2], [x21], #0x2\n"
648         "b 40f\n"
649         "35:" // tail loop: Main loop: unique 2: partial_0_8
650         "tbz %x[width], #0, 40f\n"
651         "str h1, [x25], #0x2\n"
652         "str h21, [x21], #0x2\n"
653         "b 40f\n"
654         "36:" // tail loop: Main loop: unique 2: partial_2_0
655         "tbz %x[width], #2, 38f\n"
656         "str d2, [x25], #0x8\n"
657         "str d22, [x21], #0x8\n"
658         "tbz %x[width], #1, 37f\n"
659         "st1 { v2.s }[2], [x25], #0x4\n"
660         "st1 { v22.s }[2], [x21], #0x4\n"
661         "tbz %x[width], #0, 40f\n"
662         "st1 { v2.h }[6], [x25], #0x2\n"
663         "st1 { v22.h }[6], [x21], #0x2\n"
664         "b 40f\n"
665         "37:" // tail loop: Main loop: unique 2: partial_0_4
666         "tbz %x[width], #0, 40f\n"
667         "st1 { v2.h }[4], [x25], #0x2\n"
668         "st1 { v22.h }[4], [x21], #0x2\n"
669         "b 40f\n"
670         "38:" // tail loop: Main loop: unique 2: partial_1_0
671         "tbz %x[width], #1, 39f\n"
672         "str s2, [x25], #0x4\n"
673         "str s22, [x21], #0x4\n"
674         "tbz %x[width], #0, 40f\n"
675         "st1 { v2.h }[2], [x25], #0x2\n"
676         "st1 { v22.h }[2], [x21], #0x2\n"
677         "b 40f\n"
678         "39:" // tail loop: Main loop: unique 2: partial_0_0
679         "str h2, [x25], #0x2\n"
680         "str h22, [x21], #0x2\n"
681         "40:" // tail loop: Main loop: unique 2: Done
682         "41:" // tail loop: Main loop: No direct output
683         "mov v16.16b, v2.16b\n"
684         "mov v2.16b, v28.16b\n"
685         "fmla v2.8h, v16.8h, v24.8h\n"
686         "mov v16.16b, v1.16b\n"
687         "mov v1.16b, v29.16b\n"
688         "fmla v1.8h, v16.8h, v25.8h\n"
689         "mov v16.16b, v0.16b\n"
690         "mov v0.16b, v30.16b\n"
691         "fmla v0.8h, v16.8h, v26.8h\n"
692         "mov v16.16b, v23.16b\n"
693         "mov v23.16b, v31.16b\n"
694         "fmla v23.8h, v16.8h, v27.8h\n"
695         "mov v16.16b, v22.16b\n"
696         "mov v22.16b, v28.16b\n"
697         "fmla v22.8h, v16.8h, v24.8h\n"
698         "mov v16.16b, v21.16b\n"
699         "mov v21.16b, v29.16b\n"
700         "fmla v21.8h, v16.8h, v25.8h\n"
701         "mov v16.16b, v20.16b\n"
702         "mov v20.16b, v30.16b\n"
703         "fmla v20.8h, v16.8h, v26.8h\n"
704         "mov v16.16b, v19.16b\n"
705         "mov v19.16b, v31.16b\n"
706         "fmla v19.8h, v16.8h, v27.8h\n"
707         "fmin v2.8h, v2.8h, v12.8h\n"
708         "fmin v1.8h, v1.8h, v12.8h\n"
709         "fmin v0.8h, v0.8h, v12.8h\n"
710         "fmin v23.8h, v23.8h, v12.8h\n"
711         "fmin v22.8h, v22.8h, v12.8h\n"
712         "fmin v21.8h, v21.8h, v12.8h\n"
713         "fmin v20.8h, v20.8h, v12.8h\n"
714         "fmin v19.8h, v19.8h, v12.8h\n"
715         "fmax v2.8h, v2.8h, v13.8h\n"
716         "fmax v1.8h, v1.8h, v13.8h\n"
717         "fmax v0.8h, v0.8h, v13.8h\n"
718         "fmax v23.8h, v23.8h, v13.8h\n"
719         "fmax v22.8h, v22.8h, v13.8h\n"
720         "fmax v21.8h, v21.8h, v13.8h\n"
721         "fmax v20.8h, v20.8h, v13.8h\n"
722         "fmax v19.8h, v19.8h, v13.8h\n"
723         "tbz %x[width], #4, 49f\n"
724         "str q2, [x26, #0x0]\n"
725         "str q1, [x26, #0x10]\n"
726         "add x26, x26, #0x20\n"
727         "str q22, [x22, #0x0]\n"
728         "str q21, [x22, #0x10]\n"
729         "add x22, x22, #0x20\n"
730         "tbz %x[width], #3, 45f\n"
731         "str q0, [x26, #0x0]\n"
732         "add x26, x26, #0x10\n"
733         "str q20, [x22, #0x0]\n"
734         "add x22, x22, #0x10\n"
735         "tbz %x[width], #2, 43f\n"
736         "str d23, [x26], #0x8\n"
737         "str d19, [x22], #0x8\n"
738         "tbz %x[width], #1, 42f\n"
739         "st1 { v23.s }[2], [x26], #0x4\n"
740         "st1 { v19.s }[2], [x22], #0x4\n"
741         "tbz %x[width], #0, 57f\n"
742         "st1 { v23.h }[6], [x26], #0x2\n"
743         "st1 { v19.h }[6], [x22], #0x2\n"
744         "b 57f\n"
745         "42:" // tail loop: unique 3: partial_0_28
746         "tbz %x[width], #0, 57f\n"
747         "st1 { v23.h }[4], [x26], #0x2\n"
748         "st1 { v19.h }[4], [x22], #0x2\n"
749         "b 57f\n"
750         "43:" // tail loop: unique 3: partial_1_24
751         "tbz %x[width], #1, 44f\n"
752         "str s23, [x26], #0x4\n"
753         "str s19, [x22], #0x4\n"
754         "tbz %x[width], #0, 57f\n"
755         "st1 { v23.h }[2], [x26], #0x2\n"
756         "st1 { v19.h }[2], [x22], #0x2\n"
757         "b 57f\n"
758         "44:" // tail loop: unique 3: partial_0_24
759         "tbz %x[width], #0, 57f\n"
760         "str h23, [x26], #0x2\n"
761         "str h19, [x22], #0x2\n"
762         "b 57f\n"
763         "45:" // tail loop: unique 3: partial_2_16
764         "tbz %x[width], #2, 47f\n"
765         "str d0, [x26], #0x8\n"
766         "str d20, [x22], #0x8\n"
767         "tbz %x[width], #1, 46f\n"
768         "st1 { v0.s }[2], [x26], #0x4\n"
769         "st1 { v20.s }[2], [x22], #0x4\n"
770         "tbz %x[width], #0, 57f\n"
771         "st1 { v0.h }[6], [x26], #0x2\n"
772         "st1 { v20.h }[6], [x22], #0x2\n"
773         "b 57f\n"
774         "46:" // tail loop: unique 3: partial_0_20
775         "tbz %x[width], #0, 57f\n"
776         "st1 { v0.h }[4], [x26], #0x2\n"
777         "st1 { v20.h }[4], [x22], #0x2\n"
778         "b 57f\n"
779         "47:" // tail loop: unique 3: partial_1_16
780         "tbz %x[width], #1, 48f\n"
781         "str s0, [x26], #0x4\n"
782         "str s20, [x22], #0x4\n"
783         "tbz %x[width], #0, 57f\n"
784         "st1 { v0.h }[2], [x26], #0x2\n"
785         "st1 { v20.h }[2], [x22], #0x2\n"
786         "b 57f\n"
787         "48:" // tail loop: unique 3: partial_0_16
788         "tbz %x[width], #0, 57f\n"
789         "str h0, [x26], #0x2\n"
790         "str h20, [x22], #0x2\n"
791         "b 57f\n"
792         "49:" // tail loop: unique 3: partial_3_0
793         "tbz %x[width], #3, 53f\n"
794         "str q2, [x26, #0x0]\n"
795         "add x26, x26, #0x10\n"
796         "str q22, [x22, #0x0]\n"
797         "add x22, x22, #0x10\n"
798         "tbz %x[width], #2, 51f\n"
799         "str d1, [x26], #0x8\n"
800         "str d21, [x22], #0x8\n"
801         "tbz %x[width], #1, 50f\n"
802         "st1 { v1.s }[2], [x26], #0x4\n"
803         "st1 { v21.s }[2], [x22], #0x4\n"
804         "tbz %x[width], #0, 57f\n"
805         "st1 { v1.h }[6], [x26], #0x2\n"
806         "st1 { v21.h }[6], [x22], #0x2\n"
807         "b 57f\n"
808         "50:" // tail loop: unique 3: partial_0_12
809         "tbz %x[width], #0, 57f\n"
810         "st1 { v1.h }[4], [x26], #0x2\n"
811         "st1 { v21.h }[4], [x22], #0x2\n"
812         "b 57f\n"
813         "51:" // tail loop: unique 3: partial_1_8
814         "tbz %x[width], #1, 52f\n"
815         "str s1, [x26], #0x4\n"
816         "str s21, [x22], #0x4\n"
817         "tbz %x[width], #0, 57f\n"
818         "st1 { v1.h }[2], [x26], #0x2\n"
819         "st1 { v21.h }[2], [x22], #0x2\n"
820         "b 57f\n"
821         "52:" // tail loop: unique 3: partial_0_8
822         "tbz %x[width], #0, 57f\n"
823         "str h1, [x26], #0x2\n"
824         "str h21, [x22], #0x2\n"
825         "b 57f\n"
826         "53:" // tail loop: unique 3: partial_2_0
827         "tbz %x[width], #2, 55f\n"
828         "str d2, [x26], #0x8\n"
829         "str d22, [x22], #0x8\n"
830         "tbz %x[width], #1, 54f\n"
831         "st1 { v2.s }[2], [x26], #0x4\n"
832         "st1 { v22.s }[2], [x22], #0x4\n"
833         "tbz %x[width], #0, 57f\n"
834         "st1 { v2.h }[6], [x26], #0x2\n"
835         "st1 { v22.h }[6], [x22], #0x2\n"
836         "b 57f\n"
837         "54:" // tail loop: unique 3: partial_0_4
838         "tbz %x[width], #0, 57f\n"
839         "st1 { v2.h }[4], [x26], #0x2\n"
840         "st1 { v22.h }[4], [x22], #0x2\n"
841         "b 57f\n"
842         "55:" // tail loop: unique 3: partial_1_0
843         "tbz %x[width], #1, 56f\n"
844         "str s2, [x26], #0x4\n"
845         "str s22, [x22], #0x4\n"
846         "tbz %x[width], #0, 57f\n"
847         "st1 { v2.h }[2], [x26], #0x2\n"
848         "st1 { v22.h }[2], [x22], #0x2\n"
849         "b 57f\n"
850         "56:" // tail loop: unique 3: partial_0_0
851         "str h2, [x26], #0x2\n"
852         "str h22, [x22], #0x2\n"
853         "57:" // tail loop: unique 3: Done
854         "subs x20, x20, #0x2\n"
855         "bgt 8b\n"
856         "58:" // odd columns skip
857         : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
858         : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
859         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
860 }
861 
862 } // namespace
863 
864 namespace arm_compute
865 {
866 namespace cpu
867 {
add_mul_add_fp16_neon(const ITensor * input1,const ITensor * input2,const ITensor * bn_mul,const ITensor * bn_add,ITensor * add_output,ITensor * final_output,ConvertPolicy policy,const ActivationLayerInfo & act_info,const Window & window)868 void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
869                            ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
870 {
871     ARM_COMPUTE_UNUSED(policy);
872 
873     const size_t out_stride        = final_output->info()->strides_in_bytes()[1];
874     const size_t out_direct_stride = (add_output != nullptr) ? add_output->info()->strides_in_bytes()[1] : 0;
875     const size_t in0_stride        = input1->info()->strides_in_bytes()[1];
876     const size_t in1_stride        = input2->info()->strides_in_bytes()[1];
877 
878     float16_t minval = std::numeric_limits<half>::lowest();
879     float16_t maxval = std::numeric_limits<half>::max();
880 
881     if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
882     {
883         minval = static_cast<float16_t>(0.f);
884     }
885     else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
886     {
887         minval = static_cast<float16_t>(0.f);
888         maxval = static_cast<float16_t>(act_info.a());
889     }
890     else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
891     {
892         minval = static_cast<float16_t>(act_info.b());
893         maxval = static_cast<float16_t>(act_info.a());
894     }
895 
896     // Clear X & Y dimensions on execution window as we handle manually
897     Window win = window;
898     win.set(Window::DimX, Window::Dimension(0, 1, 1));
899     win.set(Window::DimY, Window::Dimension(0, 1, 1));
900 
901     Iterator in1_it(input1, window);
902     Iterator in2_it(input2, window);
903     Iterator out_it(final_output, window);
904 
905     const size_t width  = window.num_iterations(0);
906     const size_t height = window.num_iterations(1);
907 
908     if(add_output != nullptr)
909     {
910         Iterator add_out_it(add_output, window);
911         execute_window_loop(
912             win, [&](const Coordinates &)
913         {
914             a64_add_bn_clamp_direct_fp16_2x32(
915                 reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
916                 reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
917                 reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
918                 reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
919                 reinterpret_cast<float16_t *>(bn_mul->buffer()),
920                 reinterpret_cast<float16_t *>(bn_add->buffer()),
921                 minval,
922                 maxval,
923                 width, height);
924         },
925         in1_it, in2_it, add_out_it, out_it);
926     }
927     else
928     {
929         execute_window_loop(
930             win, [&](const Coordinates &)
931         {
932             a64_add_bn_clamp_direct_fp16_2x32(
933                 reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
934                 nullptr, out_direct_stride,
935                 reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
936                 reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
937                 reinterpret_cast<float16_t *>(bn_mul->buffer()),
938                 reinterpret_cast<float16_t *>(bn_add->buffer()),
939                 minval,
940                 maxval,
941                 width, height);
942         },
943         in1_it, in2_it, out_it);
944     }
945 }
946 } // namespace cpu
947 } // namespace arm_compute
948 
949 #endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
950