// Auto-generated file. Do not edit!
//   Template: src/f16-igemm/neonfp16arith-ld64.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/igemm.h>

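// Computes an up-to-8-row by 16-column tile of the output in fp16 arithmetic.
// Activations are gathered through the indirection buffer `a` (8 row pointers
// per iteration of the middle loop); `w` holds the packed bias followed by
// interleaved weights. "ld64" refers to the 64-bit (4 x fp16) activation
// loads in the main loop.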
void xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64(
    size_t mr,
    size_t nc,
    size_t kc,
    size_t ks,
    const void** restrict a,
    const void* restrict w,
    void* restrict c,
    size_t cm_stride,
    size_t cn_stride,
    size_t a_offset,
    const void* zero,
    const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mr != 0);
  assert(mr <= 8);
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(__fp16) == 0);
  assert(ks != 0);
  assert(ks % (8 * sizeof(void*)) == 0);
  assert(a_offset % sizeof(__fp16) == 0);
  assert(a != NULL);
  assert(w != NULL);
  assert(c != NULL);

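  // Derive one output row pointer per accumulator row. For mr < 8 the unused
  // row pointers are clamped onto the previous row, so out-of-range rows are
  // still computed but harmlessly overwrite a valid row instead of writing
  // out of bounds.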
  __fp16* c0 = (__fp16*) c;
  __fp16* c1 = (__fp16*) ((uintptr_t) c0 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 2) {
    c1 = c0;
  }
  __fp16* c2 = (__fp16*) ((uintptr_t) c1 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 2) {
    c2 = c1;
  }
  __fp16* c3 = (__fp16*) ((uintptr_t) c2 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 4) {
    c3 = c2;
  }
  __fp16* c4 = (__fp16*) ((uintptr_t) c3 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 4) {
    c4 = c3;
  }
  __fp16* c5 = (__fp16*) ((uintptr_t) c4 + cm_stride);
  if XNN_UNPREDICTABLE(mr < 6) {
    c5 = c4;
  }
  __fp16* c6 = (__fp16*) ((uintptr_t) c5 + cm_stride);
  if XNN_UNPREDICTABLE(mr <= 6) {
    c6 = c5;
  }
  __fp16* c7 = (__fp16*) ((uintptr_t) c6 + cm_stride);
  if XNN_UNPREDICTABLE(mr != 8) {
    c7 = c6;
  }

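  // Outer loop: one iteration per 16-column slice of the output. Each slice
  // starts by loading 16 bias values from `w` into the row-0 accumulators
  // and replicating them into the other seven rows.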
  do {
    float16x8_t vacc0x01234567 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));
    float16x8_t vacc0x89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));
    float16x8_t vacc1x01234567 = vacc0x01234567;
    float16x8_t vacc1x89ABCDEF = vacc0x89ABCDEF;
    float16x8_t vacc2x01234567 = vacc0x01234567;
    float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF;
    float16x8_t vacc3x01234567 = vacc0x01234567;
    float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF;
    float16x8_t vacc4x01234567 = vacc0x01234567;
    float16x8_t vacc4x89ABCDEF = vacc0x89ABCDEF;
    float16x8_t vacc5x01234567 = vacc0x01234567;
    float16x8_t vacc5x89ABCDEF = vacc0x89ABCDEF;
    float16x8_t vacc6x01234567 = vacc0x01234567;
    float16x8_t vacc6x89ABCDEF = vacc0x89ABCDEF;
    float16x8_t vacc7x01234567 = vacc0x01234567;
    float16x8_t vacc7x89ABCDEF = vacc0x89ABCDEF;

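    // Middle loop over the indirection buffer: fetch 8 activation row
    // pointers per iteration. Pointers equal to `zero` reference the shared
    // zero buffer and are used as-is; all others are rebased by a_offset.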
    size_t p = ks;
    do {
      const __fp16* restrict a0 = (const __fp16*) a[0];
      assert(a0 != NULL);
      if XNN_UNPREDICTABLE(a0 != zero) {
        a0 = (const __fp16*) ((uintptr_t) a0 + a_offset);
      }
      const __fp16* restrict a1 = (const __fp16*) a[1];
      assert(a1 != NULL);
      if XNN_UNPREDICTABLE(a1 != zero) {
        a1 = (const __fp16*) ((uintptr_t) a1 + a_offset);
      }
      const __fp16* restrict a2 = (const __fp16*) a[2];
      assert(a2 != NULL);
      if XNN_UNPREDICTABLE(a2 != zero) {
        a2 = (const __fp16*) ((uintptr_t) a2 + a_offset);
      }
      const __fp16* restrict a3 = (const __fp16*) a[3];
      assert(a3 != NULL);
      if XNN_UNPREDICTABLE(a3 != zero) {
        a3 = (const __fp16*) ((uintptr_t) a3 + a_offset);
      }
      const __fp16* restrict a4 = (const __fp16*) a[4];
      assert(a4 != NULL);
      if XNN_UNPREDICTABLE(a4 != zero) {
        a4 = (const __fp16*) ((uintptr_t) a4 + a_offset);
      }
      const __fp16* restrict a5 = (const __fp16*) a[5];
      assert(a5 != NULL);
      if XNN_UNPREDICTABLE(a5 != zero) {
        a5 = (const __fp16*) ((uintptr_t) a5 + a_offset);
      }
      const __fp16* restrict a6 = (const __fp16*) a[6];
      assert(a6 != NULL);
      if XNN_UNPREDICTABLE(a6 != zero) {
        a6 = (const __fp16*) ((uintptr_t) a6 + a_offset);
      }
      const __fp16* restrict a7 = (const __fp16*) a[7];
      assert(a7 != NULL);
      if XNN_UNPREDICTABLE(a7 != zero) {
        a7 = (const __fp16*) ((uintptr_t) a7 + a_offset);
      }
      a += 8;

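      // Main reduction loop: consume 4 fp16 activations per row per
      // iteration with a single 64-bit load (the "ld64" in the kernel name),
      // then accumulate against 4 x 16 weights using one FMA per lane.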
      size_t k = kc;
      for (; k >= 4 * sizeof(__fp16); k -= 4 * sizeof(__fp16)) {
        const float16x4_t va0 = vld1_f16(a0); a0 += 4;
        const float16x4_t va1 = vld1_f16(a1); a1 += 4;
        const float16x4_t va2 = vld1_f16(a2); a2 += 4;
        const float16x4_t va3 = vld1_f16(a3); a3 += 4;
        const float16x4_t va4 = vld1_f16(a4); a4 += 4;
        const float16x4_t va5 = vld1_f16(a5); a5 += 4;
        const float16x4_t va6 = vld1_f16(a6); a6 += 4;
        const float16x4_t va7 = vld1_f16(a7); a7 += 4;

        const float16x8_t vb01234567c0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));
        const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));

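        // On AArch64, vfmaq_lane_f16 folds the lane select into the FMA; on
        // AArch32 the generator instead broadcasts each lane with
        // vdupq_lane_f16 and uses a plain vfmaq_f16.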
        #if XNN_ARCH_ARM64
          vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0);
          vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c0, va1, 0);
          vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c0, va2, 0);
          vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c0, va3, 0);
          vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
          vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
          vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c0, va6, 0);
          vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c0, va7, 0);
          vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0);
          vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0);
          vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0);
          vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0);
          vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc0, va4, 0);
          vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0);
          vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc0, va6, 0);
          vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc0, va7, 0);
        #else
          const float16x8_t va0c0 = vdupq_lane_f16(va0, 0);
          const float16x8_t va1c0 = vdupq_lane_f16(va1, 0);
          const float16x8_t va2c0 = vdupq_lane_f16(va2, 0);
          const float16x8_t va3c0 = vdupq_lane_f16(va3, 0);
          const float16x8_t va4c0 = vdupq_lane_f16(va4, 0);
          const float16x8_t va5c0 = vdupq_lane_f16(va5, 0);
          const float16x8_t va6c0 = vdupq_lane_f16(va6, 0);
          const float16x8_t va7c0 = vdupq_lane_f16(va7, 0);

          vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c0, vb01234567c0);
          vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c0, vb01234567c0);
          vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c0, vb01234567c0);
          vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c0, vb01234567c0);
          vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
          vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
          vacc6x01234567 = vfmaq_f16(vacc6x01234567, va6c0, vb01234567c0);
          vacc7x01234567 = vfmaq_f16(vacc7x01234567, va7c0, vb01234567c0);
          vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0);
          vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c0, vb89ABCDEFc0);
          vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0);
          vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0);
          vacc4x89ABCDEF = vfmaq_f16(vacc4x89ABCDEF, va4c0, vb89ABCDEFc0);
          vacc5x89ABCDEF = vfmaq_f16(vacc5x89ABCDEF, va5c0, vb89ABCDEFc0);
          vacc6x89ABCDEF = vfmaq_f16(vacc6x89ABCDEF, va6c0, vb89ABCDEFc0);
          vacc7x89ABCDEF = vfmaq_f16(vacc7x89ABCDEF, va7c0, vb89ABCDEFc0);
        #endif
        const float16x8_t vb01234567c1 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));
        const float16x8_t vb89ABCDEFc1 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));

        #if XNN_ARCH_ARM64
          vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c1, va0, 1);
          vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c1, va1, 1);
          vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c1, va2, 1);
          vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c1, va3, 1);
          vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
          vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
          vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c1, va6, 1);
          vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c1, va7, 1);
          vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1);
          vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc1, va1, 1);
          vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc1, va2, 1);
          vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1);
          vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc1, va4, 1);
          vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc1, va5, 1);
          vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc1, va6, 1);
          vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc1, va7, 1);
        #else
          const float16x8_t va0c1 = vdupq_lane_f16(va0, 1);
          const float16x8_t va1c1 = vdupq_lane_f16(va1, 1);
          const float16x8_t va2c1 = vdupq_lane_f16(va2, 1);
          const float16x8_t va3c1 = vdupq_lane_f16(va3, 1);
          const float16x8_t va4c1 = vdupq_lane_f16(va4, 1);
          const float16x8_t va5c1 = vdupq_lane_f16(va5, 1);
          const float16x8_t va6c1 = vdupq_lane_f16(va6, 1);
          const float16x8_t va7c1 = vdupq_lane_f16(va7, 1);

          vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c1, vb01234567c1);
          vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c1, vb01234567c1);
          vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c1, vb01234567c1);
          vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c1, vb01234567c1);
          vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
          vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
          vacc6x01234567 = vfmaq_f16(vacc6x01234567, va6c1, vb01234567c1);
          vacc7x01234567 = vfmaq_f16(vacc7x01234567, va7c1, vb01234567c1);
          vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c1, vb89ABCDEFc1);
          vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c1, vb89ABCDEFc1);
          vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c1, vb89ABCDEFc1);
          vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1);
          vacc4x89ABCDEF = vfmaq_f16(vacc4x89ABCDEF, va4c1, vb89ABCDEFc1);
          vacc5x89ABCDEF = vfmaq_f16(vacc5x89ABCDEF, va5c1, vb89ABCDEFc1);
          vacc6x89ABCDEF = vfmaq_f16(vacc6x89ABCDEF, va6c1, vb89ABCDEFc1);
          vacc7x89ABCDEF = vfmaq_f16(vacc7x89ABCDEF, va7c1, vb89ABCDEFc1);
        #endif
        const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));
        const float16x8_t vb89ABCDEFc2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));

        #if XNN_ARCH_ARM64
          vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
          vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
          vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
          vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
          vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
          vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
          vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);
          vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);
          vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2);
          vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc2, va1, 2);
          vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc2, va2, 2);
          vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2);
          vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc2, va4, 2);
          vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc2, va5, 2);
          vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc2, va6, 2);
          vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc2, va7, 2);
        #else
          const float16x8_t va0c2 = vdupq_lane_f16(va0, 2);
          const float16x8_t va1c2 = vdupq_lane_f16(va1, 2);
          const float16x8_t va2c2 = vdupq_lane_f16(va2, 2);
          const float16x8_t va3c2 = vdupq_lane_f16(va3, 2);
          const float16x8_t va4c2 = vdupq_lane_f16(va4, 2);
          const float16x8_t va5c2 = vdupq_lane_f16(va5, 2);
          const float16x8_t va6c2 = vdupq_lane_f16(va6, 2);
          const float16x8_t va7c2 = vdupq_lane_f16(va7, 2);

          vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
          vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
          vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
          vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
          vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
          vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
          vacc6x01234567 = vfmaq_f16(vacc6x01234567, va6c2, vb01234567c2);
          vacc7x01234567 = vfmaq_f16(vacc7x01234567, va7c2, vb01234567c2);
          vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c2, vb89ABCDEFc2);
          vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c2, vb89ABCDEFc2);
          vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c2, vb89ABCDEFc2);
          vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2);
          vacc4x89ABCDEF = vfmaq_f16(vacc4x89ABCDEF, va4c2, vb89ABCDEFc2);
          vacc5x89ABCDEF = vfmaq_f16(vacc5x89ABCDEF, va5c2, vb89ABCDEFc2);
          vacc6x89ABCDEF = vfmaq_f16(vacc6x89ABCDEF, va6c2, vb89ABCDEFc2);
          vacc7x89ABCDEF = vfmaq_f16(vacc7x89ABCDEF, va7c2, vb89ABCDEFc2);
        #endif
        const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));
        const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));

        #if XNN_ARCH_ARM64
          vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
          vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
          vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
          vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
          vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
          vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
          vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3);
          vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3);
          vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
          vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
          vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
          vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
          vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc3, va4, 3);
          vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc3, va5, 3);
          vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc3, va6, 3);
          vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc3, va7, 3);
        #else
          const float16x8_t va0c3 = vdupq_lane_f16(va0, 3);
          const float16x8_t va1c3 = vdupq_lane_f16(va1, 3);
          const float16x8_t va2c3 = vdupq_lane_f16(va2, 3);
          const float16x8_t va3c3 = vdupq_lane_f16(va3, 3);
          const float16x8_t va4c3 = vdupq_lane_f16(va4, 3);
          const float16x8_t va5c3 = vdupq_lane_f16(va5, 3);
          const float16x8_t va6c3 = vdupq_lane_f16(va6, 3);
          const float16x8_t va7c3 = vdupq_lane_f16(va7, 3);

          vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
          vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
          vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
          vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
          vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
          vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
          vacc6x01234567 = vfmaq_f16(vacc6x01234567, va6c3, vb01234567c3);
          vacc7x01234567 = vfmaq_f16(vacc7x01234567, va7c3, vb01234567c3);
          vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
          vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c3, vb89ABCDEFc3);
          vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3);
          vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3);
          vacc4x89ABCDEF = vfmaq_f16(vacc4x89ABCDEF, va4c3, vb89ABCDEFc3);
          vacc5x89ABCDEF = vfmaq_f16(vacc5x89ABCDEF, va5c3, vb89ABCDEFc3);
          vacc6x89ABCDEF = vfmaq_f16(vacc6x89ABCDEF, va6c3, vb89ABCDEFc3);
          vacc7x89ABCDEF = vfmaq_f16(vacc7x89ABCDEF, va7c3, vb89ABCDEFc3);
        #endif
      }
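      // Remainder loop for kc not divisible by 4: process one fp16 element
      // per row at a time, broadcast across all lanes via vld1q_dup_f16.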
      if XNN_UNLIKELY(k != 0) {
        do {
          const float16x8_t va0 = vld1q_dup_f16(a0); a0 += 1;
          const float16x8_t va1 = vld1q_dup_f16(a1); a1 += 1;
          const float16x8_t va2 = vld1q_dup_f16(a2); a2 += 1;
          const float16x8_t va3 = vld1q_dup_f16(a3); a3 += 1;
          const float16x8_t va4 = vld1q_dup_f16(a4); a4 += 1;
          const float16x8_t va5 = vld1q_dup_f16(a5); a5 += 1;
          const float16x8_t va6 = vld1q_dup_f16(a6); a6 += 1;
          const float16x8_t va7 = vld1q_dup_f16(a7); a7 += 1;

          const float16x8_t vb01234567 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));
          const float16x8_t vb89ABCDEF = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8_t));

          vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0, vb01234567);
          vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1, vb01234567);
          vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2, vb01234567);
          vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3, vb01234567);
          vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
          vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
          vacc6x01234567 = vfmaq_f16(vacc6x01234567, va6, vb01234567);
          vacc7x01234567 = vfmaq_f16(vacc7x01234567, va7, vb01234567);
          vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0, vb89ABCDEF);
          vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1, vb89ABCDEF);
          vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2, vb89ABCDEF);
          vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF);
          vacc4x89ABCDEF = vfmaq_f16(vacc4x89ABCDEF, va4, vb89ABCDEF);
          vacc5x89ABCDEF = vfmaq_f16(vacc5x89ABCDEF, va5, vb89ABCDEF);
          vacc6x89ABCDEF = vfmaq_f16(vacc6x89ABCDEF, va6, vb89ABCDEF);
          vacc7x89ABCDEF = vfmaq_f16(vacc7x89ABCDEF, va7, vb89ABCDEF);

          k -= sizeof(__fp16);
        } while (k != 0);
      }
      p -= 8 * sizeof(void*);
    } while (p != 0);


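    // Clamp accumulators to [min, max]. The bounds are stored in the params
    // struct as raw fp16 bit patterns, hence the u16 load + reinterpret.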
    const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
    vacc0x01234567 = vminq_f16(vacc0x01234567, vmax);
    vacc1x01234567 = vminq_f16(vacc1x01234567, vmax);
    vacc2x01234567 = vminq_f16(vacc2x01234567, vmax);
    vacc3x01234567 = vminq_f16(vacc3x01234567, vmax);
    vacc4x01234567 = vminq_f16(vacc4x01234567, vmax);
    vacc5x01234567 = vminq_f16(vacc5x01234567, vmax);
    vacc6x01234567 = vminq_f16(vacc6x01234567, vmax);
    vacc7x01234567 = vminq_f16(vacc7x01234567, vmax);
    vacc0x89ABCDEF = vminq_f16(vacc0x89ABCDEF, vmax);
    vacc1x89ABCDEF = vminq_f16(vacc1x89ABCDEF, vmax);
    vacc2x89ABCDEF = vminq_f16(vacc2x89ABCDEF, vmax);
    vacc3x89ABCDEF = vminq_f16(vacc3x89ABCDEF, vmax);
    vacc4x89ABCDEF = vminq_f16(vacc4x89ABCDEF, vmax);
    vacc5x89ABCDEF = vminq_f16(vacc5x89ABCDEF, vmax);
    vacc6x89ABCDEF = vminq_f16(vacc6x89ABCDEF, vmax);
    vacc7x89ABCDEF = vminq_f16(vacc7x89ABCDEF, vmax);

    const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
    vacc0x01234567 = vmaxq_f16(vacc0x01234567, vmin);
    vacc1x01234567 = vmaxq_f16(vacc1x01234567, vmin);
    vacc2x01234567 = vmaxq_f16(vacc2x01234567, vmin);
    vacc3x01234567 = vmaxq_f16(vacc3x01234567, vmin);
    vacc4x01234567 = vmaxq_f16(vacc4x01234567, vmin);
    vacc5x01234567 = vmaxq_f16(vacc5x01234567, vmin);
    vacc6x01234567 = vmaxq_f16(vacc6x01234567, vmin);
    vacc7x01234567 = vmaxq_f16(vacc7x01234567, vmin);
    vacc0x89ABCDEF = vmaxq_f16(vacc0x89ABCDEF, vmin);
    vacc1x89ABCDEF = vmaxq_f16(vacc1x89ABCDEF, vmin);
    vacc2x89ABCDEF = vmaxq_f16(vacc2x89ABCDEF, vmin);
    vacc3x89ABCDEF = vmaxq_f16(vacc3x89ABCDEF, vmin);
    vacc4x89ABCDEF = vmaxq_f16(vacc4x89ABCDEF, vmin);
    vacc5x89ABCDEF = vmaxq_f16(vacc5x89ABCDEF, vmin);
    vacc6x89ABCDEF = vmaxq_f16(vacc6x89ABCDEF, vmin);
    vacc7x89ABCDEF = vmaxq_f16(vacc7x89ABCDEF, vmin);

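    // Store the tile. For a full 16-column slice, each row pointer advances
    // by cn_stride and `a` is rewound by ks so the same indirection entries
    // are replayed for the next slice.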
    if XNN_LIKELY(nc >= 16) {
      vst1q_f16(c7, vacc7x01234567);
      vst1q_f16(c7 + 8, vacc7x89ABCDEF);
      c7 = (__fp16*) ((uintptr_t) c7 + cn_stride);
      vst1q_f16(c6, vacc6x01234567);
      vst1q_f16(c6 + 8, vacc6x89ABCDEF);
      c6 = (__fp16*) ((uintptr_t) c6 + cn_stride);
      vst1q_f16(c5, vacc5x01234567);
      vst1q_f16(c5 + 8, vacc5x89ABCDEF);
      c5 = (__fp16*) ((uintptr_t) c5 + cn_stride);
      vst1q_f16(c4, vacc4x01234567);
      vst1q_f16(c4 + 8, vacc4x89ABCDEF);
      c4 = (__fp16*) ((uintptr_t) c4 + cn_stride);
      vst1q_f16(c3, vacc3x01234567);
      vst1q_f16(c3 + 8, vacc3x89ABCDEF);
      c3 = (__fp16*) ((uintptr_t) c3 + cn_stride);
      vst1q_f16(c2, vacc2x01234567);
      vst1q_f16(c2 + 8, vacc2x89ABCDEF);
      c2 = (__fp16*) ((uintptr_t) c2 + cn_stride);
      vst1q_f16(c1, vacc1x01234567);
      vst1q_f16(c1 + 8, vacc1x89ABCDEF);
      c1 = (__fp16*) ((uintptr_t) c1 + cn_stride);
      vst1q_f16(c0, vacc0x01234567);
      vst1q_f16(c0 + 8, vacc0x89ABCDEF);
      c0 = (__fp16*) ((uintptr_t) c0 + cn_stride);

      a = (const void**restrict) ((uintptr_t) a - ks);
      nc -= 16;
    } else {
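      // Partial final slice: write the remaining columns following the
      // binary decomposition of nc (8, then 4, then 2, then 1), shifting
      // the surviving lanes down after each store.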
      if (nc & 8) {
        vst1q_f16(c7, vacc7x01234567); c7 += 8;
        vst1q_f16(c6, vacc6x01234567); c6 += 8;
        vst1q_f16(c5, vacc5x01234567); c5 += 8;
        vst1q_f16(c4, vacc4x01234567); c4 += 8;
        vst1q_f16(c3, vacc3x01234567); c3 += 8;
        vst1q_f16(c2, vacc2x01234567); c2 += 8;
        vst1q_f16(c1, vacc1x01234567); c1 += 8;
        vst1q_f16(c0, vacc0x01234567); c0 += 8;

        vacc7x01234567 = vacc7x89ABCDEF;
        vacc6x01234567 = vacc6x89ABCDEF;
        vacc5x01234567 = vacc5x89ABCDEF;
        vacc4x01234567 = vacc4x89ABCDEF;
        vacc3x01234567 = vacc3x89ABCDEF;
        vacc2x01234567 = vacc2x89ABCDEF;
        vacc1x01234567 = vacc1x89ABCDEF;
        vacc0x01234567 = vacc0x89ABCDEF;
      }
      float16x4_t vacc7x0123 = vget_low_f16(vacc7x01234567);
      float16x4_t vacc6x0123 = vget_low_f16(vacc6x01234567);
      float16x4_t vacc5x0123 = vget_low_f16(vacc5x01234567);
      float16x4_t vacc4x0123 = vget_low_f16(vacc4x01234567);
      float16x4_t vacc3x0123 = vget_low_f16(vacc3x01234567);
      float16x4_t vacc2x0123 = vget_low_f16(vacc2x01234567);
      float16x4_t vacc1x0123 = vget_low_f16(vacc1x01234567);
      float16x4_t vacc0x0123 = vget_low_f16(vacc0x01234567);
      if (nc & 4) {
        vst1_f16(c7, vacc7x0123); c7 += 4;
        vst1_f16(c6, vacc6x0123); c6 += 4;
        vst1_f16(c5, vacc5x0123); c5 += 4;
        vst1_f16(c4, vacc4x0123); c4 += 4;
        vst1_f16(c3, vacc3x0123); c3 += 4;
        vst1_f16(c2, vacc2x0123); c2 += 4;
        vst1_f16(c1, vacc1x0123); c1 += 4;
        vst1_f16(c0, vacc0x0123); c0 += 4;

        vacc7x0123 = vget_high_f16(vacc7x01234567);
        vacc6x0123 = vget_high_f16(vacc6x01234567);
        vacc5x0123 = vget_high_f16(vacc5x01234567);
        vacc4x0123 = vget_high_f16(vacc4x01234567);
        vacc3x0123 = vget_high_f16(vacc3x01234567);
        vacc2x0123 = vget_high_f16(vacc2x01234567);
        vacc1x0123 = vget_high_f16(vacc1x01234567);
        vacc0x0123 = vget_high_f16(vacc0x01234567);
      }
      if (nc & 2) {
        vst1_lane_u32((void*) c7, vreinterpret_u32_f16(vacc7x0123), 0); c7 += 2;
        vst1_lane_u32((void*) c6, vreinterpret_u32_f16(vacc6x0123), 0); c6 += 2;
        vst1_lane_u32((void*) c5, vreinterpret_u32_f16(vacc5x0123), 0); c5 += 2;
        vst1_lane_u32((void*) c4, vreinterpret_u32_f16(vacc4x0123), 0); c4 += 2;
        vst1_lane_u32((void*) c3, vreinterpret_u32_f16(vacc3x0123), 0); c3 += 2;
        vst1_lane_u32((void*) c2, vreinterpret_u32_f16(vacc2x0123), 0); c2 += 2;
        vst1_lane_u32((void*) c1, vreinterpret_u32_f16(vacc1x0123), 0); c1 += 2;
        vst1_lane_u32((void*) c0, vreinterpret_u32_f16(vacc0x0123), 0); c0 += 2;

        vacc7x0123 = vext_f16(vacc7x0123, vacc7x0123, 2);
        vacc6x0123 = vext_f16(vacc6x0123, vacc6x0123, 2);
        vacc5x0123 = vext_f16(vacc5x0123, vacc5x0123, 2);
        vacc4x0123 = vext_f16(vacc4x0123, vacc4x0123, 2);
        vacc3x0123 = vext_f16(vacc3x0123, vacc3x0123, 2);
        vacc2x0123 = vext_f16(vacc2x0123, vacc2x0123, 2);
        vacc1x0123 = vext_f16(vacc1x0123, vacc1x0123, 2);
        vacc0x0123 = vext_f16(vacc0x0123, vacc0x0123, 2);
      }
      if (nc & 1) {
        vst1_lane_f16(c7, vacc7x0123, 0);
        vst1_lane_f16(c6, vacc6x0123, 0);
        vst1_lane_f16(c5, vacc5x0123, 0);
        vst1_lane_f16(c4, vacc4x0123, 0);
        vst1_lane_f16(c3, vacc3x0123, 0);
        vst1_lane_f16(c2, vacc2x0123, 0);
        vst1_lane_f16(c1, vacc1x0123, 0);
        vst1_lane_f16(c0, vacc0x0123, 0);
      }

      nc = 0;
    }
  } while (nc != 0);
}