// xref: /aosp_15_r20/external/XNNPACK/src/f32-gemm/gen/8x8s4-minmax-neon.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/neon-shuffle.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gemm.h>

xnn_f32_gemm_minmax_ukernel_8x8s4__neon(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_f32_gemm_minmax_ukernel_8x8s4__neon(
19     size_t mr,
20     size_t nc,
21     size_t kc,
22     const float* restrict a,
23     size_t a_stride,
24     const float* restrict w,
25     float* restrict c,
26     size_t cm_stride,
27     size_t cn_stride,
28     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
29 {
30   assert(mr != 0);
31   assert(mr <= 8);
32   assert(nc != 0);
33   assert(kc != 0);
34   assert(kc % sizeof(float) == 0);
35   assert(a != NULL);
36   assert(w != NULL);
37   assert(c != NULL);
38 
39   const float* a0 = a;
40   float* c0 = c;
41   const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
42   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
43   if XNN_UNPREDICTABLE(mr < 2) {
44     a1 = a0;
45     c1 = c0;
46   }
47   const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
48   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
49   if XNN_UNPREDICTABLE(mr <= 2) {
50     a2 = a1;
51     c2 = c1;
52   }
53   const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
54   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
55   if XNN_UNPREDICTABLE(mr < 4) {
56     a3 = a2;
57     c3 = c2;
58   }
59   const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
60   float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
61   if XNN_UNPREDICTABLE(mr <= 4) {
62     a4 = a3;
63     c4 = c3;
64   }
65   const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
66   float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
67   if XNN_UNPREDICTABLE(mr < 6) {
68     a5 = a4;
69     c5 = c4;
70   }
71   const float* a6 = (const float*) ((uintptr_t) a5 + a_stride);
72   float* c6 = (float*) ((uintptr_t) c5 + cm_stride);
73   if XNN_UNPREDICTABLE(mr <= 6) {
74     a6 = a5;
75     c6 = c5;
76   }
77   const float* a7 = (const float*) ((uintptr_t) a6 + a_stride);
78   float* c7 = (float*) ((uintptr_t) c6 + cm_stride);
79   if XNN_UNPREDICTABLE(mr != 8) {
80     a7 = a6;
81     c7 = c6;
82   }
83 
84   do {
85     float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
86     float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
87     float32x4_t vacc1x0123 = vacc0x0123;
88     float32x4_t vacc1x4567 = vacc0x4567;
89     float32x4_t vacc2x0123 = vacc0x0123;
90     float32x4_t vacc2x4567 = vacc0x4567;
91     float32x4_t vacc3x0123 = vacc0x0123;
92     float32x4_t vacc3x4567 = vacc0x4567;
93     float32x4_t vacc4x0123 = vacc0x0123;
94     float32x4_t vacc4x4567 = vacc0x4567;
95     float32x4_t vacc5x0123 = vacc0x0123;
96     float32x4_t vacc5x4567 = vacc0x4567;
97     float32x4_t vacc6x0123 = vacc0x0123;
98     float32x4_t vacc6x4567 = vacc0x4567;
99     float32x4_t vacc7x0123 = vacc0x0123;
100     float32x4_t vacc7x4567 = vacc0x4567;
101 
102     size_t k = kc;
103     while (k >= 4 * sizeof(float)) {
104       float32x4_t va0 = vld1q_f32(a0); a0 += 4;
105       float32x4_t va1 = vld1q_f32(a1); a1 += 4;
106       float32x4_t va2 = vld1q_f32(a2); a2 += 4;
107       float32x4_t va3 = vld1q_f32(a3); a3 += 4;
108       float32x4_t va4 = vld1q_f32(a4); a4 += 4;
109       float32x4_t va5 = vld1q_f32(a5); a5 += 4;
110       float32x4_t va6 = vld1q_f32(a6); a6 += 4;
111       float32x4_t va7 = vld1q_f32(a7); a7 += 4;
112 
113 
114       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
115       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
116 
117       vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c0);
118       vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c0);
119       vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c0);
120       vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c0);
121       vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c0);
122       vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0);
123       vacc6x0123 = vmlaq_f32(vacc6x0123, va6, vb0123c0);
124       vacc7x0123 = vmlaq_f32(vacc7x0123, va7, vb0123c0);
125       vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0);
126       vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567c0);
127       vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c0);
128       vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c0);
129       vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567c0);
130       vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0);
131       vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c0);
132       vacc7x4567 = vmlaq_f32(vacc7x4567, va7, vb4567c0);
133 
134       va0 = vextq_f32(va0, va0, 1);
135       va1 = vextq_f32(va1, va1, 1);
136       va2 = vextq_f32(va2, va2, 1);
137       va3 = vextq_f32(va3, va3, 1);
138       va4 = vextq_f32(va4, va4, 1);
139       va5 = vextq_f32(va5, va5, 1);
140       va6 = vextq_f32(va6, va6, 1);
141       va7 = vextq_f32(va7, va7, 1);
142 
143       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
144       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
145 
146       vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c1);
147       vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c1);
148       vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c1);
149       vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c1);
150       vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c1);
151       vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1);
152       vacc6x0123 = vmlaq_f32(vacc6x0123, va6, vb0123c1);
153       vacc7x0123 = vmlaq_f32(vacc7x0123, va7, vb0123c1);
154       vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1);
155       vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567c1);
156       vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c1);
157       vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c1);
158       vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567c1);
159       vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1);
160       vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c1);
161       vacc7x4567 = vmlaq_f32(vacc7x4567, va7, vb4567c1);
162 
163       va0 = vextq_f32(va0, va0, 1);
164       va1 = vextq_f32(va1, va1, 1);
165       va2 = vextq_f32(va2, va2, 1);
166       va3 = vextq_f32(va3, va3, 1);
167       va4 = vextq_f32(va4, va4, 1);
168       va5 = vextq_f32(va5, va5, 1);
169       va6 = vextq_f32(va6, va6, 1);
170       va7 = vextq_f32(va7, va7, 1);
171 
172       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
173       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
174 
175       vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
176       vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c2);
177       vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c2);
178       vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c2);
179       vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c2);
180       vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2);
181       vacc6x0123 = vmlaq_f32(vacc6x0123, va6, vb0123c2);
182       vacc7x0123 = vmlaq_f32(vacc7x0123, va7, vb0123c2);
183       vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2);
184       vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567c2);
185       vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c2);
186       vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c2);
187       vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567c2);
188       vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2);
189       vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c2);
190       vacc7x4567 = vmlaq_f32(vacc7x4567, va7, vb4567c2);
191 
192       va0 = vextq_f32(va0, va0, 1);
193       va1 = vextq_f32(va1, va1, 1);
194       va2 = vextq_f32(va2, va2, 1);
195       va3 = vextq_f32(va3, va3, 1);
196       va4 = vextq_f32(va4, va4, 1);
197       va5 = vextq_f32(va5, va5, 1);
198       va6 = vextq_f32(va6, va6, 1);
199       va7 = vextq_f32(va7, va7, 1);
200 
201       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
202       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
203 
204       vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c3);
205       vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123c3);
206       vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123c3);
207       vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123c3);
208       vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123c3);
209       vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c3);
210       vacc6x0123 = vmlaq_f32(vacc6x0123, va6, vb0123c3);
211       vacc7x0123 = vmlaq_f32(vacc7x0123, va7, vb0123c3);
212       vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c3);
213       vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567c3);
214       vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567c3);
215       vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567c3);
216       vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567c3);
217       vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c3);
218       vacc6x4567 = vmlaq_f32(vacc6x4567, va6, vb4567c3);
219       vacc7x4567 = vmlaq_f32(vacc7x4567, va7, vb4567c3);
220 
221 
222       k -= 4 * sizeof(float);
223     }
224     if XNN_UNLIKELY(k != 0) {
225       float32x4_t va0 = vld1q_f32(a0); a0 = (const float*) ((uintptr_t) a0 + k);
226       float32x4_t va1 = vld1q_f32(a1); a1 = (const float*) ((uintptr_t) a1 + k);
227       float32x4_t va2 = vld1q_f32(a2); a2 = (const float*) ((uintptr_t) a2 + k);
228       float32x4_t va3 = vld1q_f32(a3); a3 = (const float*) ((uintptr_t) a3 + k);
229       float32x4_t va4 = vld1q_f32(a4); a4 = (const float*) ((uintptr_t) a4 + k);
230       float32x4_t va5 = vld1q_f32(a5); a5 = (const float*) ((uintptr_t) a5 + k);
231       float32x4_t va6 = vld1q_f32(a6); a6 = (const float*) ((uintptr_t) a6 + k);
232       float32x4_t va7 = vld1q_f32(a7); a7 = (const float*) ((uintptr_t) a7 + k);
233 
234 
235       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
236       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
237 
238       const float32x4_t vmska0x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
239       vacc0x0123 = vmlaq_f32(vacc0x0123, vmska0x0123c0, vb0123c0);
240       const float32x4_t vmska1x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
241       vacc1x0123 = vmlaq_f32(vacc1x0123, vmska1x0123c0, vb0123c0);
242       const float32x4_t vmska2x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
243       vacc2x0123 = vmlaq_f32(vacc2x0123, vmska2x0123c0, vb0123c0);
244       const float32x4_t vmska3x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
245       vacc3x0123 = vmlaq_f32(vacc3x0123, vmska3x0123c0, vb0123c0);
246       const float32x4_t vmska4x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
247       vacc4x0123 = vmlaq_f32(vacc4x0123, vmska4x0123c0, vb0123c0);
248       const float32x4_t vmska5x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
249       vacc5x0123 = vmlaq_f32(vacc5x0123, vmska5x0123c0, vb0123c0);
250       const float32x4_t vmska6x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
251       vacc6x0123 = vmlaq_f32(vacc6x0123, vmska6x0123c0, vb0123c0);
252       const float32x4_t vmska7x0123c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb0123c0, vmovq_n_f32(0.0f))));
253       vacc7x0123 = vmlaq_f32(vacc7x0123, vmska7x0123c0, vb0123c0);
254       const float32x4_t vmska0x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
255       vacc0x4567 = vmlaq_f32(vacc0x4567, vmska0x4567c0, vb4567c0);
256       const float32x4_t vmska1x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
257       vacc1x4567 = vmlaq_f32(vacc1x4567, vmska1x4567c0, vb4567c0);
258       const float32x4_t vmska2x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
259       vacc2x4567 = vmlaq_f32(vacc2x4567, vmska2x4567c0, vb4567c0);
260       const float32x4_t vmska3x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
261       vacc3x4567 = vmlaq_f32(vacc3x4567, vmska3x4567c0, vb4567c0);
262       const float32x4_t vmska4x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
263       vacc4x4567 = vmlaq_f32(vacc4x4567, vmska4x4567c0, vb4567c0);
264       const float32x4_t vmska5x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
265       vacc5x4567 = vmlaq_f32(vacc5x4567, vmska5x4567c0, vb4567c0);
266       const float32x4_t vmska6x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
267       vacc6x4567 = vmlaq_f32(vacc6x4567, vmska6x4567c0, vb4567c0);
268       const float32x4_t vmska7x4567c0 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb4567c0, vmovq_n_f32(0.0f))));
269       vacc7x4567 = vmlaq_f32(vacc7x4567, vmska7x4567c0, vb4567c0);
270 
271       va0 = vextq_f32(va0, va0, 1);
272       va1 = vextq_f32(va1, va1, 1);
273       va2 = vextq_f32(va2, va2, 1);
274       va3 = vextq_f32(va3, va3, 1);
275       va4 = vextq_f32(va4, va4, 1);
276       va5 = vextq_f32(va5, va5, 1);
277       va6 = vextq_f32(va6, va6, 1);
278       va7 = vextq_f32(va7, va7, 1);
279 
280       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
281       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
282 
283       const float32x4_t vmska0x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
284       vacc0x0123 = vmlaq_f32(vacc0x0123, vmska0x0123c1, vb0123c1);
285       const float32x4_t vmska1x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
286       vacc1x0123 = vmlaq_f32(vacc1x0123, vmska1x0123c1, vb0123c1);
287       const float32x4_t vmska2x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
288       vacc2x0123 = vmlaq_f32(vacc2x0123, vmska2x0123c1, vb0123c1);
289       const float32x4_t vmska3x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
290       vacc3x0123 = vmlaq_f32(vacc3x0123, vmska3x0123c1, vb0123c1);
291       const float32x4_t vmska4x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
292       vacc4x0123 = vmlaq_f32(vacc4x0123, vmska4x0123c1, vb0123c1);
293       const float32x4_t vmska5x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
294       vacc5x0123 = vmlaq_f32(vacc5x0123, vmska5x0123c1, vb0123c1);
295       const float32x4_t vmska6x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
296       vacc6x0123 = vmlaq_f32(vacc6x0123, vmska6x0123c1, vb0123c1);
297       const float32x4_t vmska7x0123c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb0123c1, vmovq_n_f32(0.0f))));
298       vacc7x0123 = vmlaq_f32(vacc7x0123, vmska7x0123c1, vb0123c1);
299       const float32x4_t vmska0x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
300       vacc0x4567 = vmlaq_f32(vacc0x4567, vmska0x4567c1, vb4567c1);
301       const float32x4_t vmska1x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
302       vacc1x4567 = vmlaq_f32(vacc1x4567, vmska1x4567c1, vb4567c1);
303       const float32x4_t vmska2x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
304       vacc2x4567 = vmlaq_f32(vacc2x4567, vmska2x4567c1, vb4567c1);
305       const float32x4_t vmska3x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
306       vacc3x4567 = vmlaq_f32(vacc3x4567, vmska3x4567c1, vb4567c1);
307       const float32x4_t vmska4x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
308       vacc4x4567 = vmlaq_f32(vacc4x4567, vmska4x4567c1, vb4567c1);
309       const float32x4_t vmska5x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
310       vacc5x4567 = vmlaq_f32(vacc5x4567, vmska5x4567c1, vb4567c1);
311       const float32x4_t vmska6x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
312       vacc6x4567 = vmlaq_f32(vacc6x4567, vmska6x4567c1, vb4567c1);
313       const float32x4_t vmska7x4567c1 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb4567c1, vmovq_n_f32(0.0f))));
314       vacc7x4567 = vmlaq_f32(vacc7x4567, vmska7x4567c1, vb4567c1);
315 
316       va0 = vextq_f32(va0, va0, 1);
317       va1 = vextq_f32(va1, va1, 1);
318       va2 = vextq_f32(va2, va2, 1);
319       va3 = vextq_f32(va3, va3, 1);
320       va4 = vextq_f32(va4, va4, 1);
321       va5 = vextq_f32(va5, va5, 1);
322       va6 = vextq_f32(va6, va6, 1);
323       va7 = vextq_f32(va7, va7, 1);
324 
325       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
326       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
327 
328       const float32x4_t vmska0x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
329       vacc0x0123 = vmlaq_f32(vacc0x0123, vmska0x0123c2, vb0123c2);
330       const float32x4_t vmska1x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
331       vacc1x0123 = vmlaq_f32(vacc1x0123, vmska1x0123c2, vb0123c2);
332       const float32x4_t vmska2x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
333       vacc2x0123 = vmlaq_f32(vacc2x0123, vmska2x0123c2, vb0123c2);
334       const float32x4_t vmska3x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
335       vacc3x0123 = vmlaq_f32(vacc3x0123, vmska3x0123c2, vb0123c2);
336       const float32x4_t vmska4x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
337       vacc4x0123 = vmlaq_f32(vacc4x0123, vmska4x0123c2, vb0123c2);
338       const float32x4_t vmska5x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
339       vacc5x0123 = vmlaq_f32(vacc5x0123, vmska5x0123c2, vb0123c2);
340       const float32x4_t vmska6x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
341       vacc6x0123 = vmlaq_f32(vacc6x0123, vmska6x0123c2, vb0123c2);
342       const float32x4_t vmska7x0123c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb0123c2, vmovq_n_f32(0.0f))));
343       vacc7x0123 = vmlaq_f32(vacc7x0123, vmska7x0123c2, vb0123c2);
344       const float32x4_t vmska0x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
345       vacc0x4567 = vmlaq_f32(vacc0x4567, vmska0x4567c2, vb4567c2);
346       const float32x4_t vmska1x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
347       vacc1x4567 = vmlaq_f32(vacc1x4567, vmska1x4567c2, vb4567c2);
348       const float32x4_t vmska2x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
349       vacc2x4567 = vmlaq_f32(vacc2x4567, vmska2x4567c2, vb4567c2);
350       const float32x4_t vmska3x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
351       vacc3x4567 = vmlaq_f32(vacc3x4567, vmska3x4567c2, vb4567c2);
352       const float32x4_t vmska4x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
353       vacc4x4567 = vmlaq_f32(vacc4x4567, vmska4x4567c2, vb4567c2);
354       const float32x4_t vmska5x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
355       vacc5x4567 = vmlaq_f32(vacc5x4567, vmska5x4567c2, vb4567c2);
356       const float32x4_t vmska6x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
357       vacc6x4567 = vmlaq_f32(vacc6x4567, vmska6x4567c2, vb4567c2);
358       const float32x4_t vmska7x4567c2 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb4567c2, vmovq_n_f32(0.0f))));
359       vacc7x4567 = vmlaq_f32(vacc7x4567, vmska7x4567c2, vb4567c2);
360 
361       va0 = vextq_f32(va0, va0, 1);
362       va1 = vextq_f32(va1, va1, 1);
363       va2 = vextq_f32(va2, va2, 1);
364       va3 = vextq_f32(va3, va3, 1);
365       va4 = vextq_f32(va4, va4, 1);
366       va5 = vextq_f32(va5, va5, 1);
367       va6 = vextq_f32(va6, va6, 1);
368       va7 = vextq_f32(va7, va7, 1);
369 
370       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
371       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
372 
373       const float32x4_t vmska0x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
374       vacc0x0123 = vmlaq_f32(vacc0x0123, vmska0x0123c3, vb0123c3);
375       const float32x4_t vmska1x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
376       vacc1x0123 = vmlaq_f32(vacc1x0123, vmska1x0123c3, vb0123c3);
377       const float32x4_t vmska2x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
378       vacc2x0123 = vmlaq_f32(vacc2x0123, vmska2x0123c3, vb0123c3);
379       const float32x4_t vmska3x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
380       vacc3x0123 = vmlaq_f32(vacc3x0123, vmska3x0123c3, vb0123c3);
381       const float32x4_t vmska4x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
382       vacc4x0123 = vmlaq_f32(vacc4x0123, vmska4x0123c3, vb0123c3);
383       const float32x4_t vmska5x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
384       vacc5x0123 = vmlaq_f32(vacc5x0123, vmska5x0123c3, vb0123c3);
385       const float32x4_t vmska6x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
386       vacc6x0123 = vmlaq_f32(vacc6x0123, vmska6x0123c3, vb0123c3);
387       const float32x4_t vmska7x0123c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb0123c3, vmovq_n_f32(0.0f))));
388       vacc7x0123 = vmlaq_f32(vacc7x0123, vmska7x0123c3, vb0123c3);
389       const float32x4_t vmska0x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va0), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
390       vacc0x4567 = vmlaq_f32(vacc0x4567, vmska0x4567c3, vb4567c3);
391       const float32x4_t vmska1x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va1), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
392       vacc1x4567 = vmlaq_f32(vacc1x4567, vmska1x4567c3, vb4567c3);
393       const float32x4_t vmska2x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va2), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
394       vacc2x4567 = vmlaq_f32(vacc2x4567, vmska2x4567c3, vb4567c3);
395       const float32x4_t vmska3x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va3), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
396       vacc3x4567 = vmlaq_f32(vacc3x4567, vmska3x4567c3, vb4567c3);
397       const float32x4_t vmska4x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va4), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
398       vacc4x4567 = vmlaq_f32(vacc4x4567, vmska4x4567c3, vb4567c3);
399       const float32x4_t vmska5x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va5), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
400       vacc5x4567 = vmlaq_f32(vacc5x4567, vmska5x4567c3, vb4567c3);
401       const float32x4_t vmska6x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va6), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
402       vacc6x4567 = vmlaq_f32(vacc6x4567, vmska6x4567c3, vb4567c3);
403       const float32x4_t vmska7x4567c3 = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(va7), vceqq_f32(vb4567c3, vmovq_n_f32(0.0f))));
404       vacc7x4567 = vmlaq_f32(vacc7x4567, vmska7x4567c3, vb4567c3);
405 
406     }
407     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
408     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
409     vacc1x0123 = vminq_f32(vacc1x0123, vmax);
410     vacc2x0123 = vminq_f32(vacc2x0123, vmax);
411     vacc3x0123 = vminq_f32(vacc3x0123, vmax);
412     vacc4x0123 = vminq_f32(vacc4x0123, vmax);
413     vacc5x0123 = vminq_f32(vacc5x0123, vmax);
414     vacc6x0123 = vminq_f32(vacc6x0123, vmax);
415     vacc7x0123 = vminq_f32(vacc7x0123, vmax);
416     vacc0x4567 = vminq_f32(vacc0x4567, vmax);
417     vacc1x4567 = vminq_f32(vacc1x4567, vmax);
418     vacc2x4567 = vminq_f32(vacc2x4567, vmax);
419     vacc3x4567 = vminq_f32(vacc3x4567, vmax);
420     vacc4x4567 = vminq_f32(vacc4x4567, vmax);
421     vacc5x4567 = vminq_f32(vacc5x4567, vmax);
422     vacc6x4567 = vminq_f32(vacc6x4567, vmax);
423     vacc7x4567 = vminq_f32(vacc7x4567, vmax);
424 
425     const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
426     vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
427     vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
428     vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
429     vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
430     vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
431     vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
432     vacc6x0123 = vmaxq_f32(vacc6x0123, vmin);
433     vacc7x0123 = vmaxq_f32(vacc7x0123, vmin);
434     vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
435     vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
436     vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
437     vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
438     vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
439     vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
440     vacc6x4567 = vmaxq_f32(vacc6x4567, vmin);
441     vacc7x4567 = vmaxq_f32(vacc7x4567, vmin);
442 
443     if XNN_LIKELY(nc >= 8) {
444       vst1q_f32(c7, vacc7x0123);
445       vst1q_f32(c7 + 4, vacc7x4567);
446       c7 = (float*) ((uintptr_t) c7 + cn_stride);
447       vst1q_f32(c6, vacc6x0123);
448       vst1q_f32(c6 + 4, vacc6x4567);
449       c6 = (float*) ((uintptr_t) c6 + cn_stride);
450       vst1q_f32(c5, vacc5x0123);
451       vst1q_f32(c5 + 4, vacc5x4567);
452       c5 = (float*) ((uintptr_t) c5 + cn_stride);
453       vst1q_f32(c4, vacc4x0123);
454       vst1q_f32(c4 + 4, vacc4x4567);
455       c4 = (float*) ((uintptr_t) c4 + cn_stride);
456       vst1q_f32(c3, vacc3x0123);
457       vst1q_f32(c3 + 4, vacc3x4567);
458       c3 = (float*) ((uintptr_t) c3 + cn_stride);
459       vst1q_f32(c2, vacc2x0123);
460       vst1q_f32(c2 + 4, vacc2x4567);
461       c2 = (float*) ((uintptr_t) c2 + cn_stride);
462       vst1q_f32(c1, vacc1x0123);
463       vst1q_f32(c1 + 4, vacc1x4567);
464       c1 = (float*) ((uintptr_t) c1 + cn_stride);
465       vst1q_f32(c0, vacc0x0123);
466       vst1q_f32(c0 + 4, vacc0x4567);
467       c0 = (float*) ((uintptr_t) c0 + cn_stride);
468 
469       a7 = (const float*) ((uintptr_t) a7 - kc);
470       a6 = (const float*) ((uintptr_t) a6 - kc);
471       a5 = (const float*) ((uintptr_t) a5 - kc);
472       a4 = (const float*) ((uintptr_t) a4 - kc);
473       a3 = (const float*) ((uintptr_t) a3 - kc);
474       a2 = (const float*) ((uintptr_t) a2 - kc);
475       a1 = (const float*) ((uintptr_t) a1 - kc);
476       a0 = (const float*) ((uintptr_t) a0 - kc);
477 
478       nc -= 8;
479 
480     } else {
481       if (nc & 4) {
482         vst1q_f32(c7, vacc7x0123); c7 += 4;
483         vst1q_f32(c6, vacc6x0123); c6 += 4;
484         vst1q_f32(c5, vacc5x0123); c5 += 4;
485         vst1q_f32(c4, vacc4x0123); c4 += 4;
486         vst1q_f32(c3, vacc3x0123); c3 += 4;
487         vst1q_f32(c2, vacc2x0123); c2 += 4;
488         vst1q_f32(c1, vacc1x0123); c1 += 4;
489         vst1q_f32(c0, vacc0x0123); c0 += 4;
490 
491         vacc7x0123 = vacc7x4567;
492         vacc6x0123 = vacc6x4567;
493         vacc5x0123 = vacc5x4567;
494         vacc4x0123 = vacc4x4567;
495         vacc3x0123 = vacc3x4567;
496         vacc2x0123 = vacc2x4567;
497         vacc1x0123 = vacc1x4567;
498         vacc0x0123 = vacc0x4567;
499       }
500       float32x2_t vacc7x01 = vget_low_f32(vacc7x0123);
501       float32x2_t vacc6x01 = vget_low_f32(vacc6x0123);
502       float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
503       float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
504       float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
505       float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
506       float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
507       float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
508       if (nc & 2) {
509         vst1_f32(c7, vacc7x01); c7 += 2;
510         vst1_f32(c6, vacc6x01); c6 += 2;
511         vst1_f32(c5, vacc5x01); c5 += 2;
512         vst1_f32(c4, vacc4x01); c4 += 2;
513         vst1_f32(c3, vacc3x01); c3 += 2;
514         vst1_f32(c2, vacc2x01); c2 += 2;
515         vst1_f32(c1, vacc1x01); c1 += 2;
516         vst1_f32(c0, vacc0x01); c0 += 2;
517 
518         vacc7x01 = vget_high_f32(vacc7x0123);
519         vacc6x01 = vget_high_f32(vacc6x0123);
520         vacc5x01 = vget_high_f32(vacc5x0123);
521         vacc4x01 = vget_high_f32(vacc4x0123);
522         vacc3x01 = vget_high_f32(vacc3x0123);
523         vacc2x01 = vget_high_f32(vacc2x0123);
524         vacc1x01 = vget_high_f32(vacc1x0123);
525         vacc0x01 = vget_high_f32(vacc0x0123);
526       }
527       if (nc & 1) {
528         vst1_lane_f32(c7, vacc7x01, 0);
529         vst1_lane_f32(c6, vacc6x01, 0);
530         vst1_lane_f32(c5, vacc5x01, 0);
531         vst1_lane_f32(c4, vacc4x01, 0);
532         vst1_lane_f32(c3, vacc3x01, 0);
533         vst1_lane_f32(c2, vacc2x01, 0);
534         vst1_lane_f32(c1, vacc1x01, 0);
535         vst1_lane_f32(c0, vacc0x01, 0);
536       }
537 
538       nc = 0;
539     }
540   } while (nc != 0);
541 }
542