// xref: /aosp_15_r20/external/XNNPACK/src/f32-spmm/gen/32x2-minmax-neonfma.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

// Sparse-matrix x dense-matrix multiply (SpMM) micro-kernel with min/max
// clamping, MR=32 / NR=2 blocking, using AArch64 NEON FMA intrinsics.
//
//   mc            - number of dense (input-image) elements to process, in BYTES
//                   (must be a non-zero multiple of sizeof(float)).
//   nc            - number of output channels (rows of the sparse weight
//                   matrix); must be non-zero.
//   input         - dense input; traversed via byte-offset deltas in widx_dmap.
//   weights       - packed nonzero weights, each channel prefixed by its bias.
//   widx_dmap     - per-nonzero BYTE deltas to advance the input pointer.
//   nidx_nnzmap   - per-channel (per pair of channels in the NR=2 loop)
//                   nonzero counts.
//   output        - output, one row per channel, rows output_stride bytes apart.
//   params        - scalar min/max clamping bounds.
void xnn_f32_spmm_minmax_ukernel_32x2__neonfma(
    size_t mc,
    size_t nc,
    const float*restrict input,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mc != 0);
  assert(mc % sizeof(float) == 0);
  assert(nc != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  // After walking all nc channels, output points one full "column block" past
  // the start; subtracting output_decrement rewinds it to the next 32 floats.
  size_t output_decrement = output_stride * nc - 32 * sizeof(float);
  // Main loop: full tiles of 32 input elements.
  while XNN_LIKELY(mc >= 32 * sizeof(float)) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t n = nc;
    // Process output channels in pairs (NR=2): shared input loads, two
    // accumulator sets fed from the two lanes of a single float32x2_t weight.
    while (n >= 2) {
      uint32_t nnz = *nnzmap++;
      // Initialize all 8 accumulators of each channel with that channel's bias.
      float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n0 = vacc0123n0;
      float32x4_t vacc89ABn0 = vacc0123n0;
      float32x4_t vaccCDEFn0 = vacc0123n0;
      float32x4_t vaccGHIJn0 = vacc0123n0;
      float32x4_t vaccKLMNn0 = vacc0123n0;
      float32x4_t vaccOPQRn0 = vacc0123n0;
      float32x4_t vaccSTUVn0 = vacc0123n0;
      float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567n1 = vacc0123n1;
      float32x4_t vacc89ABn1 = vacc0123n1;
      float32x4_t vaccCDEFn1 = vacc0123n1;
      float32x4_t vaccGHIJn1 = vacc0123n1;
      float32x4_t vaccKLMNn1 = vacc0123n1;
      float32x4_t vaccOPQRn1 = vacc0123n1;
      float32x4_t vaccSTUVn1 = vacc0123n1;
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const float32x4_t vi0123 = vld1q_f32(input);
          const float32x4_t vi4567 = vld1q_f32(input + 4);
          const float32x4_t vi89AB = vld1q_f32(input + 8);
          const float32x4_t viCDEF = vld1q_f32(input + 12);
          const float32x4_t viGHIJ = vld1q_f32(input + 16);
          const float32x4_t viKLMN = vld1q_f32(input + 20);
          const float32x4_t viOPQR = vld1q_f32(input + 24);
          const float32x4_t viSTUV = vld1q_f32(input + 28);
          // Advance input by the packed byte delta to the next nonzero's row.
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
          __builtin_prefetch(input + 16);
          __builtin_prefetch(input + 32);
          // One weight per channel of the pair: lane 0 -> n0, lane 1 -> n1.
          const float32x2_t vw = vld1_f32(w); w += 2;
          __builtin_prefetch(w + 32);
          vacc0123n0 = vfmaq_lane_f32(vacc0123n0, vi0123, vw, 0);
          vacc4567n0 = vfmaq_lane_f32(vacc4567n0, vi4567, vw, 0);
          vacc89ABn0 = vfmaq_lane_f32(vacc89ABn0, vi89AB, vw, 0);
          vaccCDEFn0 = vfmaq_lane_f32(vaccCDEFn0, viCDEF, vw, 0);
          vaccGHIJn0 = vfmaq_lane_f32(vaccGHIJn0, viGHIJ, vw, 0);
          vaccKLMNn0 = vfmaq_lane_f32(vaccKLMNn0, viKLMN, vw, 0);
          vaccOPQRn0 = vfmaq_lane_f32(vaccOPQRn0, viOPQR, vw, 0);
          vaccSTUVn0 = vfmaq_lane_f32(vaccSTUVn0, viSTUV, vw, 0);
          vacc0123n1 = vfmaq_lane_f32(vacc0123n1, vi0123, vw, 1);
          vacc4567n1 = vfmaq_lane_f32(vacc4567n1, vi4567, vw, 1);
          vacc89ABn1 = vfmaq_lane_f32(vacc89ABn1, vi89AB, vw, 1);
          vaccCDEFn1 = vfmaq_lane_f32(vaccCDEFn1, viCDEF, vw, 1);
          vaccGHIJn1 = vfmaq_lane_f32(vaccGHIJn1, viGHIJ, vw, 1);
          vaccKLMNn1 = vfmaq_lane_f32(vaccKLMNn1, viKLMN, vw, 1);
          vaccOPQRn1 = vfmaq_lane_f32(vaccOPQRn1, viOPQR, vw, 1);
          vaccSTUVn1 = vfmaq_lane_f32(vaccSTUVn1, viSTUV, vw, 1);
        } while (--nnz != 0);
      }
      // Clamp to [min, max] (max first, then min — order is irrelevant for
      // valid params since min <= max).
      float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
      float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
      float32x4_t vout89ABn0 = vminq_f32(vacc89ABn0, vmax);
      float32x4_t voutCDEFn0 = vminq_f32(vaccCDEFn0, vmax);
      float32x4_t voutGHIJn0 = vminq_f32(vaccGHIJn0, vmax);
      float32x4_t voutKLMNn0 = vminq_f32(vaccKLMNn0, vmax);
      float32x4_t voutOPQRn0 = vminq_f32(vaccOPQRn0, vmax);
      float32x4_t voutSTUVn0 = vminq_f32(vaccSTUVn0, vmax);
      float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
      float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
      float32x4_t vout89ABn1 = vminq_f32(vacc89ABn1, vmax);
      float32x4_t voutCDEFn1 = vminq_f32(vaccCDEFn1, vmax);
      float32x4_t voutGHIJn1 = vminq_f32(vaccGHIJn1, vmax);
      float32x4_t voutKLMNn1 = vminq_f32(vaccKLMNn1, vmax);
      float32x4_t voutOPQRn1 = vminq_f32(vaccOPQRn1, vmax);
      float32x4_t voutSTUVn1 = vminq_f32(vaccSTUVn1, vmax);

      vout0123n0 = vmaxq_f32(vout0123n0, vmin);
      vout4567n0 = vmaxq_f32(vout4567n0, vmin);
      vout89ABn0 = vmaxq_f32(vout89ABn0, vmin);
      voutCDEFn0 = vmaxq_f32(voutCDEFn0, vmin);
      voutGHIJn0 = vmaxq_f32(voutGHIJn0, vmin);
      voutKLMNn0 = vmaxq_f32(voutKLMNn0, vmin);
      voutOPQRn0 = vmaxq_f32(voutOPQRn0, vmin);
      voutSTUVn0 = vmaxq_f32(voutSTUVn0, vmin);
      vout0123n1 = vmaxq_f32(vout0123n1, vmin);
      vout4567n1 = vmaxq_f32(vout4567n1, vmin);
      vout89ABn1 = vmaxq_f32(vout89ABn1, vmin);
      voutCDEFn1 = vmaxq_f32(voutCDEFn1, vmin);
      voutGHIJn1 = vmaxq_f32(voutGHIJn1, vmin);
      voutKLMNn1 = vmaxq_f32(voutKLMNn1, vmin);
      voutOPQRn1 = vmaxq_f32(voutOPQRn1, vmin);
      voutSTUVn1 = vmaxq_f32(voutSTUVn1, vmin);

      vst1q_f32(output + 0, vout0123n0);
      vst1q_f32(output + 4, vout4567n0);
      vst1q_f32(output + 8, vout89ABn0);
      vst1q_f32(output + 12, voutCDEFn0);
      vst1q_f32(output + 16, voutGHIJn0);
      vst1q_f32(output + 20, voutKLMNn0);
      vst1q_f32(output + 24, voutOPQRn0);
      vst1q_f32(output + 28, voutSTUVn0);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      vst1q_f32(output + 0, vout0123n1);
      vst1q_f32(output + 4, vout4567n1);
      vst1q_f32(output + 8, vout89ABn1);
      vst1q_f32(output + 12, voutCDEFn1);
      vst1q_f32(output + 16, voutGHIJn1);
      vst1q_f32(output + 20, voutKLMNn1);
      vst1q_f32(output + 24, voutOPQRn1);
      vst1q_f32(output + 28, voutSTUVn1);
      output = (float*restrict) ((uintptr_t) output + output_stride);
      n -= 2;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(n != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        float32x4_t vacc89AB = vacc0123;
        float32x4_t vaccCDEF = vacc0123;
        float32x4_t vaccGHIJ = vacc0123;
        float32x4_t vaccKLMN = vacc0123;
        float32x4_t vaccOPQR = vacc0123;
        float32x4_t vaccSTUV = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            const float32x4_t vi89AB = vld1q_f32(input + 8);
            const float32x4_t viCDEF = vld1q_f32(input + 12);
            const float32x4_t viGHIJ = vld1q_f32(input + 16);
            const float32x4_t viKLMN = vld1q_f32(input + 20);
            const float32x4_t viOPQR = vld1q_f32(input + 24);
            const float32x4_t viSTUV = vld1q_f32(input + 28);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            __builtin_prefetch(input + 16);
            __builtin_prefetch(input + 32);
            const float32x4_t vw = vld1q_dup_f32(w); w += 1;
            __builtin_prefetch(w + 32);
            vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
            vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw);
            vaccGHIJ = vfmaq_f32(vaccGHIJ, viGHIJ, vw);
            vaccKLMN = vfmaq_f32(vaccKLMN, viKLMN, vw);
            vaccOPQR = vfmaq_f32(vaccOPQR, viOPQR, vw);
            vaccSTUV = vfmaq_f32(vaccSTUV, viSTUV, vw);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
        float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
        float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);
        float32x4_t voutGHIJ = vminq_f32(vaccGHIJ, vmax);
        float32x4_t voutKLMN = vminq_f32(vaccKLMN, vmax);
        float32x4_t voutOPQR = vminq_f32(vaccOPQR, vmax);
        float32x4_t voutSTUV = vminq_f32(vaccSTUV, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);
        vout89AB = vmaxq_f32(vout89AB, vmin);
        voutCDEF = vmaxq_f32(voutCDEF, vmin);
        voutGHIJ = vmaxq_f32(voutGHIJ, vmin);
        voutKLMN = vmaxq_f32(voutKLMN, vmin);
        voutOPQR = vmaxq_f32(voutOPQR, vmin);
        voutSTUV = vmaxq_f32(voutSTUV, vmin);

        vst1q_f32(output + 0, vout0123);
        vst1q_f32(output + 4, vout4567);
        vst1q_f32(output + 8, vout89AB);
        vst1q_f32(output + 12, voutCDEF);
        vst1q_f32(output + 16, voutGHIJ);
        vst1q_f32(output + 20, voutKLMN);
        vst1q_f32(output + 24, voutOPQR);
        vst1q_f32(output + 28, voutSTUV);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 1;
      } while (n != 0);
    }
    output = (float*restrict) ((uintptr_t) output - output_decrement);
    input += 32;
    mc -= 32 * sizeof(float);
  }
  // Remainder: handle the leftover (< 32) input elements with progressively
  // narrower tiles of 16, 8, 4, 2 and 1 elements.
  if XNN_UNLIKELY(mc != 0) {
    output_decrement += 16 * sizeof(float);
    if (mc & (16 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n0 = vacc0123n0;
        float32x4_t vacc89ABn0 = vacc0123n0;
        float32x4_t vaccCDEFn0 = vacc0123n0;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n1 = vacc0123n1;
        float32x4_t vacc89ABn1 = vacc0123n1;
        float32x4_t vaccCDEFn1 = vacc0123n1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            const float32x4_t vi89AB = vld1q_f32(input + 8);
            const float32x4_t viCDEF = vld1q_f32(input + 12);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x2_t vw = vld1_f32(w); w += 2;

            vacc0123n0 = vfmaq_lane_f32(vacc0123n0, vi0123, vw, 0);
            vacc4567n0 = vfmaq_lane_f32(vacc4567n0, vi4567, vw, 0);
            vacc89ABn0 = vfmaq_lane_f32(vacc89ABn0, vi89AB, vw, 0);
            vaccCDEFn0 = vfmaq_lane_f32(vaccCDEFn0, viCDEF, vw, 0);
            vacc0123n1 = vfmaq_lane_f32(vacc0123n1, vi0123, vw, 1);
            vacc4567n1 = vfmaq_lane_f32(vacc4567n1, vi4567, vw, 1);
            vacc89ABn1 = vfmaq_lane_f32(vacc89ABn1, vi89AB, vw, 1);
            vaccCDEFn1 = vfmaq_lane_f32(vaccCDEFn1, viCDEF, vw, 1);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
        float32x4_t vout89ABn0 = vminq_f32(vacc89ABn0, vmax);
        float32x4_t voutCDEFn0 = vminq_f32(vaccCDEFn0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);
        float32x4_t vout89ABn1 = vminq_f32(vacc89ABn1, vmax);
        float32x4_t voutCDEFn1 = vminq_f32(vaccCDEFn1, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout4567n0 = vmaxq_f32(vout4567n0, vmin);
        vout89ABn0 = vmaxq_f32(vout89ABn0, vmin);
        voutCDEFn0 = vmaxq_f32(voutCDEFn0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout4567n1 = vmaxq_f32(vout4567n1, vmin);
        vout89ABn1 = vmaxq_f32(vout89ABn1, vmin);
        voutCDEFn1 = vmaxq_f32(voutCDEFn1, vmin);

        vst1q_f32(output + 0, vout0123n0);
        vst1q_f32(output + 4, vout4567n0);
        vst1q_f32(output + 8, vout89ABn0);
        vst1q_f32(output + 12, voutCDEFn0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        vst1q_f32(output + 4, vout4567n1);
        vst1q_f32(output + 8, vout89ABn1);
        vst1q_f32(output + 12, voutCDEFn1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          float32x4_t vacc89AB = vacc0123;
          float32x4_t vaccCDEF = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              const float32x4_t vi4567 = vld1q_f32(input + 4);
              const float32x4_t vi89AB = vld1q_f32(input + 8);
              const float32x4_t viCDEF = vld1q_f32(input + 12);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
              vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
              vacc89AB = vfmaq_f32(vacc89AB, vi89AB, vw);
              vaccCDEF = vfmaq_f32(vaccCDEF, viCDEF, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
          float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
          float32x4_t voutCDEF = vminq_f32(vaccCDEF, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);
          vout89AB = vmaxq_f32(vout89AB, vmin);
          voutCDEF = vmaxq_f32(voutCDEF, vmin);

          vst1q_f32(output + 0, vout0123);
          vst1q_f32(output + 4, vout4567);
          vst1q_f32(output + 8, vout89AB);
          vst1q_f32(output + 12, voutCDEF);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 16;
    }
    output_decrement += 8 * sizeof(float);
    if (mc & (8 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n0 = vacc0123n0;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567n1 = vacc0123n1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            const float32x4_t vi4567 = vld1q_f32(input + 4);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x2_t vw = vld1_f32(w); w += 2;

            vacc0123n0 = vfmaq_lane_f32(vacc0123n0, vi0123, vw, 0);
            vacc4567n0 = vfmaq_lane_f32(vacc4567n0, vi4567, vw, 0);
            vacc0123n1 = vfmaq_lane_f32(vacc0123n1, vi0123, vw, 1);
            vacc4567n1 = vfmaq_lane_f32(vacc4567n1, vi4567, vw, 1);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout4567n0 = vminq_f32(vacc4567n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);
        float32x4_t vout4567n1 = vminq_f32(vacc4567n1, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout4567n0 = vmaxq_f32(vout4567n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);
        vout4567n1 = vmaxq_f32(vout4567n1, vmin);

        vst1q_f32(output + 0, vout0123n0);
        vst1q_f32(output + 4, vout4567n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        vst1q_f32(output + 4, vout4567n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          float32x4_t vacc4567 = vacc0123;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              const float32x4_t vi4567 = vld1q_f32(input + 4);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
              vacc4567 = vfmaq_f32(vacc4567, vi4567, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
          float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);
          vout4567 = vmaxq_f32(vout4567, vmin);

          vst1q_f32(output + 0, vout0123);
          vst1q_f32(output + 4, vout4567);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 8;
    }
    output_decrement += 4 * sizeof(float);
    if (mc & (4 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123n0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123n1 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t vi0123 = vld1q_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x2_t vw = vld1_f32(w); w += 2;

            vacc0123n0 = vfmaq_lane_f32(vacc0123n0, vi0123, vw, 0);
            vacc0123n1 = vfmaq_lane_f32(vacc0123n1, vi0123, vw, 1);
          } while (--nnz != 0);
        }
        float32x4_t vout0123n0 = vminq_f32(vacc0123n0, vmax);
        float32x4_t vout0123n1 = vminq_f32(vacc0123n1, vmax);

        vout0123n0 = vmaxq_f32(vout0123n0, vmin);
        vout0123n1 = vmaxq_f32(vout0123n1, vmin);

        vst1q_f32(output + 0, vout0123n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1q_f32(output + 0, vout0123n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t vi0123 = vld1q_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x4_t vw = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(output + 0, vout0123);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 4;
    }
    output_decrement += 2 * sizeof(float);
    if (mc & (2 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01n1 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi01 = vld1_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x2_t vw = vld1_f32(w); w += 2;

            vacc01n0 = vfma_lane_f32(vacc01n0, vi01, vw, 0);
            vacc01n1 = vfma_lane_f32(vacc01n1, vi01, vw, 1);
          } while (--nnz != 0);
        }
        float32x2_t vout01n0 = vmin_f32(vacc01n0, vget_low_f32(vmax));
        float32x2_t vout01n1 = vmin_f32(vacc01n1, vget_low_f32(vmax));

        vout01n0 = vmax_f32(vout01n0, vget_low_f32(vmin));
        vout01n1 = vmax_f32(vout01n1, vget_low_f32(vmin));

        vst1_f32(output + 0, vout01n0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_f32(output + 0, vout01n1);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi01 = vld1_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, vi01, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(output, vout01);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 2;
    }
    output_decrement += 1 * sizeof(float);
    if (mc & (1 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      while (n >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0n0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0n1 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t vi0 = vld1_dup_f32(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const float32x2_t vw = vld1_f32(w); w += 2;

            vacc0n0 = vfma_lane_f32(vacc0n0, vi0, vw, 0);
            vacc0n1 = vfma_lane_f32(vacc0n1, vi0, vw, 1);
          } while (--nnz != 0);
        }
        float32x2_t vout0n0 = vmin_f32(vacc0n0, vget_low_f32(vmax));
        float32x2_t vout0n1 = vmin_f32(vacc0n1, vget_low_f32(vmax));

        vout0n0 = vmax_f32(vout0n0, vget_low_f32(vmin));
        vout0n1 = vmax_f32(vout0n1, vget_low_f32(vmin));

        vst1_lane_f32(output + 0, vout0n0, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        vst1_lane_f32(output + 0, vout0n1, 0);
        output = (float*restrict) ((uintptr_t) output + output_stride);
        n -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(n != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t vi0 = vld1_dup_f32(input);
              input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
              const float32x2_t vw = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, vi0, vw);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          // Fix: store lane 0, not lane 1, consistent with the other scalar
          // stores above. (Lane 1 happened to hold the same value only because
          // every load in this path duplicates across both lanes.)
          vst1_lane_f32(output, vout0, 0);
          output = (float*restrict) ((uintptr_t) output + output_stride);
          n -= 1;
        } while (n != 0);
      }
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 1;
    }
    }
}
588