xref: /aosp_15_r20/external/XNNPACK/src/f32-spmm/gen/8x4-minmax-scalar.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-spmm/scalar.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <xnnpack/math.h>
13 #include <xnnpack/spmm.h>
14 
15 
xnn_f32_spmm_minmax_ukernel_8x4__scalar(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_spmm_minmax_ukernel_8x4__scalar(
17     size_t mc,
18     size_t nc,
19     const float*restrict input,
20     const float*restrict weights,
21     const int32_t*restrict widx_dmap,
22     const uint32_t*restrict nidx_nnzmap,
23     float*restrict output,
24     size_t output_stride,
25     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27   assert(mc != 0);
28   assert(mc % sizeof(float) == 0);
29   assert(nc != 0);
30 
31   const float vmin = params->scalar.min;
32   const float vmax = params->scalar.max;
33   size_t output_decrement = output_stride * nc - 8 * sizeof(float);
34   while (mc >= 8 * sizeof(float)) {
35     const float*restrict w = weights;
36     const int32_t* dmap = widx_dmap;
37     const uint32_t* nnzmap = nidx_nnzmap;
38     size_t n = nc;
39     while (n >= 4) {
40       uint32_t nnz = *nnzmap++;
41       float vacc0x0 = *w++;
42       float vacc1x0 = vacc0x0;
43       float vacc2x0 = vacc0x0;
44       float vacc3x0 = vacc0x0;
45       float vacc4x0 = vacc0x0;
46       float vacc5x0 = vacc0x0;
47       float vacc6x0 = vacc0x0;
48       float vacc7x0 = vacc0x0;
49       float vacc0x1 = *w++;
50       float vacc1x1 = vacc0x1;
51       float vacc2x1 = vacc0x1;
52       float vacc3x1 = vacc0x1;
53       float vacc4x1 = vacc0x1;
54       float vacc5x1 = vacc0x1;
55       float vacc6x1 = vacc0x1;
56       float vacc7x1 = vacc0x1;
57       float vacc0x2 = *w++;
58       float vacc1x2 = vacc0x2;
59       float vacc2x2 = vacc0x2;
60       float vacc3x2 = vacc0x2;
61       float vacc4x2 = vacc0x2;
62       float vacc5x2 = vacc0x2;
63       float vacc6x2 = vacc0x2;
64       float vacc7x2 = vacc0x2;
65       float vacc0x3 = *w++;
66       float vacc1x3 = vacc0x3;
67       float vacc2x3 = vacc0x3;
68       float vacc3x3 = vacc0x3;
69       float vacc4x3 = vacc0x3;
70       float vacc5x3 = vacc0x3;
71       float vacc6x3 = vacc0x3;
72       float vacc7x3 = vacc0x3;
73       if XNN_LIKELY(nnz != 0) {
74         do {
75           const intptr_t diff = *dmap++;
76           const float vi0 = input[0];
77           const float vi1 = input[1];
78           const float vi2 = input[2];
79           const float vi3 = input[3];
80           const float vi4 = input[4];
81           const float vi5 = input[5];
82           const float vi6 = input[6];
83           const float vi7 = input[7];
84           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
85           const float vw0 = *w++;
86           const float vw1 = *w++;
87           const float vw2 = *w++;
88           const float vw3 = *w++;
89           vacc0x0 += vi0 * vw0;
90           vacc1x0 += vi1 * vw0;
91           vacc2x0 += vi2 * vw0;
92           vacc3x0 += vi3 * vw0;
93           vacc4x0 += vi4 * vw0;
94           vacc5x0 += vi5 * vw0;
95           vacc6x0 += vi6 * vw0;
96           vacc7x0 += vi7 * vw0;
97           vacc0x1 += vi0 * vw1;
98           vacc1x1 += vi1 * vw1;
99           vacc2x1 += vi2 * vw1;
100           vacc3x1 += vi3 * vw1;
101           vacc4x1 += vi4 * vw1;
102           vacc5x1 += vi5 * vw1;
103           vacc6x1 += vi6 * vw1;
104           vacc7x1 += vi7 * vw1;
105           vacc0x2 += vi0 * vw2;
106           vacc1x2 += vi1 * vw2;
107           vacc2x2 += vi2 * vw2;
108           vacc3x2 += vi3 * vw2;
109           vacc4x2 += vi4 * vw2;
110           vacc5x2 += vi5 * vw2;
111           vacc6x2 += vi6 * vw2;
112           vacc7x2 += vi7 * vw2;
113           vacc0x3 += vi0 * vw3;
114           vacc1x3 += vi1 * vw3;
115           vacc2x3 += vi2 * vw3;
116           vacc3x3 += vi3 * vw3;
117           vacc4x3 += vi4 * vw3;
118           vacc5x3 += vi5 * vw3;
119           vacc6x3 += vi6 * vw3;
120           vacc7x3 += vi7 * vw3;
121         } while (--nnz != 0);
122       }
123       float vout0x0 = math_min_f32(vacc0x0, vmax);
124       float vout1x0 = math_min_f32(vacc1x0, vmax);
125       float vout2x0 = math_min_f32(vacc2x0, vmax);
126       float vout3x0 = math_min_f32(vacc3x0, vmax);
127       float vout4x0 = math_min_f32(vacc4x0, vmax);
128       float vout5x0 = math_min_f32(vacc5x0, vmax);
129       float vout6x0 = math_min_f32(vacc6x0, vmax);
130       float vout7x0 = math_min_f32(vacc7x0, vmax);
131       float vout0x1 = math_min_f32(vacc0x1, vmax);
132       float vout1x1 = math_min_f32(vacc1x1, vmax);
133       float vout2x1 = math_min_f32(vacc2x1, vmax);
134       float vout3x1 = math_min_f32(vacc3x1, vmax);
135       float vout4x1 = math_min_f32(vacc4x1, vmax);
136       float vout5x1 = math_min_f32(vacc5x1, vmax);
137       float vout6x1 = math_min_f32(vacc6x1, vmax);
138       float vout7x1 = math_min_f32(vacc7x1, vmax);
139       float vout0x2 = math_min_f32(vacc0x2, vmax);
140       float vout1x2 = math_min_f32(vacc1x2, vmax);
141       float vout2x2 = math_min_f32(vacc2x2, vmax);
142       float vout3x2 = math_min_f32(vacc3x2, vmax);
143       float vout4x2 = math_min_f32(vacc4x2, vmax);
144       float vout5x2 = math_min_f32(vacc5x2, vmax);
145       float vout6x2 = math_min_f32(vacc6x2, vmax);
146       float vout7x2 = math_min_f32(vacc7x2, vmax);
147       float vout0x3 = math_min_f32(vacc0x3, vmax);
148       float vout1x3 = math_min_f32(vacc1x3, vmax);
149       float vout2x3 = math_min_f32(vacc2x3, vmax);
150       float vout3x3 = math_min_f32(vacc3x3, vmax);
151       float vout4x3 = math_min_f32(vacc4x3, vmax);
152       float vout5x3 = math_min_f32(vacc5x3, vmax);
153       float vout6x3 = math_min_f32(vacc6x3, vmax);
154       float vout7x3 = math_min_f32(vacc7x3, vmax);
155       vout0x0 = math_max_f32(vout0x0, vmin);
156       vout1x0 = math_max_f32(vout1x0, vmin);
157       vout2x0 = math_max_f32(vout2x0, vmin);
158       vout3x0 = math_max_f32(vout3x0, vmin);
159       vout4x0 = math_max_f32(vout4x0, vmin);
160       vout5x0 = math_max_f32(vout5x0, vmin);
161       vout6x0 = math_max_f32(vout6x0, vmin);
162       vout7x0 = math_max_f32(vout7x0, vmin);
163       vout0x1 = math_max_f32(vout0x1, vmin);
164       vout1x1 = math_max_f32(vout1x1, vmin);
165       vout2x1 = math_max_f32(vout2x1, vmin);
166       vout3x1 = math_max_f32(vout3x1, vmin);
167       vout4x1 = math_max_f32(vout4x1, vmin);
168       vout5x1 = math_max_f32(vout5x1, vmin);
169       vout6x1 = math_max_f32(vout6x1, vmin);
170       vout7x1 = math_max_f32(vout7x1, vmin);
171       vout0x2 = math_max_f32(vout0x2, vmin);
172       vout1x2 = math_max_f32(vout1x2, vmin);
173       vout2x2 = math_max_f32(vout2x2, vmin);
174       vout3x2 = math_max_f32(vout3x2, vmin);
175       vout4x2 = math_max_f32(vout4x2, vmin);
176       vout5x2 = math_max_f32(vout5x2, vmin);
177       vout6x2 = math_max_f32(vout6x2, vmin);
178       vout7x2 = math_max_f32(vout7x2, vmin);
179       vout0x3 = math_max_f32(vout0x3, vmin);
180       vout1x3 = math_max_f32(vout1x3, vmin);
181       vout2x3 = math_max_f32(vout2x3, vmin);
182       vout3x3 = math_max_f32(vout3x3, vmin);
183       vout4x3 = math_max_f32(vout4x3, vmin);
184       vout5x3 = math_max_f32(vout5x3, vmin);
185       vout6x3 = math_max_f32(vout6x3, vmin);
186       vout7x3 = math_max_f32(vout7x3, vmin);
187       output[0] = vout0x3;
188       output[1] = vout1x3;
189       output[2] = vout2x3;
190       output[3] = vout3x3;
191       output[4] = vout4x3;
192       output[5] = vout5x3;
193       output[6] = vout6x3;
194       output[7] = vout7x3;
195       output[0] = vout0x0;
196       output[1] = vout1x0;
197       output[2] = vout2x0;
198       output[3] = vout3x0;
199       output[4] = vout4x0;
200       output[5] = vout5x0;
201       output[6] = vout6x0;
202       output[7] = vout7x0;
203       output = (float*restrict) ((uintptr_t) output + output_stride);
204       output[0] = vout0x1;
205       output[1] = vout1x1;
206       output[2] = vout2x1;
207       output[3] = vout3x1;
208       output[4] = vout4x1;
209       output[5] = vout5x1;
210       output[6] = vout6x1;
211       output[7] = vout7x1;
212       output = (float*restrict) ((uintptr_t) output + output_stride);
213       output[0] = vout0x2;
214       output[1] = vout1x2;
215       output[2] = vout2x2;
216       output[3] = vout3x2;
217       output[4] = vout4x2;
218       output[5] = vout5x2;
219       output[6] = vout6x2;
220       output[7] = vout7x2;
221       output = (float*restrict) ((uintptr_t) output + output_stride);
222       output[0] = vout0x3;
223       output[1] = vout1x3;
224       output[2] = vout2x3;
225       output[3] = vout3x3;
226       output[4] = vout4x3;
227       output[5] = vout5x3;
228       output[6] = vout6x3;
229       output[7] = vout7x3;
230       output = (float*restrict) ((uintptr_t) output + output_stride);
231       n -= 4;
232     }
233     if XNN_UNLIKELY(n != 0) {
234       do {
235         uint32_t nnz = *nnzmap++;
236         float vacc0 = *w++;
237         float vacc1 = vacc0;
238         float vacc2 = vacc0;
239         float vacc3 = vacc0;
240         float vacc4 = vacc0;
241         float vacc5 = vacc0;
242         float vacc6 = vacc0;
243         float vacc7 = vacc0;
244         if XNN_LIKELY(nnz != 0) {
245           do {
246             const intptr_t diff = *dmap++;
247             const float vi0 = input[0];
248             const float vi1 = input[1];
249             const float vi2 = input[2];
250             const float vi3 = input[3];
251             const float vi4 = input[4];
252             const float vi5 = input[5];
253             const float vi6 = input[6];
254             const float vi7 = input[7];
255             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
256             const float vw = *w++;
257             vacc0 += vi0 * vw;
258             vacc1 += vi1 * vw;
259             vacc2 += vi2 * vw;
260             vacc3 += vi3 * vw;
261             vacc4 += vi4 * vw;
262             vacc5 += vi5 * vw;
263             vacc6 += vi6 * vw;
264             vacc7 += vi7 * vw;
265           } while (--nnz != 0);
266         }
267         float vout0 = math_min_f32(vacc0, vmax);
268         float vout1 = math_min_f32(vacc1, vmax);
269         float vout2 = math_min_f32(vacc2, vmax);
270         float vout3 = math_min_f32(vacc3, vmax);
271         float vout4 = math_min_f32(vacc4, vmax);
272         float vout5 = math_min_f32(vacc5, vmax);
273         float vout6 = math_min_f32(vacc6, vmax);
274         float vout7 = math_min_f32(vacc7, vmax);
275         vout0 = math_max_f32(vout0, vmin);
276         vout1 = math_max_f32(vout1, vmin);
277         vout2 = math_max_f32(vout2, vmin);
278         vout3 = math_max_f32(vout3, vmin);
279         vout4 = math_max_f32(vout4, vmin);
280         vout5 = math_max_f32(vout5, vmin);
281         vout6 = math_max_f32(vout6, vmin);
282         vout7 = math_max_f32(vout7, vmin);
283         output[0] = vout0;
284         output[1] = vout1;
285         output[2] = vout2;
286         output[3] = vout3;
287         output[4] = vout4;
288         output[5] = vout5;
289         output[6] = vout6;
290         output[7] = vout7;
291         output = (float*restrict) ((uintptr_t) output + output_stride);
292         n -= 1;
293       } while (n != 0);
294     }
295     output = (float*restrict) ((uintptr_t) output - output_decrement);
296     input += 8;
297     mc -= 8 * sizeof(float);
298   }
299   if XNN_UNLIKELY(mc != 0) {
300     output_decrement += 4 * sizeof(float);
301     if (mc & (4 * sizeof(float))) {
302       const float*restrict w = weights;
303       const int32_t* dmap = widx_dmap;
304       const uint32_t* nnzmap = nidx_nnzmap;
305       size_t n = nc;
306       while (n >= 4) {
307         uint32_t nnz = *nnzmap++;
308         float vacc0x0 = *w++;
309         float vacc1x0 = vacc0x0;
310         float vacc2x0 = vacc0x0;
311         float vacc3x0 = vacc0x0;
312         float vacc0x1 = *w++;
313         float vacc1x1 = vacc0x1;
314         float vacc2x1 = vacc0x1;
315         float vacc3x1 = vacc0x1;
316         float vacc0x2 = *w++;
317         float vacc1x2 = vacc0x2;
318         float vacc2x2 = vacc0x2;
319         float vacc3x2 = vacc0x2;
320         float vacc0x3 = *w++;
321         float vacc1x3 = vacc0x3;
322         float vacc2x3 = vacc0x3;
323         float vacc3x3 = vacc0x3;
324         if XNN_LIKELY(nnz != 0) {
325           do {
326             const intptr_t diff = *dmap++;
327             const float vi0 = input[0];
328             const float vi1 = input[1];
329             const float vi2 = input[2];
330             const float vi3 = input[3];
331             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
332             const float vw0 = *w++;
333             const float vw1 = *w++;
334             const float vw2 = *w++;
335             const float vw3 = *w++;
336             vacc0x0 += vi0 * vw0;
337             vacc1x0 += vi1 * vw0;
338             vacc2x0 += vi2 * vw0;
339             vacc3x0 += vi3 * vw0;
340             vacc0x1 += vi0 * vw1;
341             vacc1x1 += vi1 * vw1;
342             vacc2x1 += vi2 * vw1;
343             vacc3x1 += vi3 * vw1;
344             vacc0x2 += vi0 * vw2;
345             vacc1x2 += vi1 * vw2;
346             vacc2x2 += vi2 * vw2;
347             vacc3x2 += vi3 * vw2;
348             vacc0x3 += vi0 * vw3;
349             vacc1x3 += vi1 * vw3;
350             vacc2x3 += vi2 * vw3;
351             vacc3x3 += vi3 * vw3;
352           } while (--nnz != 0);
353         }
354         float vout0x0 = math_min_f32(vacc0x0, vmax);
355         float vout1x0 = math_min_f32(vacc1x0, vmax);
356         float vout2x0 = math_min_f32(vacc2x0, vmax);
357         float vout3x0 = math_min_f32(vacc3x0, vmax);
358         float vout0x1 = math_min_f32(vacc0x1, vmax);
359         float vout1x1 = math_min_f32(vacc1x1, vmax);
360         float vout2x1 = math_min_f32(vacc2x1, vmax);
361         float vout3x1 = math_min_f32(vacc3x1, vmax);
362         float vout0x2 = math_min_f32(vacc0x2, vmax);
363         float vout1x2 = math_min_f32(vacc1x2, vmax);
364         float vout2x2 = math_min_f32(vacc2x2, vmax);
365         float vout3x2 = math_min_f32(vacc3x2, vmax);
366         float vout0x3 = math_min_f32(vacc0x3, vmax);
367         float vout1x3 = math_min_f32(vacc1x3, vmax);
368         float vout2x3 = math_min_f32(vacc2x3, vmax);
369         float vout3x3 = math_min_f32(vacc3x3, vmax);
370         vout0x0 = math_max_f32(vout0x0, vmin);
371         vout1x0 = math_max_f32(vout1x0, vmin);
372         vout2x0 = math_max_f32(vout2x0, vmin);
373         vout3x0 = math_max_f32(vout3x0, vmin);
374         vout0x1 = math_max_f32(vout0x1, vmin);
375         vout1x1 = math_max_f32(vout1x1, vmin);
376         vout2x1 = math_max_f32(vout2x1, vmin);
377         vout3x1 = math_max_f32(vout3x1, vmin);
378         vout0x2 = math_max_f32(vout0x2, vmin);
379         vout1x2 = math_max_f32(vout1x2, vmin);
380         vout2x2 = math_max_f32(vout2x2, vmin);
381         vout3x2 = math_max_f32(vout3x2, vmin);
382         vout0x3 = math_max_f32(vout0x3, vmin);
383         vout1x3 = math_max_f32(vout1x3, vmin);
384         vout2x3 = math_max_f32(vout2x3, vmin);
385         vout3x3 = math_max_f32(vout3x3, vmin);
386         output[0] = vout0x0;
387         output[1] = vout1x0;
388         output[2] = vout2x0;
389         output[3] = vout3x0;
390         output = (float*restrict) ((uintptr_t) output + output_stride);
391         output[0] = vout0x1;
392         output[1] = vout1x1;
393         output[2] = vout2x1;
394         output[3] = vout3x1;
395         output = (float*restrict) ((uintptr_t) output + output_stride);
396         output[0] = vout0x2;
397         output[1] = vout1x2;
398         output[2] = vout2x2;
399         output[3] = vout3x2;
400         output = (float*restrict) ((uintptr_t) output + output_stride);
401         output[0] = vout0x3;
402         output[1] = vout1x3;
403         output[2] = vout2x3;
404         output[3] = vout3x3;
405         output = (float*restrict) ((uintptr_t) output + output_stride);
406         n -= 4;
407       }
408       if XNN_UNLIKELY(n != 0) {
409         do {
410           uint32_t nnz = *nnzmap++;
411           float vacc0 = *w++;
412           float vacc1 = vacc0;
413           float vacc2 = vacc0;
414           float vacc3 = vacc0;
415           if XNN_LIKELY(nnz != 0) {
416             do {
417               const intptr_t diff = *dmap++;
418               const float vi0 = input[0];
419               const float vi1 = input[1];
420               const float vi2 = input[2];
421               const float vi3 = input[3];
422               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
423               const float vw = *w++;
424               vacc0 += vi0 * vw;
425               vacc1 += vi1 * vw;
426               vacc2 += vi2 * vw;
427               vacc3 += vi3 * vw;
428             } while (--nnz != 0);
429           }
430           float vout0 = math_min_f32(vacc0, vmax);
431           float vout1 = math_min_f32(vacc1, vmax);
432           float vout2 = math_min_f32(vacc2, vmax);
433           float vout3 = math_min_f32(vacc3, vmax);
434           vout0 = math_max_f32(vout0, vmin);
435           vout1 = math_max_f32(vout1, vmin);
436           vout2 = math_max_f32(vout2, vmin);
437           vout3 = math_max_f32(vout3, vmin);
438           output[0] = vout0;
439           output[1] = vout1;
440           output[2] = vout2;
441           output[3] = vout3;
442           output = (float*restrict) ((uintptr_t) output + output_stride);
443           n -= 1;
444         } while (n != 0);
445       }
446       output = (float*restrict) ((uintptr_t) output - output_decrement);
447       input += 4;
448     }
449     output_decrement += 2 * sizeof(float);
450     if (mc & (2 * sizeof(float))) {
451       const float*restrict w = weights;
452       const int32_t* dmap = widx_dmap;
453       const uint32_t* nnzmap = nidx_nnzmap;
454       size_t n = nc;
455       while (n >= 4) {
456         uint32_t nnz = *nnzmap++;
457         float vacc0x0 = *w++;
458         float vacc1x0 = vacc0x0;
459         float vacc0x1 = *w++;
460         float vacc1x1 = vacc0x1;
461         float vacc0x2 = *w++;
462         float vacc1x2 = vacc0x2;
463         float vacc0x3 = *w++;
464         float vacc1x3 = vacc0x3;
465         if XNN_LIKELY(nnz != 0) {
466           do {
467             const intptr_t diff = *dmap++;
468             const float vi0 = input[0];
469             const float vi1 = input[1];
470             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
471             const float vw0 = *w++;
472             const float vw1 = *w++;
473             const float vw2 = *w++;
474             const float vw3 = *w++;
475             vacc0x0 += vi0 * vw0;
476             vacc1x0 += vi1 * vw0;
477             vacc0x1 += vi0 * vw1;
478             vacc1x1 += vi1 * vw1;
479             vacc0x2 += vi0 * vw2;
480             vacc1x2 += vi1 * vw2;
481             vacc0x3 += vi0 * vw3;
482             vacc1x3 += vi1 * vw3;
483           } while (--nnz != 0);
484         }
485         float vout0x0 = math_min_f32(vacc0x0, vmax);
486         float vout1x0 = math_min_f32(vacc1x0, vmax);
487         float vout0x1 = math_min_f32(vacc0x1, vmax);
488         float vout1x1 = math_min_f32(vacc1x1, vmax);
489         float vout0x2 = math_min_f32(vacc0x2, vmax);
490         float vout1x2 = math_min_f32(vacc1x2, vmax);
491         float vout0x3 = math_min_f32(vacc0x3, vmax);
492         float vout1x3 = math_min_f32(vacc1x3, vmax);
493         vout0x0 = math_max_f32(vout0x0, vmin);
494         vout1x0 = math_max_f32(vout1x0, vmin);
495         vout0x1 = math_max_f32(vout0x1, vmin);
496         vout1x1 = math_max_f32(vout1x1, vmin);
497         vout0x2 = math_max_f32(vout0x2, vmin);
498         vout1x2 = math_max_f32(vout1x2, vmin);
499         vout0x3 = math_max_f32(vout0x3, vmin);
500         vout1x3 = math_max_f32(vout1x3, vmin);
501         output[0] = vout0x0;
502         output[1] = vout1x0;
503         output = (float*restrict) ((uintptr_t) output + output_stride);
504         output[0] = vout0x1;
505         output[1] = vout1x1;
506         output = (float*restrict) ((uintptr_t) output + output_stride);
507         output[0] = vout0x2;
508         output[1] = vout1x2;
509         output = (float*restrict) ((uintptr_t) output + output_stride);
510         output[0] = vout0x3;
511         output[1] = vout1x3;
512         output = (float*restrict) ((uintptr_t) output + output_stride);
513         n -= 4;
514       }
515       if XNN_UNLIKELY(n != 0) {
516         do {
517           uint32_t nnz = *nnzmap++;
518           float vacc0 = *w++;
519           float vacc1 = vacc0;
520           if XNN_LIKELY(nnz != 0) {
521             do {
522               const intptr_t diff = *dmap++;
523               const float vi0 = input[0];
524               const float vi1 = input[1];
525               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
526               const float vw = *w++;
527               vacc0 += vi0 * vw;
528               vacc1 += vi1 * vw;
529             } while (--nnz != 0);
530           }
531           float vout0 = math_min_f32(vacc0, vmax);
532           float vout1 = math_min_f32(vacc1, vmax);
533           vout0 = math_max_f32(vout0, vmin);
534           vout1 = math_max_f32(vout1, vmin);
535           output[0] = vout0;
536           output[1] = vout1;
537           output = (float*restrict) ((uintptr_t) output + output_stride);
538           n -= 1;
539         } while (n != 0);
540       }
541       output = (float*restrict) ((uintptr_t) output - output_decrement);
542       input += 2;
543     }
544     output_decrement += 1 * sizeof(float);
545     if (mc & (1 * sizeof(float))) {
546       const float*restrict w = weights;
547       const int32_t* dmap = widx_dmap;
548       const uint32_t* nnzmap = nidx_nnzmap;
549       size_t n = nc;
550       while (n >= 4) {
551         uint32_t nnz = *nnzmap++;
552         float vacc0x0 = *w++;
553         float vacc0x1 = *w++;
554         float vacc0x2 = *w++;
555         float vacc0x3 = *w++;
556         if XNN_LIKELY(nnz != 0) {
557           do {
558             const intptr_t diff = *dmap++;
559             const float vi0 = input[0];
560             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
561             const float vw0 = *w++;
562             const float vw1 = *w++;
563             const float vw2 = *w++;
564             const float vw3 = *w++;
565             vacc0x0 += vi0 * vw0;
566             vacc0x1 += vi0 * vw1;
567             vacc0x2 += vi0 * vw2;
568             vacc0x3 += vi0 * vw3;
569           } while (--nnz != 0);
570         }
571         float vout0x0 = math_min_f32(vacc0x0, vmax);
572         float vout0x1 = math_min_f32(vacc0x1, vmax);
573         float vout0x2 = math_min_f32(vacc0x2, vmax);
574         float vout0x3 = math_min_f32(vacc0x3, vmax);
575         vout0x0 = math_max_f32(vout0x0, vmin);
576         vout0x1 = math_max_f32(vout0x1, vmin);
577         vout0x2 = math_max_f32(vout0x2, vmin);
578         vout0x3 = math_max_f32(vout0x3, vmin);
579         output[0] = vout0x0;
580         output = (float*restrict) ((uintptr_t) output + output_stride);
581         output[0] = vout0x1;
582         output = (float*restrict) ((uintptr_t) output + output_stride);
583         output[0] = vout0x2;
584         output = (float*restrict) ((uintptr_t) output + output_stride);
585         output[0] = vout0x3;
586         output = (float*restrict) ((uintptr_t) output + output_stride);
587         n -= 4;
588       }
589       if XNN_UNLIKELY(n != 0) {
590         do {
591           uint32_t nnz = *nnzmap++;
592           float vacc0 = *w++;
593           if XNN_LIKELY(nnz != 0) {
594             do {
595               const intptr_t diff = *dmap++;
596               const float vi0 = input[0];
597               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
598               const float vw = *w++;
599               vacc0 += vi0 * vw;
600             } while (--nnz != 0);
601           }
602           float vout0 = math_min_f32(vacc0, vmax);
603           vout0 = math_max_f32(vout0, vmin);
604           output[0] = vout0;
605           output = (float*restrict) ((uintptr_t) output + output_stride);
606           n -= 1;
607         } while (n != 0);
608       }
609       output = (float*restrict) ((uintptr_t) output - output_decrement);
610       input += 1;
611     }
612   }
613 }
614