1 // Auto-generated file. Do not edit!
2 // Template: src/f32-spmm/scalar.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <xnnpack/math.h>
13 #include <xnnpack/spmm.h>
14
15
xnn_f32_spmm_minmax_ukernel_8x4__scalar(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_spmm_minmax_ukernel_8x4__scalar(
17 size_t mc,
18 size_t nc,
19 const float*restrict input,
20 const float*restrict weights,
21 const int32_t*restrict widx_dmap,
22 const uint32_t*restrict nidx_nnzmap,
23 float*restrict output,
24 size_t output_stride,
25 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27 assert(mc != 0);
28 assert(mc % sizeof(float) == 0);
29 assert(nc != 0);
30
31 const float vmin = params->scalar.min;
32 const float vmax = params->scalar.max;
33 size_t output_decrement = output_stride * nc - 8 * sizeof(float);
34 while (mc >= 8 * sizeof(float)) {
35 const float*restrict w = weights;
36 const int32_t* dmap = widx_dmap;
37 const uint32_t* nnzmap = nidx_nnzmap;
38 size_t n = nc;
39 while (n >= 4) {
40 uint32_t nnz = *nnzmap++;
41 float vacc0x0 = *w++;
42 float vacc1x0 = vacc0x0;
43 float vacc2x0 = vacc0x0;
44 float vacc3x0 = vacc0x0;
45 float vacc4x0 = vacc0x0;
46 float vacc5x0 = vacc0x0;
47 float vacc6x0 = vacc0x0;
48 float vacc7x0 = vacc0x0;
49 float vacc0x1 = *w++;
50 float vacc1x1 = vacc0x1;
51 float vacc2x1 = vacc0x1;
52 float vacc3x1 = vacc0x1;
53 float vacc4x1 = vacc0x1;
54 float vacc5x1 = vacc0x1;
55 float vacc6x1 = vacc0x1;
56 float vacc7x1 = vacc0x1;
57 float vacc0x2 = *w++;
58 float vacc1x2 = vacc0x2;
59 float vacc2x2 = vacc0x2;
60 float vacc3x2 = vacc0x2;
61 float vacc4x2 = vacc0x2;
62 float vacc5x2 = vacc0x2;
63 float vacc6x2 = vacc0x2;
64 float vacc7x2 = vacc0x2;
65 float vacc0x3 = *w++;
66 float vacc1x3 = vacc0x3;
67 float vacc2x3 = vacc0x3;
68 float vacc3x3 = vacc0x3;
69 float vacc4x3 = vacc0x3;
70 float vacc5x3 = vacc0x3;
71 float vacc6x3 = vacc0x3;
72 float vacc7x3 = vacc0x3;
73 if XNN_LIKELY(nnz != 0) {
74 do {
75 const intptr_t diff = *dmap++;
76 const float vi0 = input[0];
77 const float vi1 = input[1];
78 const float vi2 = input[2];
79 const float vi3 = input[3];
80 const float vi4 = input[4];
81 const float vi5 = input[5];
82 const float vi6 = input[6];
83 const float vi7 = input[7];
84 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
85 const float vw0 = *w++;
86 const float vw1 = *w++;
87 const float vw2 = *w++;
88 const float vw3 = *w++;
89 vacc0x0 += vi0 * vw0;
90 vacc1x0 += vi1 * vw0;
91 vacc2x0 += vi2 * vw0;
92 vacc3x0 += vi3 * vw0;
93 vacc4x0 += vi4 * vw0;
94 vacc5x0 += vi5 * vw0;
95 vacc6x0 += vi6 * vw0;
96 vacc7x0 += vi7 * vw0;
97 vacc0x1 += vi0 * vw1;
98 vacc1x1 += vi1 * vw1;
99 vacc2x1 += vi2 * vw1;
100 vacc3x1 += vi3 * vw1;
101 vacc4x1 += vi4 * vw1;
102 vacc5x1 += vi5 * vw1;
103 vacc6x1 += vi6 * vw1;
104 vacc7x1 += vi7 * vw1;
105 vacc0x2 += vi0 * vw2;
106 vacc1x2 += vi1 * vw2;
107 vacc2x2 += vi2 * vw2;
108 vacc3x2 += vi3 * vw2;
109 vacc4x2 += vi4 * vw2;
110 vacc5x2 += vi5 * vw2;
111 vacc6x2 += vi6 * vw2;
112 vacc7x2 += vi7 * vw2;
113 vacc0x3 += vi0 * vw3;
114 vacc1x3 += vi1 * vw3;
115 vacc2x3 += vi2 * vw3;
116 vacc3x3 += vi3 * vw3;
117 vacc4x3 += vi4 * vw3;
118 vacc5x3 += vi5 * vw3;
119 vacc6x3 += vi6 * vw3;
120 vacc7x3 += vi7 * vw3;
121 } while (--nnz != 0);
122 }
123 float vout0x0 = math_min_f32(vacc0x0, vmax);
124 float vout1x0 = math_min_f32(vacc1x0, vmax);
125 float vout2x0 = math_min_f32(vacc2x0, vmax);
126 float vout3x0 = math_min_f32(vacc3x0, vmax);
127 float vout4x0 = math_min_f32(vacc4x0, vmax);
128 float vout5x0 = math_min_f32(vacc5x0, vmax);
129 float vout6x0 = math_min_f32(vacc6x0, vmax);
130 float vout7x0 = math_min_f32(vacc7x0, vmax);
131 float vout0x1 = math_min_f32(vacc0x1, vmax);
132 float vout1x1 = math_min_f32(vacc1x1, vmax);
133 float vout2x1 = math_min_f32(vacc2x1, vmax);
134 float vout3x1 = math_min_f32(vacc3x1, vmax);
135 float vout4x1 = math_min_f32(vacc4x1, vmax);
136 float vout5x1 = math_min_f32(vacc5x1, vmax);
137 float vout6x1 = math_min_f32(vacc6x1, vmax);
138 float vout7x1 = math_min_f32(vacc7x1, vmax);
139 float vout0x2 = math_min_f32(vacc0x2, vmax);
140 float vout1x2 = math_min_f32(vacc1x2, vmax);
141 float vout2x2 = math_min_f32(vacc2x2, vmax);
142 float vout3x2 = math_min_f32(vacc3x2, vmax);
143 float vout4x2 = math_min_f32(vacc4x2, vmax);
144 float vout5x2 = math_min_f32(vacc5x2, vmax);
145 float vout6x2 = math_min_f32(vacc6x2, vmax);
146 float vout7x2 = math_min_f32(vacc7x2, vmax);
147 float vout0x3 = math_min_f32(vacc0x3, vmax);
148 float vout1x3 = math_min_f32(vacc1x3, vmax);
149 float vout2x3 = math_min_f32(vacc2x3, vmax);
150 float vout3x3 = math_min_f32(vacc3x3, vmax);
151 float vout4x3 = math_min_f32(vacc4x3, vmax);
152 float vout5x3 = math_min_f32(vacc5x3, vmax);
153 float vout6x3 = math_min_f32(vacc6x3, vmax);
154 float vout7x3 = math_min_f32(vacc7x3, vmax);
155 vout0x0 = math_max_f32(vout0x0, vmin);
156 vout1x0 = math_max_f32(vout1x0, vmin);
157 vout2x0 = math_max_f32(vout2x0, vmin);
158 vout3x0 = math_max_f32(vout3x0, vmin);
159 vout4x0 = math_max_f32(vout4x0, vmin);
160 vout5x0 = math_max_f32(vout5x0, vmin);
161 vout6x0 = math_max_f32(vout6x0, vmin);
162 vout7x0 = math_max_f32(vout7x0, vmin);
163 vout0x1 = math_max_f32(vout0x1, vmin);
164 vout1x1 = math_max_f32(vout1x1, vmin);
165 vout2x1 = math_max_f32(vout2x1, vmin);
166 vout3x1 = math_max_f32(vout3x1, vmin);
167 vout4x1 = math_max_f32(vout4x1, vmin);
168 vout5x1 = math_max_f32(vout5x1, vmin);
169 vout6x1 = math_max_f32(vout6x1, vmin);
170 vout7x1 = math_max_f32(vout7x1, vmin);
171 vout0x2 = math_max_f32(vout0x2, vmin);
172 vout1x2 = math_max_f32(vout1x2, vmin);
173 vout2x2 = math_max_f32(vout2x2, vmin);
174 vout3x2 = math_max_f32(vout3x2, vmin);
175 vout4x2 = math_max_f32(vout4x2, vmin);
176 vout5x2 = math_max_f32(vout5x2, vmin);
177 vout6x2 = math_max_f32(vout6x2, vmin);
178 vout7x2 = math_max_f32(vout7x2, vmin);
179 vout0x3 = math_max_f32(vout0x3, vmin);
180 vout1x3 = math_max_f32(vout1x3, vmin);
181 vout2x3 = math_max_f32(vout2x3, vmin);
182 vout3x3 = math_max_f32(vout3x3, vmin);
183 vout4x3 = math_max_f32(vout4x3, vmin);
184 vout5x3 = math_max_f32(vout5x3, vmin);
185 vout6x3 = math_max_f32(vout6x3, vmin);
186 vout7x3 = math_max_f32(vout7x3, vmin);
187 output[0] = vout0x3;
188 output[1] = vout1x3;
189 output[2] = vout2x3;
190 output[3] = vout3x3;
191 output[4] = vout4x3;
192 output[5] = vout5x3;
193 output[6] = vout6x3;
194 output[7] = vout7x3;
195 output[0] = vout0x0;
196 output[1] = vout1x0;
197 output[2] = vout2x0;
198 output[3] = vout3x0;
199 output[4] = vout4x0;
200 output[5] = vout5x0;
201 output[6] = vout6x0;
202 output[7] = vout7x0;
203 output = (float*restrict) ((uintptr_t) output + output_stride);
204 output[0] = vout0x1;
205 output[1] = vout1x1;
206 output[2] = vout2x1;
207 output[3] = vout3x1;
208 output[4] = vout4x1;
209 output[5] = vout5x1;
210 output[6] = vout6x1;
211 output[7] = vout7x1;
212 output = (float*restrict) ((uintptr_t) output + output_stride);
213 output[0] = vout0x2;
214 output[1] = vout1x2;
215 output[2] = vout2x2;
216 output[3] = vout3x2;
217 output[4] = vout4x2;
218 output[5] = vout5x2;
219 output[6] = vout6x2;
220 output[7] = vout7x2;
221 output = (float*restrict) ((uintptr_t) output + output_stride);
222 output[0] = vout0x3;
223 output[1] = vout1x3;
224 output[2] = vout2x3;
225 output[3] = vout3x3;
226 output[4] = vout4x3;
227 output[5] = vout5x3;
228 output[6] = vout6x3;
229 output[7] = vout7x3;
230 output = (float*restrict) ((uintptr_t) output + output_stride);
231 n -= 4;
232 }
233 if XNN_UNLIKELY(n != 0) {
234 do {
235 uint32_t nnz = *nnzmap++;
236 float vacc0 = *w++;
237 float vacc1 = vacc0;
238 float vacc2 = vacc0;
239 float vacc3 = vacc0;
240 float vacc4 = vacc0;
241 float vacc5 = vacc0;
242 float vacc6 = vacc0;
243 float vacc7 = vacc0;
244 if XNN_LIKELY(nnz != 0) {
245 do {
246 const intptr_t diff = *dmap++;
247 const float vi0 = input[0];
248 const float vi1 = input[1];
249 const float vi2 = input[2];
250 const float vi3 = input[3];
251 const float vi4 = input[4];
252 const float vi5 = input[5];
253 const float vi6 = input[6];
254 const float vi7 = input[7];
255 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
256 const float vw = *w++;
257 vacc0 += vi0 * vw;
258 vacc1 += vi1 * vw;
259 vacc2 += vi2 * vw;
260 vacc3 += vi3 * vw;
261 vacc4 += vi4 * vw;
262 vacc5 += vi5 * vw;
263 vacc6 += vi6 * vw;
264 vacc7 += vi7 * vw;
265 } while (--nnz != 0);
266 }
267 float vout0 = math_min_f32(vacc0, vmax);
268 float vout1 = math_min_f32(vacc1, vmax);
269 float vout2 = math_min_f32(vacc2, vmax);
270 float vout3 = math_min_f32(vacc3, vmax);
271 float vout4 = math_min_f32(vacc4, vmax);
272 float vout5 = math_min_f32(vacc5, vmax);
273 float vout6 = math_min_f32(vacc6, vmax);
274 float vout7 = math_min_f32(vacc7, vmax);
275 vout0 = math_max_f32(vout0, vmin);
276 vout1 = math_max_f32(vout1, vmin);
277 vout2 = math_max_f32(vout2, vmin);
278 vout3 = math_max_f32(vout3, vmin);
279 vout4 = math_max_f32(vout4, vmin);
280 vout5 = math_max_f32(vout5, vmin);
281 vout6 = math_max_f32(vout6, vmin);
282 vout7 = math_max_f32(vout7, vmin);
283 output[0] = vout0;
284 output[1] = vout1;
285 output[2] = vout2;
286 output[3] = vout3;
287 output[4] = vout4;
288 output[5] = vout5;
289 output[6] = vout6;
290 output[7] = vout7;
291 output = (float*restrict) ((uintptr_t) output + output_stride);
292 n -= 1;
293 } while (n != 0);
294 }
295 output = (float*restrict) ((uintptr_t) output - output_decrement);
296 input += 8;
297 mc -= 8 * sizeof(float);
298 }
299 if XNN_UNLIKELY(mc != 0) {
300 output_decrement += 4 * sizeof(float);
301 if (mc & (4 * sizeof(float))) {
302 const float*restrict w = weights;
303 const int32_t* dmap = widx_dmap;
304 const uint32_t* nnzmap = nidx_nnzmap;
305 size_t n = nc;
306 while (n >= 4) {
307 uint32_t nnz = *nnzmap++;
308 float vacc0x0 = *w++;
309 float vacc1x0 = vacc0x0;
310 float vacc2x0 = vacc0x0;
311 float vacc3x0 = vacc0x0;
312 float vacc0x1 = *w++;
313 float vacc1x1 = vacc0x1;
314 float vacc2x1 = vacc0x1;
315 float vacc3x1 = vacc0x1;
316 float vacc0x2 = *w++;
317 float vacc1x2 = vacc0x2;
318 float vacc2x2 = vacc0x2;
319 float vacc3x2 = vacc0x2;
320 float vacc0x3 = *w++;
321 float vacc1x3 = vacc0x3;
322 float vacc2x3 = vacc0x3;
323 float vacc3x3 = vacc0x3;
324 if XNN_LIKELY(nnz != 0) {
325 do {
326 const intptr_t diff = *dmap++;
327 const float vi0 = input[0];
328 const float vi1 = input[1];
329 const float vi2 = input[2];
330 const float vi3 = input[3];
331 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
332 const float vw0 = *w++;
333 const float vw1 = *w++;
334 const float vw2 = *w++;
335 const float vw3 = *w++;
336 vacc0x0 += vi0 * vw0;
337 vacc1x0 += vi1 * vw0;
338 vacc2x0 += vi2 * vw0;
339 vacc3x0 += vi3 * vw0;
340 vacc0x1 += vi0 * vw1;
341 vacc1x1 += vi1 * vw1;
342 vacc2x1 += vi2 * vw1;
343 vacc3x1 += vi3 * vw1;
344 vacc0x2 += vi0 * vw2;
345 vacc1x2 += vi1 * vw2;
346 vacc2x2 += vi2 * vw2;
347 vacc3x2 += vi3 * vw2;
348 vacc0x3 += vi0 * vw3;
349 vacc1x3 += vi1 * vw3;
350 vacc2x3 += vi2 * vw3;
351 vacc3x3 += vi3 * vw3;
352 } while (--nnz != 0);
353 }
354 float vout0x0 = math_min_f32(vacc0x0, vmax);
355 float vout1x0 = math_min_f32(vacc1x0, vmax);
356 float vout2x0 = math_min_f32(vacc2x0, vmax);
357 float vout3x0 = math_min_f32(vacc3x0, vmax);
358 float vout0x1 = math_min_f32(vacc0x1, vmax);
359 float vout1x1 = math_min_f32(vacc1x1, vmax);
360 float vout2x1 = math_min_f32(vacc2x1, vmax);
361 float vout3x1 = math_min_f32(vacc3x1, vmax);
362 float vout0x2 = math_min_f32(vacc0x2, vmax);
363 float vout1x2 = math_min_f32(vacc1x2, vmax);
364 float vout2x2 = math_min_f32(vacc2x2, vmax);
365 float vout3x2 = math_min_f32(vacc3x2, vmax);
366 float vout0x3 = math_min_f32(vacc0x3, vmax);
367 float vout1x3 = math_min_f32(vacc1x3, vmax);
368 float vout2x3 = math_min_f32(vacc2x3, vmax);
369 float vout3x3 = math_min_f32(vacc3x3, vmax);
370 vout0x0 = math_max_f32(vout0x0, vmin);
371 vout1x0 = math_max_f32(vout1x0, vmin);
372 vout2x0 = math_max_f32(vout2x0, vmin);
373 vout3x0 = math_max_f32(vout3x0, vmin);
374 vout0x1 = math_max_f32(vout0x1, vmin);
375 vout1x1 = math_max_f32(vout1x1, vmin);
376 vout2x1 = math_max_f32(vout2x1, vmin);
377 vout3x1 = math_max_f32(vout3x1, vmin);
378 vout0x2 = math_max_f32(vout0x2, vmin);
379 vout1x2 = math_max_f32(vout1x2, vmin);
380 vout2x2 = math_max_f32(vout2x2, vmin);
381 vout3x2 = math_max_f32(vout3x2, vmin);
382 vout0x3 = math_max_f32(vout0x3, vmin);
383 vout1x3 = math_max_f32(vout1x3, vmin);
384 vout2x3 = math_max_f32(vout2x3, vmin);
385 vout3x3 = math_max_f32(vout3x3, vmin);
386 output[0] = vout0x0;
387 output[1] = vout1x0;
388 output[2] = vout2x0;
389 output[3] = vout3x0;
390 output = (float*restrict) ((uintptr_t) output + output_stride);
391 output[0] = vout0x1;
392 output[1] = vout1x1;
393 output[2] = vout2x1;
394 output[3] = vout3x1;
395 output = (float*restrict) ((uintptr_t) output + output_stride);
396 output[0] = vout0x2;
397 output[1] = vout1x2;
398 output[2] = vout2x2;
399 output[3] = vout3x2;
400 output = (float*restrict) ((uintptr_t) output + output_stride);
401 output[0] = vout0x3;
402 output[1] = vout1x3;
403 output[2] = vout2x3;
404 output[3] = vout3x3;
405 output = (float*restrict) ((uintptr_t) output + output_stride);
406 n -= 4;
407 }
408 if XNN_UNLIKELY(n != 0) {
409 do {
410 uint32_t nnz = *nnzmap++;
411 float vacc0 = *w++;
412 float vacc1 = vacc0;
413 float vacc2 = vacc0;
414 float vacc3 = vacc0;
415 if XNN_LIKELY(nnz != 0) {
416 do {
417 const intptr_t diff = *dmap++;
418 const float vi0 = input[0];
419 const float vi1 = input[1];
420 const float vi2 = input[2];
421 const float vi3 = input[3];
422 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
423 const float vw = *w++;
424 vacc0 += vi0 * vw;
425 vacc1 += vi1 * vw;
426 vacc2 += vi2 * vw;
427 vacc3 += vi3 * vw;
428 } while (--nnz != 0);
429 }
430 float vout0 = math_min_f32(vacc0, vmax);
431 float vout1 = math_min_f32(vacc1, vmax);
432 float vout2 = math_min_f32(vacc2, vmax);
433 float vout3 = math_min_f32(vacc3, vmax);
434 vout0 = math_max_f32(vout0, vmin);
435 vout1 = math_max_f32(vout1, vmin);
436 vout2 = math_max_f32(vout2, vmin);
437 vout3 = math_max_f32(vout3, vmin);
438 output[0] = vout0;
439 output[1] = vout1;
440 output[2] = vout2;
441 output[3] = vout3;
442 output = (float*restrict) ((uintptr_t) output + output_stride);
443 n -= 1;
444 } while (n != 0);
445 }
446 output = (float*restrict) ((uintptr_t) output - output_decrement);
447 input += 4;
448 }
449 output_decrement += 2 * sizeof(float);
450 if (mc & (2 * sizeof(float))) {
451 const float*restrict w = weights;
452 const int32_t* dmap = widx_dmap;
453 const uint32_t* nnzmap = nidx_nnzmap;
454 size_t n = nc;
455 while (n >= 4) {
456 uint32_t nnz = *nnzmap++;
457 float vacc0x0 = *w++;
458 float vacc1x0 = vacc0x0;
459 float vacc0x1 = *w++;
460 float vacc1x1 = vacc0x1;
461 float vacc0x2 = *w++;
462 float vacc1x2 = vacc0x2;
463 float vacc0x3 = *w++;
464 float vacc1x3 = vacc0x3;
465 if XNN_LIKELY(nnz != 0) {
466 do {
467 const intptr_t diff = *dmap++;
468 const float vi0 = input[0];
469 const float vi1 = input[1];
470 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
471 const float vw0 = *w++;
472 const float vw1 = *w++;
473 const float vw2 = *w++;
474 const float vw3 = *w++;
475 vacc0x0 += vi0 * vw0;
476 vacc1x0 += vi1 * vw0;
477 vacc0x1 += vi0 * vw1;
478 vacc1x1 += vi1 * vw1;
479 vacc0x2 += vi0 * vw2;
480 vacc1x2 += vi1 * vw2;
481 vacc0x3 += vi0 * vw3;
482 vacc1x3 += vi1 * vw3;
483 } while (--nnz != 0);
484 }
485 float vout0x0 = math_min_f32(vacc0x0, vmax);
486 float vout1x0 = math_min_f32(vacc1x0, vmax);
487 float vout0x1 = math_min_f32(vacc0x1, vmax);
488 float vout1x1 = math_min_f32(vacc1x1, vmax);
489 float vout0x2 = math_min_f32(vacc0x2, vmax);
490 float vout1x2 = math_min_f32(vacc1x2, vmax);
491 float vout0x3 = math_min_f32(vacc0x3, vmax);
492 float vout1x3 = math_min_f32(vacc1x3, vmax);
493 vout0x0 = math_max_f32(vout0x0, vmin);
494 vout1x0 = math_max_f32(vout1x0, vmin);
495 vout0x1 = math_max_f32(vout0x1, vmin);
496 vout1x1 = math_max_f32(vout1x1, vmin);
497 vout0x2 = math_max_f32(vout0x2, vmin);
498 vout1x2 = math_max_f32(vout1x2, vmin);
499 vout0x3 = math_max_f32(vout0x3, vmin);
500 vout1x3 = math_max_f32(vout1x3, vmin);
501 output[0] = vout0x0;
502 output[1] = vout1x0;
503 output = (float*restrict) ((uintptr_t) output + output_stride);
504 output[0] = vout0x1;
505 output[1] = vout1x1;
506 output = (float*restrict) ((uintptr_t) output + output_stride);
507 output[0] = vout0x2;
508 output[1] = vout1x2;
509 output = (float*restrict) ((uintptr_t) output + output_stride);
510 output[0] = vout0x3;
511 output[1] = vout1x3;
512 output = (float*restrict) ((uintptr_t) output + output_stride);
513 n -= 4;
514 }
515 if XNN_UNLIKELY(n != 0) {
516 do {
517 uint32_t nnz = *nnzmap++;
518 float vacc0 = *w++;
519 float vacc1 = vacc0;
520 if XNN_LIKELY(nnz != 0) {
521 do {
522 const intptr_t diff = *dmap++;
523 const float vi0 = input[0];
524 const float vi1 = input[1];
525 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
526 const float vw = *w++;
527 vacc0 += vi0 * vw;
528 vacc1 += vi1 * vw;
529 } while (--nnz != 0);
530 }
531 float vout0 = math_min_f32(vacc0, vmax);
532 float vout1 = math_min_f32(vacc1, vmax);
533 vout0 = math_max_f32(vout0, vmin);
534 vout1 = math_max_f32(vout1, vmin);
535 output[0] = vout0;
536 output[1] = vout1;
537 output = (float*restrict) ((uintptr_t) output + output_stride);
538 n -= 1;
539 } while (n != 0);
540 }
541 output = (float*restrict) ((uintptr_t) output - output_decrement);
542 input += 2;
543 }
544 output_decrement += 1 * sizeof(float);
545 if (mc & (1 * sizeof(float))) {
546 const float*restrict w = weights;
547 const int32_t* dmap = widx_dmap;
548 const uint32_t* nnzmap = nidx_nnzmap;
549 size_t n = nc;
550 while (n >= 4) {
551 uint32_t nnz = *nnzmap++;
552 float vacc0x0 = *w++;
553 float vacc0x1 = *w++;
554 float vacc0x2 = *w++;
555 float vacc0x3 = *w++;
556 if XNN_LIKELY(nnz != 0) {
557 do {
558 const intptr_t diff = *dmap++;
559 const float vi0 = input[0];
560 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
561 const float vw0 = *w++;
562 const float vw1 = *w++;
563 const float vw2 = *w++;
564 const float vw3 = *w++;
565 vacc0x0 += vi0 * vw0;
566 vacc0x1 += vi0 * vw1;
567 vacc0x2 += vi0 * vw2;
568 vacc0x3 += vi0 * vw3;
569 } while (--nnz != 0);
570 }
571 float vout0x0 = math_min_f32(vacc0x0, vmax);
572 float vout0x1 = math_min_f32(vacc0x1, vmax);
573 float vout0x2 = math_min_f32(vacc0x2, vmax);
574 float vout0x3 = math_min_f32(vacc0x3, vmax);
575 vout0x0 = math_max_f32(vout0x0, vmin);
576 vout0x1 = math_max_f32(vout0x1, vmin);
577 vout0x2 = math_max_f32(vout0x2, vmin);
578 vout0x3 = math_max_f32(vout0x3, vmin);
579 output[0] = vout0x0;
580 output = (float*restrict) ((uintptr_t) output + output_stride);
581 output[0] = vout0x1;
582 output = (float*restrict) ((uintptr_t) output + output_stride);
583 output[0] = vout0x2;
584 output = (float*restrict) ((uintptr_t) output + output_stride);
585 output[0] = vout0x3;
586 output = (float*restrict) ((uintptr_t) output + output_stride);
587 n -= 4;
588 }
589 if XNN_UNLIKELY(n != 0) {
590 do {
591 uint32_t nnz = *nnzmap++;
592 float vacc0 = *w++;
593 if XNN_LIKELY(nnz != 0) {
594 do {
595 const intptr_t diff = *dmap++;
596 const float vi0 = input[0];
597 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
598 const float vw = *w++;
599 vacc0 += vi0 * vw;
600 } while (--nnz != 0);
601 }
602 float vout0 = math_min_f32(vacc0, vmax);
603 vout0 = math_max_f32(vout0, vmin);
604 output[0] = vout0;
605 output = (float*restrict) ((uintptr_t) output + output_stride);
606 n -= 1;
607 } while (n != 0);
608 }
609 output = (float*restrict) ((uintptr_t) output - output_decrement);
610 input += 1;
611 }
612 }
613 }
614