1 // Auto-generated file. Do not edit!
2 // Template: src/f32-spmm/scalar.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <xnnpack/math.h>
13 #include <xnnpack/spmm.h>
14
15
xnn_f32_spmm_minmax_ukernel_8x2__scalar(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_spmm_minmax_ukernel_8x2__scalar(
17 size_t mc,
18 size_t nc,
19 const float*restrict input,
20 const float*restrict weights,
21 const int32_t*restrict widx_dmap,
22 const uint32_t*restrict nidx_nnzmap,
23 float*restrict output,
24 size_t output_stride,
25 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27 assert(mc != 0);
28 assert(mc % sizeof(float) == 0);
29 assert(nc != 0);
30
31 const float vmin = params->scalar.min;
32 const float vmax = params->scalar.max;
33 size_t output_decrement = output_stride * nc - 8 * sizeof(float);
34 while (mc >= 8 * sizeof(float)) {
35 const float*restrict w = weights;
36 const int32_t* dmap = widx_dmap;
37 const uint32_t* nnzmap = nidx_nnzmap;
38 size_t n = nc;
39 while (n >= 2) {
40 uint32_t nnz = *nnzmap++;
41 float vacc0x0 = *w++;
42 float vacc1x0 = vacc0x0;
43 float vacc2x0 = vacc0x0;
44 float vacc3x0 = vacc0x0;
45 float vacc4x0 = vacc0x0;
46 float vacc5x0 = vacc0x0;
47 float vacc6x0 = vacc0x0;
48 float vacc7x0 = vacc0x0;
49 float vacc0x1 = *w++;
50 float vacc1x1 = vacc0x1;
51 float vacc2x1 = vacc0x1;
52 float vacc3x1 = vacc0x1;
53 float vacc4x1 = vacc0x1;
54 float vacc5x1 = vacc0x1;
55 float vacc6x1 = vacc0x1;
56 float vacc7x1 = vacc0x1;
57 if XNN_LIKELY(nnz != 0) {
58 do {
59 const intptr_t diff = *dmap++;
60 const float vi0 = input[0];
61 const float vi1 = input[1];
62 const float vi2 = input[2];
63 const float vi3 = input[3];
64 const float vi4 = input[4];
65 const float vi5 = input[5];
66 const float vi6 = input[6];
67 const float vi7 = input[7];
68 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
69 const float vw0 = *w++;
70 const float vw1 = *w++;
71 vacc0x0 += vi0 * vw0;
72 vacc1x0 += vi1 * vw0;
73 vacc2x0 += vi2 * vw0;
74 vacc3x0 += vi3 * vw0;
75 vacc4x0 += vi4 * vw0;
76 vacc5x0 += vi5 * vw0;
77 vacc6x0 += vi6 * vw0;
78 vacc7x0 += vi7 * vw0;
79 vacc0x1 += vi0 * vw1;
80 vacc1x1 += vi1 * vw1;
81 vacc2x1 += vi2 * vw1;
82 vacc3x1 += vi3 * vw1;
83 vacc4x1 += vi4 * vw1;
84 vacc5x1 += vi5 * vw1;
85 vacc6x1 += vi6 * vw1;
86 vacc7x1 += vi7 * vw1;
87 } while (--nnz != 0);
88 }
89 float vout0x0 = math_min_f32(vacc0x0, vmax);
90 float vout1x0 = math_min_f32(vacc1x0, vmax);
91 float vout2x0 = math_min_f32(vacc2x0, vmax);
92 float vout3x0 = math_min_f32(vacc3x0, vmax);
93 float vout4x0 = math_min_f32(vacc4x0, vmax);
94 float vout5x0 = math_min_f32(vacc5x0, vmax);
95 float vout6x0 = math_min_f32(vacc6x0, vmax);
96 float vout7x0 = math_min_f32(vacc7x0, vmax);
97 float vout0x1 = math_min_f32(vacc0x1, vmax);
98 float vout1x1 = math_min_f32(vacc1x1, vmax);
99 float vout2x1 = math_min_f32(vacc2x1, vmax);
100 float vout3x1 = math_min_f32(vacc3x1, vmax);
101 float vout4x1 = math_min_f32(vacc4x1, vmax);
102 float vout5x1 = math_min_f32(vacc5x1, vmax);
103 float vout6x1 = math_min_f32(vacc6x1, vmax);
104 float vout7x1 = math_min_f32(vacc7x1, vmax);
105 vout0x0 = math_max_f32(vout0x0, vmin);
106 vout1x0 = math_max_f32(vout1x0, vmin);
107 vout2x0 = math_max_f32(vout2x0, vmin);
108 vout3x0 = math_max_f32(vout3x0, vmin);
109 vout4x0 = math_max_f32(vout4x0, vmin);
110 vout5x0 = math_max_f32(vout5x0, vmin);
111 vout6x0 = math_max_f32(vout6x0, vmin);
112 vout7x0 = math_max_f32(vout7x0, vmin);
113 vout0x1 = math_max_f32(vout0x1, vmin);
114 vout1x1 = math_max_f32(vout1x1, vmin);
115 vout2x1 = math_max_f32(vout2x1, vmin);
116 vout3x1 = math_max_f32(vout3x1, vmin);
117 vout4x1 = math_max_f32(vout4x1, vmin);
118 vout5x1 = math_max_f32(vout5x1, vmin);
119 vout6x1 = math_max_f32(vout6x1, vmin);
120 vout7x1 = math_max_f32(vout7x1, vmin);
121 output[0] = vout0x1;
122 output[1] = vout1x1;
123 output[2] = vout2x1;
124 output[3] = vout3x1;
125 output[4] = vout4x1;
126 output[5] = vout5x1;
127 output[6] = vout6x1;
128 output[7] = vout7x1;
129 output[0] = vout0x0;
130 output[1] = vout1x0;
131 output[2] = vout2x0;
132 output[3] = vout3x0;
133 output[4] = vout4x0;
134 output[5] = vout5x0;
135 output[6] = vout6x0;
136 output[7] = vout7x0;
137 output = (float*restrict) ((uintptr_t) output + output_stride);
138 output[0] = vout0x1;
139 output[1] = vout1x1;
140 output[2] = vout2x1;
141 output[3] = vout3x1;
142 output[4] = vout4x1;
143 output[5] = vout5x1;
144 output[6] = vout6x1;
145 output[7] = vout7x1;
146 output = (float*restrict) ((uintptr_t) output + output_stride);
147 n -= 2;
148 }
149 if XNN_UNLIKELY(n != 0) {
150 do {
151 uint32_t nnz = *nnzmap++;
152 float vacc0 = *w++;
153 float vacc1 = vacc0;
154 float vacc2 = vacc0;
155 float vacc3 = vacc0;
156 float vacc4 = vacc0;
157 float vacc5 = vacc0;
158 float vacc6 = vacc0;
159 float vacc7 = vacc0;
160 if XNN_LIKELY(nnz != 0) {
161 do {
162 const intptr_t diff = *dmap++;
163 const float vi0 = input[0];
164 const float vi1 = input[1];
165 const float vi2 = input[2];
166 const float vi3 = input[3];
167 const float vi4 = input[4];
168 const float vi5 = input[5];
169 const float vi6 = input[6];
170 const float vi7 = input[7];
171 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
172 const float vw = *w++;
173 vacc0 += vi0 * vw;
174 vacc1 += vi1 * vw;
175 vacc2 += vi2 * vw;
176 vacc3 += vi3 * vw;
177 vacc4 += vi4 * vw;
178 vacc5 += vi5 * vw;
179 vacc6 += vi6 * vw;
180 vacc7 += vi7 * vw;
181 } while (--nnz != 0);
182 }
183 float vout0 = math_min_f32(vacc0, vmax);
184 float vout1 = math_min_f32(vacc1, vmax);
185 float vout2 = math_min_f32(vacc2, vmax);
186 float vout3 = math_min_f32(vacc3, vmax);
187 float vout4 = math_min_f32(vacc4, vmax);
188 float vout5 = math_min_f32(vacc5, vmax);
189 float vout6 = math_min_f32(vacc6, vmax);
190 float vout7 = math_min_f32(vacc7, vmax);
191 vout0 = math_max_f32(vout0, vmin);
192 vout1 = math_max_f32(vout1, vmin);
193 vout2 = math_max_f32(vout2, vmin);
194 vout3 = math_max_f32(vout3, vmin);
195 vout4 = math_max_f32(vout4, vmin);
196 vout5 = math_max_f32(vout5, vmin);
197 vout6 = math_max_f32(vout6, vmin);
198 vout7 = math_max_f32(vout7, vmin);
199 output[0] = vout0;
200 output[1] = vout1;
201 output[2] = vout2;
202 output[3] = vout3;
203 output[4] = vout4;
204 output[5] = vout5;
205 output[6] = vout6;
206 output[7] = vout7;
207 output = (float*restrict) ((uintptr_t) output + output_stride);
208 n -= 1;
209 } while (n != 0);
210 }
211 output = (float*restrict) ((uintptr_t) output - output_decrement);
212 input += 8;
213 mc -= 8 * sizeof(float);
214 }
215 if XNN_UNLIKELY(mc != 0) {
216 output_decrement += 4 * sizeof(float);
217 if (mc & (4 * sizeof(float))) {
218 const float*restrict w = weights;
219 const int32_t* dmap = widx_dmap;
220 const uint32_t* nnzmap = nidx_nnzmap;
221 size_t n = nc;
222 while (n >= 2) {
223 uint32_t nnz = *nnzmap++;
224 float vacc0x0 = *w++;
225 float vacc1x0 = vacc0x0;
226 float vacc2x0 = vacc0x0;
227 float vacc3x0 = vacc0x0;
228 float vacc0x1 = *w++;
229 float vacc1x1 = vacc0x1;
230 float vacc2x1 = vacc0x1;
231 float vacc3x1 = vacc0x1;
232 if XNN_LIKELY(nnz != 0) {
233 do {
234 const intptr_t diff = *dmap++;
235 const float vi0 = input[0];
236 const float vi1 = input[1];
237 const float vi2 = input[2];
238 const float vi3 = input[3];
239 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
240 const float vw0 = *w++;
241 const float vw1 = *w++;
242 vacc0x0 += vi0 * vw0;
243 vacc1x0 += vi1 * vw0;
244 vacc2x0 += vi2 * vw0;
245 vacc3x0 += vi3 * vw0;
246 vacc0x1 += vi0 * vw1;
247 vacc1x1 += vi1 * vw1;
248 vacc2x1 += vi2 * vw1;
249 vacc3x1 += vi3 * vw1;
250 } while (--nnz != 0);
251 }
252 float vout0x0 = math_min_f32(vacc0x0, vmax);
253 float vout1x0 = math_min_f32(vacc1x0, vmax);
254 float vout2x0 = math_min_f32(vacc2x0, vmax);
255 float vout3x0 = math_min_f32(vacc3x0, vmax);
256 float vout0x1 = math_min_f32(vacc0x1, vmax);
257 float vout1x1 = math_min_f32(vacc1x1, vmax);
258 float vout2x1 = math_min_f32(vacc2x1, vmax);
259 float vout3x1 = math_min_f32(vacc3x1, vmax);
260 vout0x0 = math_max_f32(vout0x0, vmin);
261 vout1x0 = math_max_f32(vout1x0, vmin);
262 vout2x0 = math_max_f32(vout2x0, vmin);
263 vout3x0 = math_max_f32(vout3x0, vmin);
264 vout0x1 = math_max_f32(vout0x1, vmin);
265 vout1x1 = math_max_f32(vout1x1, vmin);
266 vout2x1 = math_max_f32(vout2x1, vmin);
267 vout3x1 = math_max_f32(vout3x1, vmin);
268 output[0] = vout0x0;
269 output[1] = vout1x0;
270 output[2] = vout2x0;
271 output[3] = vout3x0;
272 output = (float*restrict) ((uintptr_t) output + output_stride);
273 output[0] = vout0x1;
274 output[1] = vout1x1;
275 output[2] = vout2x1;
276 output[3] = vout3x1;
277 output = (float*restrict) ((uintptr_t) output + output_stride);
278 n -= 2;
279 }
280 if XNN_UNLIKELY(n != 0) {
281 do {
282 uint32_t nnz = *nnzmap++;
283 float vacc0 = *w++;
284 float vacc1 = vacc0;
285 float vacc2 = vacc0;
286 float vacc3 = vacc0;
287 if XNN_LIKELY(nnz != 0) {
288 do {
289 const intptr_t diff = *dmap++;
290 const float vi0 = input[0];
291 const float vi1 = input[1];
292 const float vi2 = input[2];
293 const float vi3 = input[3];
294 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
295 const float vw = *w++;
296 vacc0 += vi0 * vw;
297 vacc1 += vi1 * vw;
298 vacc2 += vi2 * vw;
299 vacc3 += vi3 * vw;
300 } while (--nnz != 0);
301 }
302 float vout0 = math_min_f32(vacc0, vmax);
303 float vout1 = math_min_f32(vacc1, vmax);
304 float vout2 = math_min_f32(vacc2, vmax);
305 float vout3 = math_min_f32(vacc3, vmax);
306 vout0 = math_max_f32(vout0, vmin);
307 vout1 = math_max_f32(vout1, vmin);
308 vout2 = math_max_f32(vout2, vmin);
309 vout3 = math_max_f32(vout3, vmin);
310 output[0] = vout0;
311 output[1] = vout1;
312 output[2] = vout2;
313 output[3] = vout3;
314 output = (float*restrict) ((uintptr_t) output + output_stride);
315 n -= 1;
316 } while (n != 0);
317 }
318 output = (float*restrict) ((uintptr_t) output - output_decrement);
319 input += 4;
320 }
321 output_decrement += 2 * sizeof(float);
322 if (mc & (2 * sizeof(float))) {
323 const float*restrict w = weights;
324 const int32_t* dmap = widx_dmap;
325 const uint32_t* nnzmap = nidx_nnzmap;
326 size_t n = nc;
327 while (n >= 2) {
328 uint32_t nnz = *nnzmap++;
329 float vacc0x0 = *w++;
330 float vacc1x0 = vacc0x0;
331 float vacc0x1 = *w++;
332 float vacc1x1 = vacc0x1;
333 if XNN_LIKELY(nnz != 0) {
334 do {
335 const intptr_t diff = *dmap++;
336 const float vi0 = input[0];
337 const float vi1 = input[1];
338 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
339 const float vw0 = *w++;
340 const float vw1 = *w++;
341 vacc0x0 += vi0 * vw0;
342 vacc1x0 += vi1 * vw0;
343 vacc0x1 += vi0 * vw1;
344 vacc1x1 += vi1 * vw1;
345 } while (--nnz != 0);
346 }
347 float vout0x0 = math_min_f32(vacc0x0, vmax);
348 float vout1x0 = math_min_f32(vacc1x0, vmax);
349 float vout0x1 = math_min_f32(vacc0x1, vmax);
350 float vout1x1 = math_min_f32(vacc1x1, vmax);
351 vout0x0 = math_max_f32(vout0x0, vmin);
352 vout1x0 = math_max_f32(vout1x0, vmin);
353 vout0x1 = math_max_f32(vout0x1, vmin);
354 vout1x1 = math_max_f32(vout1x1, vmin);
355 output[0] = vout0x0;
356 output[1] = vout1x0;
357 output = (float*restrict) ((uintptr_t) output + output_stride);
358 output[0] = vout0x1;
359 output[1] = vout1x1;
360 output = (float*restrict) ((uintptr_t) output + output_stride);
361 n -= 2;
362 }
363 if XNN_UNLIKELY(n != 0) {
364 do {
365 uint32_t nnz = *nnzmap++;
366 float vacc0 = *w++;
367 float vacc1 = vacc0;
368 if XNN_LIKELY(nnz != 0) {
369 do {
370 const intptr_t diff = *dmap++;
371 const float vi0 = input[0];
372 const float vi1 = input[1];
373 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
374 const float vw = *w++;
375 vacc0 += vi0 * vw;
376 vacc1 += vi1 * vw;
377 } while (--nnz != 0);
378 }
379 float vout0 = math_min_f32(vacc0, vmax);
380 float vout1 = math_min_f32(vacc1, vmax);
381 vout0 = math_max_f32(vout0, vmin);
382 vout1 = math_max_f32(vout1, vmin);
383 output[0] = vout0;
384 output[1] = vout1;
385 output = (float*restrict) ((uintptr_t) output + output_stride);
386 n -= 1;
387 } while (n != 0);
388 }
389 output = (float*restrict) ((uintptr_t) output - output_decrement);
390 input += 2;
391 }
392 output_decrement += 1 * sizeof(float);
393 if (mc & (1 * sizeof(float))) {
394 const float*restrict w = weights;
395 const int32_t* dmap = widx_dmap;
396 const uint32_t* nnzmap = nidx_nnzmap;
397 size_t n = nc;
398 while (n >= 2) {
399 uint32_t nnz = *nnzmap++;
400 float vacc0x0 = *w++;
401 float vacc0x1 = *w++;
402 if XNN_LIKELY(nnz != 0) {
403 do {
404 const intptr_t diff = *dmap++;
405 const float vi0 = input[0];
406 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
407 const float vw0 = *w++;
408 const float vw1 = *w++;
409 vacc0x0 += vi0 * vw0;
410 vacc0x1 += vi0 * vw1;
411 } while (--nnz != 0);
412 }
413 float vout0x0 = math_min_f32(vacc0x0, vmax);
414 float vout0x1 = math_min_f32(vacc0x1, vmax);
415 vout0x0 = math_max_f32(vout0x0, vmin);
416 vout0x1 = math_max_f32(vout0x1, vmin);
417 output[0] = vout0x0;
418 output = (float*restrict) ((uintptr_t) output + output_stride);
419 output[0] = vout0x1;
420 output = (float*restrict) ((uintptr_t) output + output_stride);
421 n -= 2;
422 }
423 if XNN_UNLIKELY(n != 0) {
424 do {
425 uint32_t nnz = *nnzmap++;
426 float vacc0 = *w++;
427 if XNN_LIKELY(nnz != 0) {
428 do {
429 const intptr_t diff = *dmap++;
430 const float vi0 = input[0];
431 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
432 const float vw = *w++;
433 vacc0 += vi0 * vw;
434 } while (--nnz != 0);
435 }
436 float vout0 = math_min_f32(vacc0, vmax);
437 vout0 = math_max_f32(vout0, vmin);
438 output[0] = vout0;
439 output = (float*restrict) ((uintptr_t) output + output_stride);
440 n -= 1;
441 } while (n != 0);
442 }
443 output = (float*restrict) ((uintptr_t) output - output_decrement);
444 input += 1;
445 }
446 }
447 }
448