xref: /aosp_15_r20/external/XNNPACK/src/f32-spmm/gen/8x2-minmax-scalar.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-spmm/scalar.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <xnnpack/math.h>
13 #include <xnnpack/spmm.h>
14 
15 
xnn_f32_spmm_minmax_ukernel_8x2__scalar(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_spmm_minmax_ukernel_8x2__scalar(
17     size_t mc,
18     size_t nc,
19     const float*restrict input,
20     const float*restrict weights,
21     const int32_t*restrict widx_dmap,
22     const uint32_t*restrict nidx_nnzmap,
23     float*restrict output,
24     size_t output_stride,
25     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27   assert(mc != 0);
28   assert(mc % sizeof(float) == 0);
29   assert(nc != 0);
30 
31   const float vmin = params->scalar.min;
32   const float vmax = params->scalar.max;
33   size_t output_decrement = output_stride * nc - 8 * sizeof(float);
34   while (mc >= 8 * sizeof(float)) {
35     const float*restrict w = weights;
36     const int32_t* dmap = widx_dmap;
37     const uint32_t* nnzmap = nidx_nnzmap;
38     size_t n = nc;
39     while (n >= 2) {
40       uint32_t nnz = *nnzmap++;
41       float vacc0x0 = *w++;
42       float vacc1x0 = vacc0x0;
43       float vacc2x0 = vacc0x0;
44       float vacc3x0 = vacc0x0;
45       float vacc4x0 = vacc0x0;
46       float vacc5x0 = vacc0x0;
47       float vacc6x0 = vacc0x0;
48       float vacc7x0 = vacc0x0;
49       float vacc0x1 = *w++;
50       float vacc1x1 = vacc0x1;
51       float vacc2x1 = vacc0x1;
52       float vacc3x1 = vacc0x1;
53       float vacc4x1 = vacc0x1;
54       float vacc5x1 = vacc0x1;
55       float vacc6x1 = vacc0x1;
56       float vacc7x1 = vacc0x1;
57       if XNN_LIKELY(nnz != 0) {
58         do {
59           const intptr_t diff = *dmap++;
60           const float vi0 = input[0];
61           const float vi1 = input[1];
62           const float vi2 = input[2];
63           const float vi3 = input[3];
64           const float vi4 = input[4];
65           const float vi5 = input[5];
66           const float vi6 = input[6];
67           const float vi7 = input[7];
68           input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
69           const float vw0 = *w++;
70           const float vw1 = *w++;
71           vacc0x0 += vi0 * vw0;
72           vacc1x0 += vi1 * vw0;
73           vacc2x0 += vi2 * vw0;
74           vacc3x0 += vi3 * vw0;
75           vacc4x0 += vi4 * vw0;
76           vacc5x0 += vi5 * vw0;
77           vacc6x0 += vi6 * vw0;
78           vacc7x0 += vi7 * vw0;
79           vacc0x1 += vi0 * vw1;
80           vacc1x1 += vi1 * vw1;
81           vacc2x1 += vi2 * vw1;
82           vacc3x1 += vi3 * vw1;
83           vacc4x1 += vi4 * vw1;
84           vacc5x1 += vi5 * vw1;
85           vacc6x1 += vi6 * vw1;
86           vacc7x1 += vi7 * vw1;
87         } while (--nnz != 0);
88       }
89       float vout0x0 = math_min_f32(vacc0x0, vmax);
90       float vout1x0 = math_min_f32(vacc1x0, vmax);
91       float vout2x0 = math_min_f32(vacc2x0, vmax);
92       float vout3x0 = math_min_f32(vacc3x0, vmax);
93       float vout4x0 = math_min_f32(vacc4x0, vmax);
94       float vout5x0 = math_min_f32(vacc5x0, vmax);
95       float vout6x0 = math_min_f32(vacc6x0, vmax);
96       float vout7x0 = math_min_f32(vacc7x0, vmax);
97       float vout0x1 = math_min_f32(vacc0x1, vmax);
98       float vout1x1 = math_min_f32(vacc1x1, vmax);
99       float vout2x1 = math_min_f32(vacc2x1, vmax);
100       float vout3x1 = math_min_f32(vacc3x1, vmax);
101       float vout4x1 = math_min_f32(vacc4x1, vmax);
102       float vout5x1 = math_min_f32(vacc5x1, vmax);
103       float vout6x1 = math_min_f32(vacc6x1, vmax);
104       float vout7x1 = math_min_f32(vacc7x1, vmax);
105       vout0x0 = math_max_f32(vout0x0, vmin);
106       vout1x0 = math_max_f32(vout1x0, vmin);
107       vout2x0 = math_max_f32(vout2x0, vmin);
108       vout3x0 = math_max_f32(vout3x0, vmin);
109       vout4x0 = math_max_f32(vout4x0, vmin);
110       vout5x0 = math_max_f32(vout5x0, vmin);
111       vout6x0 = math_max_f32(vout6x0, vmin);
112       vout7x0 = math_max_f32(vout7x0, vmin);
113       vout0x1 = math_max_f32(vout0x1, vmin);
114       vout1x1 = math_max_f32(vout1x1, vmin);
115       vout2x1 = math_max_f32(vout2x1, vmin);
116       vout3x1 = math_max_f32(vout3x1, vmin);
117       vout4x1 = math_max_f32(vout4x1, vmin);
118       vout5x1 = math_max_f32(vout5x1, vmin);
119       vout6x1 = math_max_f32(vout6x1, vmin);
120       vout7x1 = math_max_f32(vout7x1, vmin);
121       output[0] = vout0x1;
122       output[1] = vout1x1;
123       output[2] = vout2x1;
124       output[3] = vout3x1;
125       output[4] = vout4x1;
126       output[5] = vout5x1;
127       output[6] = vout6x1;
128       output[7] = vout7x1;
129       output[0] = vout0x0;
130       output[1] = vout1x0;
131       output[2] = vout2x0;
132       output[3] = vout3x0;
133       output[4] = vout4x0;
134       output[5] = vout5x0;
135       output[6] = vout6x0;
136       output[7] = vout7x0;
137       output = (float*restrict) ((uintptr_t) output + output_stride);
138       output[0] = vout0x1;
139       output[1] = vout1x1;
140       output[2] = vout2x1;
141       output[3] = vout3x1;
142       output[4] = vout4x1;
143       output[5] = vout5x1;
144       output[6] = vout6x1;
145       output[7] = vout7x1;
146       output = (float*restrict) ((uintptr_t) output + output_stride);
147       n -= 2;
148     }
149     if XNN_UNLIKELY(n != 0) {
150       do {
151         uint32_t nnz = *nnzmap++;
152         float vacc0 = *w++;
153         float vacc1 = vacc0;
154         float vacc2 = vacc0;
155         float vacc3 = vacc0;
156         float vacc4 = vacc0;
157         float vacc5 = vacc0;
158         float vacc6 = vacc0;
159         float vacc7 = vacc0;
160         if XNN_LIKELY(nnz != 0) {
161           do {
162             const intptr_t diff = *dmap++;
163             const float vi0 = input[0];
164             const float vi1 = input[1];
165             const float vi2 = input[2];
166             const float vi3 = input[3];
167             const float vi4 = input[4];
168             const float vi5 = input[5];
169             const float vi6 = input[6];
170             const float vi7 = input[7];
171             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
172             const float vw = *w++;
173             vacc0 += vi0 * vw;
174             vacc1 += vi1 * vw;
175             vacc2 += vi2 * vw;
176             vacc3 += vi3 * vw;
177             vacc4 += vi4 * vw;
178             vacc5 += vi5 * vw;
179             vacc6 += vi6 * vw;
180             vacc7 += vi7 * vw;
181           } while (--nnz != 0);
182         }
183         float vout0 = math_min_f32(vacc0, vmax);
184         float vout1 = math_min_f32(vacc1, vmax);
185         float vout2 = math_min_f32(vacc2, vmax);
186         float vout3 = math_min_f32(vacc3, vmax);
187         float vout4 = math_min_f32(vacc4, vmax);
188         float vout5 = math_min_f32(vacc5, vmax);
189         float vout6 = math_min_f32(vacc6, vmax);
190         float vout7 = math_min_f32(vacc7, vmax);
191         vout0 = math_max_f32(vout0, vmin);
192         vout1 = math_max_f32(vout1, vmin);
193         vout2 = math_max_f32(vout2, vmin);
194         vout3 = math_max_f32(vout3, vmin);
195         vout4 = math_max_f32(vout4, vmin);
196         vout5 = math_max_f32(vout5, vmin);
197         vout6 = math_max_f32(vout6, vmin);
198         vout7 = math_max_f32(vout7, vmin);
199         output[0] = vout0;
200         output[1] = vout1;
201         output[2] = vout2;
202         output[3] = vout3;
203         output[4] = vout4;
204         output[5] = vout5;
205         output[6] = vout6;
206         output[7] = vout7;
207         output = (float*restrict) ((uintptr_t) output + output_stride);
208         n -= 1;
209       } while (n != 0);
210     }
211     output = (float*restrict) ((uintptr_t) output - output_decrement);
212     input += 8;
213     mc -= 8 * sizeof(float);
214   }
215   if XNN_UNLIKELY(mc != 0) {
216     output_decrement += 4 * sizeof(float);
217     if (mc & (4 * sizeof(float))) {
218       const float*restrict w = weights;
219       const int32_t* dmap = widx_dmap;
220       const uint32_t* nnzmap = nidx_nnzmap;
221       size_t n = nc;
222       while (n >= 2) {
223         uint32_t nnz = *nnzmap++;
224         float vacc0x0 = *w++;
225         float vacc1x0 = vacc0x0;
226         float vacc2x0 = vacc0x0;
227         float vacc3x0 = vacc0x0;
228         float vacc0x1 = *w++;
229         float vacc1x1 = vacc0x1;
230         float vacc2x1 = vacc0x1;
231         float vacc3x1 = vacc0x1;
232         if XNN_LIKELY(nnz != 0) {
233           do {
234             const intptr_t diff = *dmap++;
235             const float vi0 = input[0];
236             const float vi1 = input[1];
237             const float vi2 = input[2];
238             const float vi3 = input[3];
239             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
240             const float vw0 = *w++;
241             const float vw1 = *w++;
242             vacc0x0 += vi0 * vw0;
243             vacc1x0 += vi1 * vw0;
244             vacc2x0 += vi2 * vw0;
245             vacc3x0 += vi3 * vw0;
246             vacc0x1 += vi0 * vw1;
247             vacc1x1 += vi1 * vw1;
248             vacc2x1 += vi2 * vw1;
249             vacc3x1 += vi3 * vw1;
250           } while (--nnz != 0);
251         }
252         float vout0x0 = math_min_f32(vacc0x0, vmax);
253         float vout1x0 = math_min_f32(vacc1x0, vmax);
254         float vout2x0 = math_min_f32(vacc2x0, vmax);
255         float vout3x0 = math_min_f32(vacc3x0, vmax);
256         float vout0x1 = math_min_f32(vacc0x1, vmax);
257         float vout1x1 = math_min_f32(vacc1x1, vmax);
258         float vout2x1 = math_min_f32(vacc2x1, vmax);
259         float vout3x1 = math_min_f32(vacc3x1, vmax);
260         vout0x0 = math_max_f32(vout0x0, vmin);
261         vout1x0 = math_max_f32(vout1x0, vmin);
262         vout2x0 = math_max_f32(vout2x0, vmin);
263         vout3x0 = math_max_f32(vout3x0, vmin);
264         vout0x1 = math_max_f32(vout0x1, vmin);
265         vout1x1 = math_max_f32(vout1x1, vmin);
266         vout2x1 = math_max_f32(vout2x1, vmin);
267         vout3x1 = math_max_f32(vout3x1, vmin);
268         output[0] = vout0x0;
269         output[1] = vout1x0;
270         output[2] = vout2x0;
271         output[3] = vout3x0;
272         output = (float*restrict) ((uintptr_t) output + output_stride);
273         output[0] = vout0x1;
274         output[1] = vout1x1;
275         output[2] = vout2x1;
276         output[3] = vout3x1;
277         output = (float*restrict) ((uintptr_t) output + output_stride);
278         n -= 2;
279       }
280       if XNN_UNLIKELY(n != 0) {
281         do {
282           uint32_t nnz = *nnzmap++;
283           float vacc0 = *w++;
284           float vacc1 = vacc0;
285           float vacc2 = vacc0;
286           float vacc3 = vacc0;
287           if XNN_LIKELY(nnz != 0) {
288             do {
289               const intptr_t diff = *dmap++;
290               const float vi0 = input[0];
291               const float vi1 = input[1];
292               const float vi2 = input[2];
293               const float vi3 = input[3];
294               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
295               const float vw = *w++;
296               vacc0 += vi0 * vw;
297               vacc1 += vi1 * vw;
298               vacc2 += vi2 * vw;
299               vacc3 += vi3 * vw;
300             } while (--nnz != 0);
301           }
302           float vout0 = math_min_f32(vacc0, vmax);
303           float vout1 = math_min_f32(vacc1, vmax);
304           float vout2 = math_min_f32(vacc2, vmax);
305           float vout3 = math_min_f32(vacc3, vmax);
306           vout0 = math_max_f32(vout0, vmin);
307           vout1 = math_max_f32(vout1, vmin);
308           vout2 = math_max_f32(vout2, vmin);
309           vout3 = math_max_f32(vout3, vmin);
310           output[0] = vout0;
311           output[1] = vout1;
312           output[2] = vout2;
313           output[3] = vout3;
314           output = (float*restrict) ((uintptr_t) output + output_stride);
315           n -= 1;
316         } while (n != 0);
317       }
318       output = (float*restrict) ((uintptr_t) output - output_decrement);
319       input += 4;
320     }
321     output_decrement += 2 * sizeof(float);
322     if (mc & (2 * sizeof(float))) {
323       const float*restrict w = weights;
324       const int32_t* dmap = widx_dmap;
325       const uint32_t* nnzmap = nidx_nnzmap;
326       size_t n = nc;
327       while (n >= 2) {
328         uint32_t nnz = *nnzmap++;
329         float vacc0x0 = *w++;
330         float vacc1x0 = vacc0x0;
331         float vacc0x1 = *w++;
332         float vacc1x1 = vacc0x1;
333         if XNN_LIKELY(nnz != 0) {
334           do {
335             const intptr_t diff = *dmap++;
336             const float vi0 = input[0];
337             const float vi1 = input[1];
338             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
339             const float vw0 = *w++;
340             const float vw1 = *w++;
341             vacc0x0 += vi0 * vw0;
342             vacc1x0 += vi1 * vw0;
343             vacc0x1 += vi0 * vw1;
344             vacc1x1 += vi1 * vw1;
345           } while (--nnz != 0);
346         }
347         float vout0x0 = math_min_f32(vacc0x0, vmax);
348         float vout1x0 = math_min_f32(vacc1x0, vmax);
349         float vout0x1 = math_min_f32(vacc0x1, vmax);
350         float vout1x1 = math_min_f32(vacc1x1, vmax);
351         vout0x0 = math_max_f32(vout0x0, vmin);
352         vout1x0 = math_max_f32(vout1x0, vmin);
353         vout0x1 = math_max_f32(vout0x1, vmin);
354         vout1x1 = math_max_f32(vout1x1, vmin);
355         output[0] = vout0x0;
356         output[1] = vout1x0;
357         output = (float*restrict) ((uintptr_t) output + output_stride);
358         output[0] = vout0x1;
359         output[1] = vout1x1;
360         output = (float*restrict) ((uintptr_t) output + output_stride);
361         n -= 2;
362       }
363       if XNN_UNLIKELY(n != 0) {
364         do {
365           uint32_t nnz = *nnzmap++;
366           float vacc0 = *w++;
367           float vacc1 = vacc0;
368           if XNN_LIKELY(nnz != 0) {
369             do {
370               const intptr_t diff = *dmap++;
371               const float vi0 = input[0];
372               const float vi1 = input[1];
373               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
374               const float vw = *w++;
375               vacc0 += vi0 * vw;
376               vacc1 += vi1 * vw;
377             } while (--nnz != 0);
378           }
379           float vout0 = math_min_f32(vacc0, vmax);
380           float vout1 = math_min_f32(vacc1, vmax);
381           vout0 = math_max_f32(vout0, vmin);
382           vout1 = math_max_f32(vout1, vmin);
383           output[0] = vout0;
384           output[1] = vout1;
385           output = (float*restrict) ((uintptr_t) output + output_stride);
386           n -= 1;
387         } while (n != 0);
388       }
389       output = (float*restrict) ((uintptr_t) output - output_decrement);
390       input += 2;
391     }
392     output_decrement += 1 * sizeof(float);
393     if (mc & (1 * sizeof(float))) {
394       const float*restrict w = weights;
395       const int32_t* dmap = widx_dmap;
396       const uint32_t* nnzmap = nidx_nnzmap;
397       size_t n = nc;
398       while (n >= 2) {
399         uint32_t nnz = *nnzmap++;
400         float vacc0x0 = *w++;
401         float vacc0x1 = *w++;
402         if XNN_LIKELY(nnz != 0) {
403           do {
404             const intptr_t diff = *dmap++;
405             const float vi0 = input[0];
406             input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
407             const float vw0 = *w++;
408             const float vw1 = *w++;
409             vacc0x0 += vi0 * vw0;
410             vacc0x1 += vi0 * vw1;
411           } while (--nnz != 0);
412         }
413         float vout0x0 = math_min_f32(vacc0x0, vmax);
414         float vout0x1 = math_min_f32(vacc0x1, vmax);
415         vout0x0 = math_max_f32(vout0x0, vmin);
416         vout0x1 = math_max_f32(vout0x1, vmin);
417         output[0] = vout0x0;
418         output = (float*restrict) ((uintptr_t) output + output_stride);
419         output[0] = vout0x1;
420         output = (float*restrict) ((uintptr_t) output + output_stride);
421         n -= 2;
422       }
423       if XNN_UNLIKELY(n != 0) {
424         do {
425           uint32_t nnz = *nnzmap++;
426           float vacc0 = *w++;
427           if XNN_LIKELY(nnz != 0) {
428             do {
429               const intptr_t diff = *dmap++;
430               const float vi0 = input[0];
431               input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
432               const float vw = *w++;
433               vacc0 += vi0 * vw;
434             } while (--nnz != 0);
435           }
436           float vout0 = math_min_f32(vacc0, vmax);
437           vout0 = math_max_f32(vout0, vmin);
438           output[0] = vout0;
439           output = (float*restrict) ((uintptr_t) output + output_stride);
440           n -= 1;
441         } while (n != 0);
442       }
443       output = (float*restrict) ((uintptr_t) output - output_decrement);
444       input += 1;
445     }
446   }
447 }
448