// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/spmm.h>

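// SpMM (sparse matrix * dense matrix) microkernel with min/max clamping:
// for each output channel it computes
//   output[m][n] = clamp(sum over nonzeros of weight * input[m] + bias[n])
// producing 32 M-dimension elements per main-loop pass (MR = 32) for one
// output channel at a time (NR = 1), with the nonzero loop unrolled by 2
// (the "x2" suffix). Layout as inferred from the loads below: `weights`
// holds, per channel, the bias followed by that channel's nonzero values;
// `nidx_nnzmap` holds the per-channel nonzero count; `widx_dmap` holds, per
// nonzero, the byte offset from the current input position to the input
// used by the next nonzero. `mc` and `output_stride` are in bytes.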
void xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86_x2(
    size_t mc,
    size_t nc,
    const float*restrict input,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict output,
    size_t output_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(mc != 0);
  assert(mc % sizeof(float) == 0);
  assert(nc != 0);

  const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
  const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
  size_t output_decrement = output_stride * nc - 32 * sizeof(float);
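  // Main loop: process 32 input/output rows (M elements) per iteration.
  // For each output channel, walk that channel's nonzero weights and
  // accumulate weight * input across all 32 rows at once.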
  while XNN_LIKELY(mc >= 32 * sizeof(float)) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t n = nc;
    do {
      uint32_t nnz = *nnzmap++;
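      // Initialize the first accumulator set ("x0") with the channel bias,
      // splatted across all lanes; the second set ("x1") starts at zero and
      // is folded into the first after the unrolled loop.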
      v128_t vacc0123x0 = wasm_v128_load32_splat(w);
      w += 1;
      v128_t vacc0123x1 = wasm_f32x4_const_splat(0.0f);
      v128_t vacc4567x0 = vacc0123x0;
      v128_t vacc4567x1 = wasm_f32x4_const_splat(0.0f);
      v128_t vacc89ABx0 = vacc0123x0;
      v128_t vacc89ABx1 = wasm_f32x4_const_splat(0.0f);
      v128_t vaccCDEFx0 = vacc0123x0;
      v128_t vaccCDEFx1 = wasm_f32x4_const_splat(0.0f);
      v128_t vaccGHIJx0 = vacc0123x0;
      v128_t vaccGHIJx1 = wasm_f32x4_const_splat(0.0f);
      v128_t vaccKLMNx0 = vacc0123x0;
      v128_t vaccKLMNx1 = wasm_f32x4_const_splat(0.0f);
      v128_t vaccOPQRx0 = vacc0123x0;
      v128_t vaccOPQRx1 = wasm_f32x4_const_splat(0.0f);
      v128_t vaccSTUVx0 = vacc0123x0;
      v128_t vaccSTUVx1 = wasm_f32x4_const_splat(0.0f);
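      // Unrolled-by-2 nonzero loop: each of the two nonzeros updates its own
      // accumulator set, splitting the add dependency chain so the two
      // multiply-add streams can execute independently.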
      for (; nnz >= 2; nnz -= 2) {
        const intptr_t diff0 = dmap[0];
        const intptr_t diff1 = dmap[1];
        dmap += 2;
        const v128_t vi0123x0 = wasm_v128_load(input);
        const v128_t vi4567x0 = wasm_v128_load(input + 4);
        const v128_t vi89ABx0 = wasm_v128_load(input + 8);
        const v128_t viCDEFx0 = wasm_v128_load(input + 12);
        const v128_t viGHIJx0 = wasm_v128_load(input + 16);
        const v128_t viKLMNx0 = wasm_v128_load(input + 20);
        const v128_t viOPQRx0 = wasm_v128_load(input + 24);
        const v128_t viSTUVx0 = wasm_v128_load(input + 28);
        input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
        const v128_t vw0 = wasm_v128_load32_splat(w);
        w += 1;
        vacc0123x0 = wasm_f32x4_add(vacc0123x0, wasm_f32x4_mul(vi0123x0, vw0));
        vacc4567x0 = wasm_f32x4_add(vacc4567x0, wasm_f32x4_mul(vi4567x0, vw0));
        vacc89ABx0 = wasm_f32x4_add(vacc89ABx0, wasm_f32x4_mul(vi89ABx0, vw0));
        vaccCDEFx0 = wasm_f32x4_add(vaccCDEFx0, wasm_f32x4_mul(viCDEFx0, vw0));
        vaccGHIJx0 = wasm_f32x4_add(vaccGHIJx0, wasm_f32x4_mul(viGHIJx0, vw0));
        vaccKLMNx0 = wasm_f32x4_add(vaccKLMNx0, wasm_f32x4_mul(viKLMNx0, vw0));
        vaccOPQRx0 = wasm_f32x4_add(vaccOPQRx0, wasm_f32x4_mul(viOPQRx0, vw0));
        vaccSTUVx0 = wasm_f32x4_add(vaccSTUVx0, wasm_f32x4_mul(viSTUVx0, vw0));
        const v128_t vi0123x1 = wasm_v128_load(input);
        const v128_t vi4567x1 = wasm_v128_load(input + 4);
        const v128_t vi89ABx1 = wasm_v128_load(input + 8);
        const v128_t viCDEFx1 = wasm_v128_load(input + 12);
        const v128_t viGHIJx1 = wasm_v128_load(input + 16);
        const v128_t viKLMNx1 = wasm_v128_load(input + 20);
        const v128_t viOPQRx1 = wasm_v128_load(input + 24);
        const v128_t viSTUVx1 = wasm_v128_load(input + 28);
        input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
        const v128_t vw1 = wasm_v128_load32_splat(w);
        w += 1;
        vacc0123x1 = wasm_f32x4_add(vacc0123x1, wasm_f32x4_mul(vi0123x1, vw1));
        vacc4567x1 = wasm_f32x4_add(vacc4567x1, wasm_f32x4_mul(vi4567x1, vw1));
        vacc89ABx1 = wasm_f32x4_add(vacc89ABx1, wasm_f32x4_mul(vi89ABx1, vw1));
        vaccCDEFx1 = wasm_f32x4_add(vaccCDEFx1, wasm_f32x4_mul(viCDEFx1, vw1));
        vaccGHIJx1 = wasm_f32x4_add(vaccGHIJx1, wasm_f32x4_mul(viGHIJx1, vw1));
        vaccKLMNx1 = wasm_f32x4_add(vaccKLMNx1, wasm_f32x4_mul(viKLMNx1, vw1));
        vaccOPQRx1 = wasm_f32x4_add(vaccOPQRx1, wasm_f32x4_mul(viOPQRx1, vw1));
        vaccSTUVx1 = wasm_f32x4_add(vaccSTUVx1, wasm_f32x4_mul(viSTUVx1, vw1));
      }
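      // Fold the two accumulator sets back into one before clamping.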
      v128_t vacc0123 = vacc0123x0;
      v128_t vacc4567 = vacc4567x0;
      v128_t vacc89AB = vacc89ABx0;
      v128_t vaccCDEF = vaccCDEFx0;
      v128_t vaccGHIJ = vaccGHIJx0;
      v128_t vaccKLMN = vaccKLMNx0;
      v128_t vaccOPQR = vaccOPQRx0;
      v128_t vaccSTUV = vaccSTUVx0;
      vacc0123 = wasm_f32x4_add(vacc0123, vacc0123x1);
      vacc4567 = wasm_f32x4_add(vacc4567, vacc4567x1);
      vacc89AB = wasm_f32x4_add(vacc89AB, vacc89ABx1);
      vaccCDEF = wasm_f32x4_add(vaccCDEF, vaccCDEFx1);
      vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vaccGHIJx1);
      vaccKLMN = wasm_f32x4_add(vaccKLMN, vaccKLMNx1);
      vaccOPQR = wasm_f32x4_add(vaccOPQR, vaccOPQRx1);
      vaccSTUV = wasm_f32x4_add(vaccSTUV, vaccSTUVx1);
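      // Handle a trailing odd nonzero (nnz was not a multiple of 2).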
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const v128_t vi0123 = wasm_v128_load(input);
          const v128_t vi4567 = wasm_v128_load(input + 4);
          const v128_t vi89AB = wasm_v128_load(input + 8);
          const v128_t viCDEF = wasm_v128_load(input + 12);
          const v128_t viGHIJ = wasm_v128_load(input + 16);
          const v128_t viKLMN = wasm_v128_load(input + 20);
          const v128_t viOPQR = wasm_v128_load(input + 24);
          const v128_t viSTUV = wasm_v128_load(input + 28);
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
          const v128_t vw = wasm_v128_load32_splat(w); w += 1;
          vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
          vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
          vaccGHIJ = wasm_f32x4_add(vaccGHIJ, wasm_f32x4_mul(viGHIJ, vw));
          vaccKLMN = wasm_f32x4_add(vaccKLMN, wasm_f32x4_mul(viKLMN, vw));
          vaccOPQR = wasm_f32x4_add(vaccOPQR, wasm_f32x4_mul(viOPQR, vw));
          vaccSTUV = wasm_f32x4_add(vaccSTUV, wasm_f32x4_mul(viSTUV, vw));
        } while (--nnz != 0);
      }
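      // Clamp to [min, max]: wasm_f32x4_pmin(vmax, vacc) yields vacc when
      // vacc < vmax, i.e. min(vacc, vmax) for ordered inputs. With the
      // constant as the first operand, pmin/pmax lower to single MINPS/MAXPS
      // instructions on x86, which is why this variant carries the "x86"
      // suffix.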
      v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
      v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
      v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
      v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
      v128_t voutGHIJ = wasm_f32x4_pmin(vmax, vaccGHIJ);
      v128_t voutKLMN = wasm_f32x4_pmin(vmax, vaccKLMN);
      v128_t voutOPQR = wasm_f32x4_pmin(vmax, vaccOPQR);
      v128_t voutSTUV = wasm_f32x4_pmin(vmax, vaccSTUV);
      vout0123 = wasm_f32x4_pmax(vmin, vout0123);
      vout4567 = wasm_f32x4_pmax(vmin, vout4567);
      vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
      voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
      voutGHIJ = wasm_f32x4_pmax(vmin, voutGHIJ);
      voutKLMN = wasm_f32x4_pmax(vmin, voutKLMN);
      voutOPQR = wasm_f32x4_pmax(vmin, voutOPQR);
      voutSTUV = wasm_f32x4_pmax(vmin, voutSTUV);
      wasm_v128_store(output, vout0123);
      wasm_v128_store(output + 4, vout4567);
      wasm_v128_store(output + 8, vout89AB);
      wasm_v128_store(output + 12, voutCDEF);
      wasm_v128_store(output + 16, voutGHIJ);
      wasm_v128_store(output + 20, voutKLMN);
      wasm_v128_store(output + 24, voutOPQR);
      wasm_v128_store(output + 28, voutSTUV);
      output = (float*restrict) ((uintptr_t) output + output_stride);
    } while (--n != 0);
    output = (float*restrict) ((uintptr_t) output - output_decrement);
    input += 32;
    mc -= 32 * sizeof(float);
  }
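  // Remainder: mc is not a multiple of 32 floats. Handle tails of 16, 8, 4,
  // 2, and 1 float(s) with progressively narrower versions of the same loop;
  // output_decrement is adjusted at each step to match the narrower tile.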
  if XNN_UNLIKELY(mc != 0) {
    output_decrement += 16 * sizeof(float);
    if (mc & (16 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        v128_t vacc4567 = vacc0123;
        v128_t vacc89AB = vacc0123;
        v128_t vaccCDEF = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            const v128_t vi4567 = wasm_v128_load(input + 4);
            const v128_t vi89AB = wasm_v128_load(input + 8);
            const v128_t viCDEF = wasm_v128_load(input + 12);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
            vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
            vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
            vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
        v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
        v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
        vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
        voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
        wasm_v128_store(output, vout0123);

        wasm_v128_store(output + 4, vout4567);
        wasm_v128_store(output + 8, vout89AB);
        wasm_v128_store(output + 12, voutCDEF);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 16;
    }
    output_decrement += 8 * sizeof(float);
    if (mc & (8 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        v128_t vacc4567 = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            const v128_t vi4567 = wasm_v128_load(input + 4);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
            vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
        v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
        vout4567 = wasm_f32x4_pmax(vmin, vout4567);
        wasm_v128_store(output, vout0123);

        wasm_v128_store(output + 4, vout4567);
        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 8;
    }
    output_decrement += 4 * sizeof(float);
    if (mc & (4 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
        vout0123 = wasm_f32x4_pmax(vmin, vout0123);
        wasm_v128_store(output, vout0123);

        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 4;
    }
    output_decrement += 2 * sizeof(float);
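    // 2-float tail: load two input floats as a 64-bit pair splatted across
    // the vector, compute as usual, and store only the low two lanes.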
    if (mc & (2 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc01 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi01 = wasm_v128_load64_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
          } while (--nnz != 0);
        }
        v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
        vout01 = wasm_f32x4_pmax(vmin, vout01);
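        // Reinterpret the low 64 bits (two f32 lanes) as a double to emit a
        // single 8-byte store of both results.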
        *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);

        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 2;
    }
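    // 1-float tail: the computation still rides in a vector register (every
    // load is a splat, so all lanes hold the same value); lane 0 is
    // extracted for the scalar store.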
    output_decrement += 1 * sizeof(float);
    if (mc & (1 * sizeof(float))) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t n = nc;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0 = wasm_v128_load32_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0 = wasm_v128_load32_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v128_load32_splat(w); w += 1;
            vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
          } while (--nnz != 0);
        }
        v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
        vout0 = wasm_f32x4_pmax(vmin, vout0);
        *output = wasm_f32x4_extract_lane(vout0, 0);

        output = (float*restrict) ((uintptr_t) output + output_stride);
      } while (--n != 0);
      output = (float*restrict) ((uintptr_t) output - output_decrement);
      input += 1;
    }
  }
}