// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <wasm_simd128.h>

#include <xnnpack/spmm.h>

16
xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_x2(size_t mc,size_t nc,const float * restrict input,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict output,size_t output_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])17 void xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_x2(
18 size_t mc,
19 size_t nc,
20 const float*restrict input,
21 const float*restrict weights,
22 const int32_t*restrict widx_dmap,
23 const uint32_t*restrict nidx_nnzmap,
24 float*restrict output,
25 size_t output_stride,
26 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28 assert(mc != 0);
29 assert(mc % sizeof(float) == 0);
30 assert(nc != 0);
31
32 const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
33 const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
34 size_t output_decrement = output_stride * nc - 16 * sizeof(float);
35 while XNN_LIKELY(mc >= 16 * sizeof(float)) {
36 const float*restrict w = weights;
37 const int32_t* dmap = widx_dmap;
38 const uint32_t* nnzmap = nidx_nnzmap;
39 size_t n = nc;
40 do {
41 uint32_t nnz = *nnzmap++;
42 v128_t vacc0123x0 = wasm_v128_load32_splat(w);
43 w += 1;
44 v128_t vacc0123x1 = wasm_f32x4_const_splat(0.0f);
45 v128_t vacc4567x0 = vacc0123x0;
46 v128_t vacc4567x1 = wasm_f32x4_const_splat(0.0f);
47 v128_t vacc89ABx0 = vacc0123x0;
48 v128_t vacc89ABx1 = wasm_f32x4_const_splat(0.0f);
49 v128_t vaccCDEFx0 = vacc0123x0;
50 v128_t vaccCDEFx1 = wasm_f32x4_const_splat(0.0f);
51 for (; nnz >= 2; nnz -= 2) {
52 const intptr_t diff0 = dmap[0];
53 const intptr_t diff1 = dmap[1];
54 dmap += 2;
55 const v128_t vi0123x0 = wasm_v128_load(input);
56 const v128_t vi4567x0 = wasm_v128_load(input + 4);
57 const v128_t vi89ABx0 = wasm_v128_load(input + 8);
58 const v128_t viCDEFx0 = wasm_v128_load(input + 12);
59 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
60 const v128_t vw0 = wasm_v128_load32_splat(w);
61 w += 1;
62 vacc0123x0 = wasm_f32x4_add(vacc0123x0, wasm_f32x4_mul(vi0123x0, vw0));
63 vacc4567x0 = wasm_f32x4_add(vacc4567x0, wasm_f32x4_mul(vi4567x0, vw0));
64 vacc89ABx0 = wasm_f32x4_add(vacc89ABx0, wasm_f32x4_mul(vi89ABx0, vw0));
65 vaccCDEFx0 = wasm_f32x4_add(vaccCDEFx0, wasm_f32x4_mul(viCDEFx0, vw0));
66 const v128_t vi0123x1 = wasm_v128_load(input);
67 const v128_t vi4567x1 = wasm_v128_load(input + 4);
68 const v128_t vi89ABx1 = wasm_v128_load(input + 8);
69 const v128_t viCDEFx1 = wasm_v128_load(input + 12);
70 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
71 const v128_t vw1 = wasm_v128_load32_splat(w);
72 w += 1;
73 vacc0123x1 = wasm_f32x4_add(vacc0123x1, wasm_f32x4_mul(vi0123x1, vw1));
74 vacc4567x1 = wasm_f32x4_add(vacc4567x1, wasm_f32x4_mul(vi4567x1, vw1));
75 vacc89ABx1 = wasm_f32x4_add(vacc89ABx1, wasm_f32x4_mul(vi89ABx1, vw1));
76 vaccCDEFx1 = wasm_f32x4_add(vaccCDEFx1, wasm_f32x4_mul(viCDEFx1, vw1));
77 }
78 v128_t vacc0123 = vacc0123x0;
79 v128_t vacc4567 = vacc4567x0;
80 v128_t vacc89AB = vacc89ABx0;
81 v128_t vaccCDEF = vaccCDEFx0;
82 vacc0123 = wasm_f32x4_add(vacc0123, vacc0123x1);
83 vacc4567 = wasm_f32x4_add(vacc4567, vacc4567x1);
84 vacc89AB = wasm_f32x4_add(vacc89AB, vacc89ABx1);
85 vaccCDEF = wasm_f32x4_add(vaccCDEF, vaccCDEFx1);
86 if XNN_LIKELY(nnz != 0) {
87 do {
88 const intptr_t diff = *dmap++;
89 const v128_t vi0123 = wasm_v128_load(input);
90 const v128_t vi4567 = wasm_v128_load(input + 4);
91 const v128_t vi89AB = wasm_v128_load(input + 8);
92 const v128_t viCDEF = wasm_v128_load(input + 12);
93 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
94 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
95 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
96 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
97 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
98 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
99 } while (--nnz != 0);
100 }
101 v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
102 v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
103 v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
104 v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
105 vout0123 = wasm_f32x4_pmax(vmin, vout0123);
106 vout4567 = wasm_f32x4_pmax(vmin, vout4567);
107 vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
108 voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
109 wasm_v128_store(output, vout0123);
110 wasm_v128_store(output + 4, vout4567);
111 wasm_v128_store(output + 8, vout89AB);
112 wasm_v128_store(output + 12, voutCDEF);
113 output = (float*restrict) ((uintptr_t) output + output_stride);
114 } while (--n != 0);
115 output = (float*restrict) ((uintptr_t) output - output_decrement);
116 input += 16;
117 mc -= 16 * sizeof(float);
118 }
119 if XNN_UNLIKELY(mc != 0) {
120 output_decrement += 8 * sizeof(float);
121 if (mc & (8 * sizeof(float))) {
122 const float*restrict w = weights;
123 const int32_t* dmap = widx_dmap;
124 const uint32_t* nnzmap = nidx_nnzmap;
125 size_t n = nc;
126 do {
127 uint32_t nnz = *nnzmap++;
128 v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
129 v128_t vacc4567 = vacc0123;
130 if XNN_LIKELY(nnz != 0) {
131 do {
132 const intptr_t diff = *dmap++;
133 const v128_t vi0123 = wasm_v128_load(input);
134 const v128_t vi4567 = wasm_v128_load(input + 4);
135 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
136 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
137 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
138 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
139 } while (--nnz != 0);
140 }
141 v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
142 v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
143 vout0123 = wasm_f32x4_pmax(vmin, vout0123);
144 vout4567 = wasm_f32x4_pmax(vmin, vout4567);
145 wasm_v128_store(output, vout0123);
146
147 wasm_v128_store(output + 4, vout4567);
148 output = (float*restrict) ((uintptr_t) output + output_stride);
149 } while (--n != 0);
150 output = (float*restrict) ((uintptr_t) output - output_decrement);
151 input += 8;
152 }
153 output_decrement += 4 * sizeof(float);
154 if (mc & (4 * sizeof(float))) {
155 const float*restrict w = weights;
156 const int32_t* dmap = widx_dmap;
157 const uint32_t* nnzmap = nidx_nnzmap;
158 size_t n = nc;
159 do {
160 uint32_t nnz = *nnzmap++;
161 v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
162 if XNN_LIKELY(nnz != 0) {
163 do {
164 const intptr_t diff = *dmap++;
165 const v128_t vi0123 = wasm_v128_load(input);
166 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
167 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
168 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
169 } while (--nnz != 0);
170 }
171 v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
172 vout0123 = wasm_f32x4_pmax(vmin, vout0123);
173 wasm_v128_store(output, vout0123);
174
175 output = (float*restrict) ((uintptr_t) output + output_stride);
176 } while (--n != 0);
177 output = (float*restrict) ((uintptr_t) output - output_decrement);
178 input += 4;
179 }
180 output_decrement += 2 * sizeof(float);
181 if (mc & (2 * sizeof(float))) {
182 const float*restrict w = weights;
183 const int32_t* dmap = widx_dmap;
184 const uint32_t* nnzmap = nidx_nnzmap;
185 size_t n = nc;
186 do {
187 uint32_t nnz = *nnzmap++;
188 v128_t vacc01 = wasm_v128_load32_splat(w); w += 1;
189 if XNN_LIKELY(nnz != 0) {
190 do {
191 const intptr_t diff = *dmap++;
192 const v128_t vi01 = wasm_v128_load64_splat(input);
193 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
194 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
195 vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
196 } while (--nnz != 0);
197 }
198 v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
199 vout01 = wasm_f32x4_pmax(vmin, vout01);
200 *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
201
202 output = (float*restrict) ((uintptr_t) output + output_stride);
203 } while (--n != 0);
204 output = (float*restrict) ((uintptr_t) output - output_decrement);
205 input += 2;
206 }
207 output_decrement += 1 * sizeof(float);
208 if (mc & (1 * sizeof(float))) {
209 const float*restrict w = weights;
210 const int32_t* dmap = widx_dmap;
211 const uint32_t* nnzmap = nidx_nnzmap;
212 size_t n = nc;
213 do {
214 uint32_t nnz = *nnzmap++;
215 v128_t vacc0 = wasm_v128_load32_splat(w); w += 1;
216 if XNN_LIKELY(nnz != 0) {
217 do {
218 const intptr_t diff = *dmap++;
219 const v128_t vi0 = wasm_v128_load32_splat(input);
220 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
221 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
222 vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
223 } while (--nnz != 0);
224 }
225 v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
226 vout0 = wasm_f32x4_pmax(vmin, vout0);
227 *output = wasm_f32x4_extract_lane(vout0, 0);
228
229 output = (float*restrict) ((uintptr_t) output + output_stride);
230 } while (--n != 0);
231 output = (float*restrict) ((uintptr_t) output - output_decrement);
232 input += 1;
233 }
234 }
235 }
236