// Auto-generated file. Do not edit!
//   Template: src/qs8-dwconv/unipass-wasmsimd-mul16.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/dwconv.h>

void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16(
    size_t channels,
    size_t output_width,
    const uint8_t** input,
    const void* weights,
    uint8_t* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const uint8_t* zero,
    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(channels != 0);
  assert(output_width != 0);

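  // mul16 scheme: inputs and weights stay unsigned 8-bit and are widened to 16 bits
  // for the multiply; the kernel zero point is factored out and subtracted once per
  // channel block as kernel_zero_point * sum(inputs), rather than from every weight.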
  const v128_t vkernel_zero_point = wasm_u32x4_load16x4(params->fp32_wasmsimd.kernel_zero_point);
  do {
    const uint8_t* i0 = input[0];
    assert(i0 != NULL);
    if XNN_UNPREDICTABLE(i0 != zero) {
      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
    }
    const uint8_t* i1 = input[1];
    assert(i1 != NULL);
    if XNN_UNPREDICTABLE(i1 != zero) {
      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
    }
    const uint8_t* i2 = input[2];
    assert(i2 != NULL);
    if XNN_UNPREDICTABLE(i2 != zero) {
      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
    }
    const uint8_t* i3 = input[3];
    assert(i3 != NULL);
    if XNN_UNPREDICTABLE(i3 != zero) {
      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
    }
    const uint8_t* i4 = input[4];
    assert(i4 != NULL);
    if XNN_UNPREDICTABLE(i4 != zero) {
      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
    }
    const uint8_t* i5 = input[5];
    assert(i5 != NULL);
    if XNN_UNPREDICTABLE(i5 != zero) {
      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
    }
    const uint8_t* i6 = input[6];
    assert(i6 != NULL);
    if XNN_UNPREDICTABLE(i6 != zero) {
      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
    }
    const uint8_t* i7 = input[7];
    assert(i7 != NULL);
    if XNN_UNPREDICTABLE(i7 != zero) {
      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
    }
    const uint8_t* i8 = input[8];
    assert(i8 != NULL);
    if XNN_UNPREDICTABLE(i8 != zero) {
      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
    }
    const uint8_t* i9 = input[9];
    assert(i9 != NULL);
    if XNN_UNPREDICTABLE(i9 != zero) {
      i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
    }
    const uint8_t* i10 = input[10];
    assert(i10 != NULL);
    if XNN_UNPREDICTABLE(i10 != zero) {
      i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
    }
    const uint8_t* i11 = input[11];
    assert(i11 != NULL);
    if XNN_UNPREDICTABLE(i11 != zero) {
      i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
    }
    const uint8_t* i12 = input[12];
    assert(i12 != NULL);
    if XNN_UNPREDICTABLE(i12 != zero) {
      i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
    }
    const uint8_t* i13 = input[13];
    assert(i13 != NULL);
    if XNN_UNPREDICTABLE(i13 != zero) {
      i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
    }
    const uint8_t* i14 = input[14];
    assert(i14 != NULL);
    if XNN_UNPREDICTABLE(i14 != zero) {
      i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
    }
    const uint8_t* i15 = input[15];
    assert(i15 != NULL);
    if XNN_UNPREDICTABLE(i15 != zero) {
      i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
    }
    const uint8_t* i16 = input[16];
    assert(i16 != NULL);
    if XNN_UNPREDICTABLE(i16 != zero) {
      i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
    }
    const uint8_t* i17 = input[17];
    assert(i17 != NULL);
    if XNN_UNPREDICTABLE(i17 != zero) {
      i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
    }
    const uint8_t* i18 = input[18];
    assert(i18 != NULL);
    if XNN_UNPREDICTABLE(i18 != zero) {
      i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
    }
    const uint8_t* i19 = input[19];
    assert(i19 != NULL);
    if XNN_UNPREDICTABLE(i19 != zero) {
      i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
    }
    const uint8_t* i20 = input[20];
    assert(i20 != NULL);
    if XNN_UNPREDICTABLE(i20 != zero) {
      i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
    }
    const uint8_t* i21 = input[21];
    assert(i21 != NULL);
    if XNN_UNPREDICTABLE(i21 != zero) {
      i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
    }
    const uint8_t* i22 = input[22];
    assert(i22 != NULL);
    if XNN_UNPREDICTABLE(i22 != zero) {
      i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
    }
    const uint8_t* i23 = input[23];
    assert(i23 != NULL);
    if XNN_UNPREDICTABLE(i23 != zero) {
      i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
    }
    const uint8_t* i24 = input[24];
    assert(i24 != NULL);
    if XNN_UNPREDICTABLE(i24 != zero) {
      i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
    }
    input = (const uint8_t**) ((uintptr_t) input + input_stride);

    size_t c = channels;
    const void* w = weights;
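    // Main loop: 16 channels per iteration. Judging by the offsets below, the packed
    // weights for one block are 16 int32 biases followed by 25 taps x 16 uint8 values.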
    for (; c >= 16; c -= 16) {
      v128_t vacc0123 = wasm_v128_load(w);
      v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
      v128_t vacc89AB = wasm_v128_load((const void*) ((uintptr_t) w + 8 * sizeof(int32_t)));
      v128_t vaccCDEF = wasm_v128_load((const void*) ((uintptr_t) w + 12 * sizeof(int32_t)));


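      // Per-tap pattern (repeated for all 25 taps): widen 8 input and 8 kernel bytes
      // to u16, multiply in 16 bits, widen the low/high product halves to u32 and
      // accumulate; running input sums feed the zero-point correction afterwards.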
      const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0);
      const v128_t vk0x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
      const v128_t vi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
      const v128_t vk0x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
      i0 += 16;

      v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567);
      v128_t vprod89ABCDEF = wasm_i16x8_mul(vi0x89ABCDEF, vk0x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1);
      const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
      const v128_t vi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
      const v128_t vk1x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
      v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567);
      v128_t vsumx89ABCDEF = wasm_i16x8_add(vi0x89ABCDEF, vi1x89ABCDEF);
      i1 += 16;

      vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi1x89ABCDEF, vk1x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2);
      const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
      const v128_t vi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
      const v128_t vk2x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi2x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi2x89ABCDEF);
      i2 += 16;

      vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi2x89ABCDEF, vk2x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3);
      const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
      const v128_t vi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
      const v128_t vk3x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi3x89ABCDEF);
      i3 += 16;

      vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi3x89ABCDEF, vk3x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4);
      const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
      const v128_t vi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
      const v128_t vk4x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi4x89ABCDEF);
      i4 += 16;

      vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi4x89ABCDEF, vk4x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5);
      const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
      const v128_t vi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
      const v128_t vk5x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi5x89ABCDEF);
      i5 += 16;

      vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi5x89ABCDEF, vk5x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6);
      const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
      const v128_t vi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
      const v128_t vk6x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi6x89ABCDEF);
      i6 += 16;

      vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi6x89ABCDEF, vk6x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7);
      const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
      const v128_t vi7x89ABCDEF = wasm_u16x8_load8x8(i7 + 8);
      const v128_t vk7x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi7x89ABCDEF);
      i7 += 16;

      vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi7x89ABCDEF, vk7x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8);
      const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
      const v128_t vi8x89ABCDEF = wasm_u16x8_load8x8(i8 + 8);
      const v128_t vk8x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi8x89ABCDEF);
      i8 += 16;

      vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi8x89ABCDEF, vk8x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi9x01234567 = wasm_u16x8_load8x8(i9);
      const v128_t vk9x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
      const v128_t vi9x89ABCDEF = wasm_u16x8_load8x8(i9 + 8);
      const v128_t vk9x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi9x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi9x89ABCDEF);
      i9 += 16;

      vprod01234567 = wasm_i16x8_mul(vi9x01234567, vk9x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi9x89ABCDEF, vk9x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi10x01234567 = wasm_u16x8_load8x8(i10);
      const v128_t vk10x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
      const v128_t vi10x89ABCDEF = wasm_u16x8_load8x8(i10 + 8);
      const v128_t vk10x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi10x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi10x89ABCDEF);
      i10 += 16;

      vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi10x89ABCDEF, vk10x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi11x01234567 = wasm_u16x8_load8x8(i11);
      const v128_t vk11x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
      const v128_t vi11x89ABCDEF = wasm_u16x8_load8x8(i11 + 8);
      const v128_t vk11x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi11x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi11x89ABCDEF);
      i11 += 16;

      vprod01234567 = wasm_i16x8_mul(vi11x01234567, vk11x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi11x89ABCDEF, vk11x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi12x01234567 = wasm_u16x8_load8x8(i12);
      const v128_t vk12x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
      const v128_t vi12x89ABCDEF = wasm_u16x8_load8x8(i12 + 8);
      const v128_t vk12x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi12x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi12x89ABCDEF);
      i12 += 16;

      vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi12x89ABCDEF, vk12x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi13x01234567 = wasm_u16x8_load8x8(i13);
      const v128_t vk13x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)));
      const v128_t vi13x89ABCDEF = wasm_u16x8_load8x8(i13 + 8);
      const v128_t vk13x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi13x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi13x89ABCDEF);
      i13 += 16;

      vprod01234567 = wasm_i16x8_mul(vi13x01234567, vk13x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi13x89ABCDEF, vk13x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14);
      const v128_t vk14x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)));
      const v128_t vi14x89ABCDEF = wasm_u16x8_load8x8(i14 + 8);
      const v128_t vk14x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi14x89ABCDEF);
      i14 += 16;

      vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi14x89ABCDEF, vk14x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi15x01234567 = wasm_u16x8_load8x8(i15);
      const v128_t vk15x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)));
      const v128_t vi15x89ABCDEF = wasm_u16x8_load8x8(i15 + 8);
      const v128_t vk15x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi15x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi15x89ABCDEF);
      i15 += 16;

      vprod01234567 = wasm_i16x8_mul(vi15x01234567, vk15x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi15x89ABCDEF, vk15x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi16x01234567 = wasm_u16x8_load8x8(i16);
      const v128_t vk16x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)));
      const v128_t vi16x89ABCDEF = wasm_u16x8_load8x8(i16 + 8);
      const v128_t vk16x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi16x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi16x89ABCDEF);
      i16 += 16;

      vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi16x89ABCDEF, vk16x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi17x01234567 = wasm_u16x8_load8x8(i17);
      const v128_t vk17x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)));
      const v128_t vi17x89ABCDEF = wasm_u16x8_load8x8(i17 + 8);
      const v128_t vk17x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi17x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi17x89ABCDEF);
      i17 += 16;

      vprod01234567 = wasm_i16x8_mul(vi17x01234567, vk17x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi17x89ABCDEF, vk17x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi18x01234567 = wasm_u16x8_load8x8(i18);
      const v128_t vk18x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)));
      const v128_t vi18x89ABCDEF = wasm_u16x8_load8x8(i18 + 8);
      const v128_t vk18x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi18x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi18x89ABCDEF);
      i18 += 16;

      vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi18x89ABCDEF, vk18x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi19x01234567 = wasm_u16x8_load8x8(i19);
      const v128_t vk19x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)));
      const v128_t vi19x89ABCDEF = wasm_u16x8_load8x8(i19 + 8);
      const v128_t vk19x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi19x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi19x89ABCDEF);
      i19 += 16;

      vprod01234567 = wasm_i16x8_mul(vi19x01234567, vk19x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi19x89ABCDEF, vk19x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi20x01234567 = wasm_u16x8_load8x8(i20);
      const v128_t vk20x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)));
      const v128_t vi20x89ABCDEF = wasm_u16x8_load8x8(i20 + 8);
      const v128_t vk20x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi20x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi20x89ABCDEF);
      i20 += 16;

      vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi20x89ABCDEF, vk20x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi21x01234567 = wasm_u16x8_load8x8(i21);
      const v128_t vk21x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)));
      const v128_t vi21x89ABCDEF = wasm_u16x8_load8x8(i21 + 8);
      const v128_t vk21x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi21x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi21x89ABCDEF);
      i21 += 16;

      vprod01234567 = wasm_i16x8_mul(vi21x01234567, vk21x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi21x89ABCDEF, vk21x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi22x01234567 = wasm_u16x8_load8x8(i22);
      const v128_t vk22x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)));
      const v128_t vi22x89ABCDEF = wasm_u16x8_load8x8(i22 + 8);
      const v128_t vk22x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi22x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi22x89ABCDEF);
      i22 += 16;

      vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi22x89ABCDEF, vk22x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi23x01234567 = wasm_u16x8_load8x8(i23);
      const v128_t vk23x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)));
      const v128_t vi23x89ABCDEF = wasm_u16x8_load8x8(i23 + 8);
      const v128_t vk23x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi23x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi23x89ABCDEF);
      i23 += 16;

      vprod01234567 = wasm_i16x8_mul(vi23x01234567, vk23x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi23x89ABCDEF, vk23x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

      const v128_t vi24x01234567 = wasm_u16x8_load8x8(i24);
      const v128_t vk24x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)));
      const v128_t vi24x89ABCDEF = wasm_u16x8_load8x8(i24 + 8);
      const v128_t vk24x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)));
      vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi24x01234567);
      vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi24x89ABCDEF);
      i24 += 16;

      vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567);
      vprod89ABCDEF = wasm_i16x8_mul(vi24x89ABCDEF, vk24x89ABCDEF);

      vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
      vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
      vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
      vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));

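      // Remove the kernel zero-point contribution: acc -= kernel_zero_point * sum(inputs).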
      vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point));
      vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point));
      vacc89AB = wasm_i32x4_sub(vacc89AB, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx89ABCDEF), vkernel_zero_point));
      vaccCDEF = wasm_i32x4_sub(vaccCDEF, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx89ABCDEF), vkernel_zero_point));

      w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));

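      // fp32 requantization: convert accumulators to float and apply the combined scale.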
      vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
      vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
      vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
      vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);

      const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
      vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
      vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
      vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
      vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);

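      // Magic-bias trick: adding a large constant rounds to nearest and leaves the
      // integer result in the low mantissa bits; the i32 max then enforces the output
      // minimum, and subtracting (magic_bias - output_zero_point) recovers the
      // zero-point-adjusted integer.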
      const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
      vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
      vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
      vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
      vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);

      const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
      vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
      vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
      vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
      vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);

      const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
      vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
      vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
      vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
      vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);

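      // Narrow i32 -> i16 -> u8 with saturation (the lower clamp came from vmagic_min),
      // apply the upper clamp, and store all 16 output bytes.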
      v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
      v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);

      v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);

      const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
      vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);

      wasm_v128_store(output, vout0123456789ABCDEF);
      output += 16;
    }
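    // Remainder: up to 15 channels left, processed 8 at a time with partial stores.
    // Weights keep the 16-channel stride, so tail reads may run past the live
    // channels (hence the XNN_OOB_READS annotation on the function).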
    if XNN_UNLIKELY(c != 0) {
      const uint8_t* k = (const uint8_t*) ((uintptr_t) w + 16 * sizeof(int32_t));
      do {
        v128_t vacc0123 = wasm_v128_load(w);
        v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));


        const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0);
        const v128_t vk0x01234567 = wasm_u16x8_load8x8(k);
        i0 += 8;

        v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1);
        const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) (k + 16));
        v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567);
        i1 += 8;

        vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2);
        const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) (k + 32));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi2x01234567);
        i2 += 8;

        vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3);
        const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) (k + 48));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567);
        i3 += 8;

        vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4);
        const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) (k + 64));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567);
        i4 += 8;

        vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5);
        const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) (k + 80));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567);
        i5 += 8;

        vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6);
        const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) (k + 96));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567);
        i6 += 8;

        vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7);
        const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) (k + 112));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567);
        i7 += 8;

        vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8);
        const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) (k + 128));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567);
        i8 += 8;

        vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi9x01234567 = wasm_u16x8_load8x8(i9);
        const v128_t vk9x01234567 = wasm_u16x8_load8x8((const void*) (k + 144));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi9x01234567);
        i9 += 8;

        vprod01234567 = wasm_i16x8_mul(vi9x01234567, vk9x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi10x01234567 = wasm_u16x8_load8x8(i10);
        const v128_t vk10x01234567 = wasm_u16x8_load8x8((const void*) (k + 160));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi10x01234567);
        i10 += 8;

        vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi11x01234567 = wasm_u16x8_load8x8(i11);
        const v128_t vk11x01234567 = wasm_u16x8_load8x8((const void*) (k + 176));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi11x01234567);
        i11 += 8;

        vprod01234567 = wasm_i16x8_mul(vi11x01234567, vk11x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi12x01234567 = wasm_u16x8_load8x8(i12);
        const v128_t vk12x01234567 = wasm_u16x8_load8x8((const void*) (k + 192));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi12x01234567);
        i12 += 8;

        vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi13x01234567 = wasm_u16x8_load8x8(i13);
        const v128_t vk13x01234567 = wasm_u16x8_load8x8((const void*) (k + 208));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi13x01234567);
        i13 += 8;

        vprod01234567 = wasm_i16x8_mul(vi13x01234567, vk13x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14);
        const v128_t vk14x01234567 = wasm_u16x8_load8x8((const void*) (k + 224));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567);
        i14 += 8;

        vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi15x01234567 = wasm_u16x8_load8x8(i15);
        const v128_t vk15x01234567 = wasm_u16x8_load8x8((const void*) (k + 240));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi15x01234567);
        i15 += 8;

        vprod01234567 = wasm_i16x8_mul(vi15x01234567, vk15x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi16x01234567 = wasm_u16x8_load8x8(i16);
        const v128_t vk16x01234567 = wasm_u16x8_load8x8((const void*) (k + 256));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi16x01234567);
        i16 += 8;

        vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi17x01234567 = wasm_u16x8_load8x8(i17);
        const v128_t vk17x01234567 = wasm_u16x8_load8x8((const void*) (k + 272));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi17x01234567);
        i17 += 8;

        vprod01234567 = wasm_i16x8_mul(vi17x01234567, vk17x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi18x01234567 = wasm_u16x8_load8x8(i18);
        const v128_t vk18x01234567 = wasm_u16x8_load8x8((const void*) (k + 288));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi18x01234567);
        i18 += 8;

        vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi19x01234567 = wasm_u16x8_load8x8(i19);
        const v128_t vk19x01234567 = wasm_u16x8_load8x8((const void*) (k + 304));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi19x01234567);
        i19 += 8;

        vprod01234567 = wasm_i16x8_mul(vi19x01234567, vk19x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi20x01234567 = wasm_u16x8_load8x8(i20);
        const v128_t vk20x01234567 = wasm_u16x8_load8x8((const void*) (k + 320));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi20x01234567);
        i20 += 8;

        vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi21x01234567 = wasm_u16x8_load8x8(i21);
        const v128_t vk21x01234567 = wasm_u16x8_load8x8((const void*) (k + 336));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi21x01234567);
        i21 += 8;

        vprod01234567 = wasm_i16x8_mul(vi21x01234567, vk21x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi22x01234567 = wasm_u16x8_load8x8(i22);
        const v128_t vk22x01234567 = wasm_u16x8_load8x8((const void*) (k + 352));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi22x01234567);
        i22 += 8;

        vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi23x01234567 = wasm_u16x8_load8x8(i23);
        const v128_t vk23x01234567 = wasm_u16x8_load8x8((const void*) (k + 368));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi23x01234567);
        i23 += 8;

        vprod01234567 = wasm_i16x8_mul(vi23x01234567, vk23x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        const v128_t vi24x01234567 = wasm_u16x8_load8x8(i24);
        const v128_t vk24x01234567 = wasm_u16x8_load8x8((const void*) (k + 384));
        vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi24x01234567);
        i24 += 8;

        vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567);

        vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
        vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));

        k += 8;

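        // Same zero-point correction and fp32 requantization as the main loop,
        // applied to a single group of 8 channels.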
        vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point));
        vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point));

        vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
        vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);

        const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
        vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
        vacc4567 = wasm_f32x4_mul(vacc4567, vscale);

        const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
        vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
        vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);

        const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
        vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
        vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);

        const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
        vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
        vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);

        v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
        v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);

        const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
        vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);

        w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));

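        // Store 8 bytes when at least 8 channels remain; otherwise emit 4/2/1-byte
        // tails by extracting progressively narrower lanes and shifting.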
        if XNN_LIKELY(c >= 8) {
          *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
          output += 8;
          c -= 8;
        } else {
          if (c & 4) {
            *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
            vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
            output += 4;
          }
          uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
          if (c & 2) {
            *((uint16_t*) output) = (uint16_t) vout0123;
            vout0123 >>= 16;
            output += 2;
          }
          if (c & 1) {
            *output = (uint8_t) vout0123;
            output += 1;
          }
          c = 0;
        }
      } while (c != 0);
    }

    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_width != 0);
}