1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-dwconv/unipass-wasmsimd-mul16.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <wasm_simd128.h>
13
14 #include <xnnpack/dwconv.h>
15
16
xnn_qu8_dwconv_minmax_fp32_ukernel_up24x25__wasmsimd_mul16(size_t channels,size_t output_width,const uint8_t ** input,const void * weights,uint8_t * output,size_t input_stride,size_t output_increment,size_t input_offset,const uint8_t * zero,const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])17 void xnn_qu8_dwconv_minmax_fp32_ukernel_up24x25__wasmsimd_mul16(
18 size_t channels,
19 size_t output_width,
20 const uint8_t** input,
21 const void* weights,
22 uint8_t* output,
23 size_t input_stride,
24 size_t output_increment,
25 size_t input_offset,
26 const uint8_t* zero,
27 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
28 {
29 assert(channels != 0);
30 assert(output_width != 0);
31
32 const v128_t vkernel_zero_point = wasm_u32x4_load16x4(params->fp32_wasmsimd.kernel_zero_point);
33 do {
34 const uint8_t* i0 = input[0];
35 assert(i0 != NULL);
36 if XNN_UNPREDICTABLE(i0 != zero) {
37 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
38 }
39 const uint8_t* i1 = input[1];
40 assert(i1 != NULL);
41 if XNN_UNPREDICTABLE(i1 != zero) {
42 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
43 }
44 const uint8_t* i2 = input[2];
45 assert(i2 != NULL);
46 if XNN_UNPREDICTABLE(i2 != zero) {
47 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
48 }
49 const uint8_t* i3 = input[3];
50 assert(i3 != NULL);
51 if XNN_UNPREDICTABLE(i3 != zero) {
52 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
53 }
54 const uint8_t* i4 = input[4];
55 assert(i4 != NULL);
56 if XNN_UNPREDICTABLE(i4 != zero) {
57 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
58 }
59 const uint8_t* i5 = input[5];
60 assert(i5 != NULL);
61 if XNN_UNPREDICTABLE(i5 != zero) {
62 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
63 }
64 const uint8_t* i6 = input[6];
65 assert(i6 != NULL);
66 if XNN_UNPREDICTABLE(i6 != zero) {
67 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
68 }
69 const uint8_t* i7 = input[7];
70 assert(i7 != NULL);
71 if XNN_UNPREDICTABLE(i7 != zero) {
72 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
73 }
74 const uint8_t* i8 = input[8];
75 assert(i8 != NULL);
76 if XNN_UNPREDICTABLE(i8 != zero) {
77 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
78 }
79 const uint8_t* i9 = input[9];
80 assert(i9 != NULL);
81 if XNN_UNPREDICTABLE(i9 != zero) {
82 i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
83 }
84 const uint8_t* i10 = input[10];
85 assert(i10 != NULL);
86 if XNN_UNPREDICTABLE(i10 != zero) {
87 i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
88 }
89 const uint8_t* i11 = input[11];
90 assert(i11 != NULL);
91 if XNN_UNPREDICTABLE(i11 != zero) {
92 i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
93 }
94 const uint8_t* i12 = input[12];
95 assert(i12 != NULL);
96 if XNN_UNPREDICTABLE(i12 != zero) {
97 i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
98 }
99 const uint8_t* i13 = input[13];
100 assert(i13 != NULL);
101 if XNN_UNPREDICTABLE(i13 != zero) {
102 i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
103 }
104 const uint8_t* i14 = input[14];
105 assert(i14 != NULL);
106 if XNN_UNPREDICTABLE(i14 != zero) {
107 i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
108 }
109 const uint8_t* i15 = input[15];
110 assert(i15 != NULL);
111 if XNN_UNPREDICTABLE(i15 != zero) {
112 i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
113 }
114 const uint8_t* i16 = input[16];
115 assert(i16 != NULL);
116 if XNN_UNPREDICTABLE(i16 != zero) {
117 i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
118 }
119 const uint8_t* i17 = input[17];
120 assert(i17 != NULL);
121 if XNN_UNPREDICTABLE(i17 != zero) {
122 i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
123 }
124 const uint8_t* i18 = input[18];
125 assert(i18 != NULL);
126 if XNN_UNPREDICTABLE(i18 != zero) {
127 i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
128 }
129 const uint8_t* i19 = input[19];
130 assert(i19 != NULL);
131 if XNN_UNPREDICTABLE(i19 != zero) {
132 i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
133 }
134 const uint8_t* i20 = input[20];
135 assert(i20 != NULL);
136 if XNN_UNPREDICTABLE(i20 != zero) {
137 i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
138 }
139 const uint8_t* i21 = input[21];
140 assert(i21 != NULL);
141 if XNN_UNPREDICTABLE(i21 != zero) {
142 i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
143 }
144 const uint8_t* i22 = input[22];
145 assert(i22 != NULL);
146 if XNN_UNPREDICTABLE(i22 != zero) {
147 i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
148 }
149 const uint8_t* i23 = input[23];
150 assert(i23 != NULL);
151 if XNN_UNPREDICTABLE(i23 != zero) {
152 i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
153 }
154 const uint8_t* i24 = input[24];
155 assert(i24 != NULL);
156 if XNN_UNPREDICTABLE(i24 != zero) {
157 i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
158 }
159 input = (const uint8_t**) ((uintptr_t) input + input_stride);
160
161 size_t c = channels;
162 const void* w = weights;
163 for (; c >= 24; c -= 24) {
164 v128_t vacc0123 = wasm_v128_load(w);
165 v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
166 v128_t vacc89AB = wasm_v128_load((const void*) ((uintptr_t) w + 8 * sizeof(int32_t)));
167 v128_t vaccCDEF = wasm_v128_load((const void*) ((uintptr_t) w + 12 * sizeof(int32_t)));
168 v128_t vaccGHIJ = wasm_v128_load((const void*) ((uintptr_t) w + 16 * sizeof(int32_t)));
169 v128_t vaccKLMN = wasm_v128_load((const void*) ((uintptr_t) w + 20 * sizeof(int32_t)));
170
171
172 const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0);
173 const v128_t vk0x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
174 const v128_t vi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8);
175 const v128_t vk0x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
176 const v128_t vi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16);
177 const v128_t vk0xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
178 i0 += 24;
179
180 v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567);
181 v128_t vprod89ABCDEF = wasm_i16x8_mul(vi0x89ABCDEF, vk0x89ABCDEF);
182 v128_t vprodGHIJKLMN = wasm_i16x8_mul(vi0xGHIJKLMN, vk0xGHIJKLMN);
183
184 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
185 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
186 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
187 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
188 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
189 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
190
191 const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1);
192 const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
193 const v128_t vi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8);
194 const v128_t vk1x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
195 const v128_t vi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16);
196 const v128_t vk1xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
197 v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567);
198 v128_t vsumx89ABCDEF = wasm_i16x8_add(vi0x89ABCDEF, vi1x89ABCDEF);
199 v128_t vsumxGHIJKLMN = wasm_i16x8_add(vi0xGHIJKLMN, vi1xGHIJKLMN);
200 i1 += 24;
201
202 vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567);
203 vprod89ABCDEF = wasm_i16x8_mul(vi1x89ABCDEF, vk1x89ABCDEF);
204 vprodGHIJKLMN = wasm_i16x8_mul(vi1xGHIJKLMN, vk1xGHIJKLMN);
205
206 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
207 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
208 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
209 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
210 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
211 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
212
213 const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2);
214 const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
215 const v128_t vi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8);
216 const v128_t vk2x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
217 const v128_t vi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16);
218 const v128_t vk2xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
219 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi2x01234567);
220 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi2x89ABCDEF);
221 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi2xGHIJKLMN);
222 i2 += 24;
223
224 vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567);
225 vprod89ABCDEF = wasm_i16x8_mul(vi2x89ABCDEF, vk2x89ABCDEF);
226 vprodGHIJKLMN = wasm_i16x8_mul(vi2xGHIJKLMN, vk2xGHIJKLMN);
227
228 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
229 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
230 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
231 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
232 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
233 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
234
235 const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3);
236 const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
237 const v128_t vi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8);
238 const v128_t vk3x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
239 const v128_t vi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16);
240 const v128_t vk3xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
241 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567);
242 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi3x89ABCDEF);
243 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi3xGHIJKLMN);
244 i3 += 24;
245
246 vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567);
247 vprod89ABCDEF = wasm_i16x8_mul(vi3x89ABCDEF, vk3x89ABCDEF);
248 vprodGHIJKLMN = wasm_i16x8_mul(vi3xGHIJKLMN, vk3xGHIJKLMN);
249
250 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
251 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
252 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
253 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
254 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
255 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
256
257 const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4);
258 const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
259 const v128_t vi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8);
260 const v128_t vk4x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
261 const v128_t vi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16);
262 const v128_t vk4xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
263 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567);
264 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi4x89ABCDEF);
265 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi4xGHIJKLMN);
266 i4 += 24;
267
268 vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567);
269 vprod89ABCDEF = wasm_i16x8_mul(vi4x89ABCDEF, vk4x89ABCDEF);
270 vprodGHIJKLMN = wasm_i16x8_mul(vi4xGHIJKLMN, vk4xGHIJKLMN);
271
272 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
273 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
274 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
275 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
276 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
277 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
278
279 const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5);
280 const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
281 const v128_t vi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8);
282 const v128_t vk5x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
283 const v128_t vi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16);
284 const v128_t vk5xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
285 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567);
286 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi5x89ABCDEF);
287 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi5xGHIJKLMN);
288 i5 += 24;
289
290 vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567);
291 vprod89ABCDEF = wasm_i16x8_mul(vi5x89ABCDEF, vk5x89ABCDEF);
292 vprodGHIJKLMN = wasm_i16x8_mul(vi5xGHIJKLMN, vk5xGHIJKLMN);
293
294 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
295 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
296 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
297 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
298 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
299 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
300
301 const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6);
302 const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
303 const v128_t vi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8);
304 const v128_t vk6x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
305 const v128_t vi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16);
306 const v128_t vk6xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
307 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567);
308 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi6x89ABCDEF);
309 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi6xGHIJKLMN);
310 i6 += 24;
311
312 vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567);
313 vprod89ABCDEF = wasm_i16x8_mul(vi6x89ABCDEF, vk6x89ABCDEF);
314 vprodGHIJKLMN = wasm_i16x8_mul(vi6xGHIJKLMN, vk6xGHIJKLMN);
315
316 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
317 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
318 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
319 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
320 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
321 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
322
323 const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7);
324 const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
325 const v128_t vi7x89ABCDEF = wasm_u16x8_load8x8(i7 + 8);
326 const v128_t vk7x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
327 const v128_t vi7xGHIJKLMN = wasm_u16x8_load8x8(i7 + 16);
328 const v128_t vk7xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
329 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567);
330 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi7x89ABCDEF);
331 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi7xGHIJKLMN);
332 i7 += 24;
333
334 vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567);
335 vprod89ABCDEF = wasm_i16x8_mul(vi7x89ABCDEF, vk7x89ABCDEF);
336 vprodGHIJKLMN = wasm_i16x8_mul(vi7xGHIJKLMN, vk7xGHIJKLMN);
337
338 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
339 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
340 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
341 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
342 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
343 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
344
345 const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8);
346 const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
347 const v128_t vi8x89ABCDEF = wasm_u16x8_load8x8(i8 + 8);
348 const v128_t vk8x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 200 * sizeof(uint8_t)));
349 const v128_t vi8xGHIJKLMN = wasm_u16x8_load8x8(i8 + 16);
350 const v128_t vk8xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 208 * sizeof(uint8_t)));
351 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567);
352 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi8x89ABCDEF);
353 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi8xGHIJKLMN);
354 i8 += 24;
355
356 vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567);
357 vprod89ABCDEF = wasm_i16x8_mul(vi8x89ABCDEF, vk8x89ABCDEF);
358 vprodGHIJKLMN = wasm_i16x8_mul(vi8xGHIJKLMN, vk8xGHIJKLMN);
359
360 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
361 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
362 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
363 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
364 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
365 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
366
367 const v128_t vi9x01234567 = wasm_u16x8_load8x8(i9);
368 const v128_t vk9x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 216 * sizeof(uint8_t)));
369 const v128_t vi9x89ABCDEF = wasm_u16x8_load8x8(i9 + 8);
370 const v128_t vk9x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 224 * sizeof(uint8_t)));
371 const v128_t vi9xGHIJKLMN = wasm_u16x8_load8x8(i9 + 16);
372 const v128_t vk9xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 232 * sizeof(uint8_t)));
373 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi9x01234567);
374 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi9x89ABCDEF);
375 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi9xGHIJKLMN);
376 i9 += 24;
377
378 vprod01234567 = wasm_i16x8_mul(vi9x01234567, vk9x01234567);
379 vprod89ABCDEF = wasm_i16x8_mul(vi9x89ABCDEF, vk9x89ABCDEF);
380 vprodGHIJKLMN = wasm_i16x8_mul(vi9xGHIJKLMN, vk9xGHIJKLMN);
381
382 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
383 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
384 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
385 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
386 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
387 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
388
389 const v128_t vi10x01234567 = wasm_u16x8_load8x8(i10);
390 const v128_t vk10x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 240 * sizeof(uint8_t)));
391 const v128_t vi10x89ABCDEF = wasm_u16x8_load8x8(i10 + 8);
392 const v128_t vk10x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 248 * sizeof(uint8_t)));
393 const v128_t vi10xGHIJKLMN = wasm_u16x8_load8x8(i10 + 16);
394 const v128_t vk10xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 256 * sizeof(uint8_t)));
395 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi10x01234567);
396 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi10x89ABCDEF);
397 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi10xGHIJKLMN);
398 i10 += 24;
399
400 vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567);
401 vprod89ABCDEF = wasm_i16x8_mul(vi10x89ABCDEF, vk10x89ABCDEF);
402 vprodGHIJKLMN = wasm_i16x8_mul(vi10xGHIJKLMN, vk10xGHIJKLMN);
403
404 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
405 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
406 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
407 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
408 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
409 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
410
411 const v128_t vi11x01234567 = wasm_u16x8_load8x8(i11);
412 const v128_t vk11x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 264 * sizeof(uint8_t)));
413 const v128_t vi11x89ABCDEF = wasm_u16x8_load8x8(i11 + 8);
414 const v128_t vk11x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 272 * sizeof(uint8_t)));
415 const v128_t vi11xGHIJKLMN = wasm_u16x8_load8x8(i11 + 16);
416 const v128_t vk11xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 280 * sizeof(uint8_t)));
417 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi11x01234567);
418 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi11x89ABCDEF);
419 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi11xGHIJKLMN);
420 i11 += 24;
421
422 vprod01234567 = wasm_i16x8_mul(vi11x01234567, vk11x01234567);
423 vprod89ABCDEF = wasm_i16x8_mul(vi11x89ABCDEF, vk11x89ABCDEF);
424 vprodGHIJKLMN = wasm_i16x8_mul(vi11xGHIJKLMN, vk11xGHIJKLMN);
425
426 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
427 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
428 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
429 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
430 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
431 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
432
433 const v128_t vi12x01234567 = wasm_u16x8_load8x8(i12);
434 const v128_t vk12x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 288 * sizeof(uint8_t)));
435 const v128_t vi12x89ABCDEF = wasm_u16x8_load8x8(i12 + 8);
436 const v128_t vk12x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 296 * sizeof(uint8_t)));
437 const v128_t vi12xGHIJKLMN = wasm_u16x8_load8x8(i12 + 16);
438 const v128_t vk12xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 304 * sizeof(uint8_t)));
439 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi12x01234567);
440 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi12x89ABCDEF);
441 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi12xGHIJKLMN);
442 i12 += 24;
443
444 vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567);
445 vprod89ABCDEF = wasm_i16x8_mul(vi12x89ABCDEF, vk12x89ABCDEF);
446 vprodGHIJKLMN = wasm_i16x8_mul(vi12xGHIJKLMN, vk12xGHIJKLMN);
447
448 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
449 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
450 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
451 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
452 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
453 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
454
455 const v128_t vi13x01234567 = wasm_u16x8_load8x8(i13);
456 const v128_t vk13x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 312 * sizeof(uint8_t)));
457 const v128_t vi13x89ABCDEF = wasm_u16x8_load8x8(i13 + 8);
458 const v128_t vk13x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 320 * sizeof(uint8_t)));
459 const v128_t vi13xGHIJKLMN = wasm_u16x8_load8x8(i13 + 16);
460 const v128_t vk13xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 328 * sizeof(uint8_t)));
461 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi13x01234567);
462 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi13x89ABCDEF);
463 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi13xGHIJKLMN);
464 i13 += 24;
465
466 vprod01234567 = wasm_i16x8_mul(vi13x01234567, vk13x01234567);
467 vprod89ABCDEF = wasm_i16x8_mul(vi13x89ABCDEF, vk13x89ABCDEF);
468 vprodGHIJKLMN = wasm_i16x8_mul(vi13xGHIJKLMN, vk13xGHIJKLMN);
469
470 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
471 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
472 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
473 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
474 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
475 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
476
477 const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14);
478 const v128_t vk14x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 336 * sizeof(uint8_t)));
479 const v128_t vi14x89ABCDEF = wasm_u16x8_load8x8(i14 + 8);
480 const v128_t vk14x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 344 * sizeof(uint8_t)));
481 const v128_t vi14xGHIJKLMN = wasm_u16x8_load8x8(i14 + 16);
482 const v128_t vk14xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 352 * sizeof(uint8_t)));
483 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567);
484 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi14x89ABCDEF);
485 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi14xGHIJKLMN);
486 i14 += 24;
487
488 vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567);
489 vprod89ABCDEF = wasm_i16x8_mul(vi14x89ABCDEF, vk14x89ABCDEF);
490 vprodGHIJKLMN = wasm_i16x8_mul(vi14xGHIJKLMN, vk14xGHIJKLMN);
491
492 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
493 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
494 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
495 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
496 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
497 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
498
499 const v128_t vi15x01234567 = wasm_u16x8_load8x8(i15);
500 const v128_t vk15x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 360 * sizeof(uint8_t)));
501 const v128_t vi15x89ABCDEF = wasm_u16x8_load8x8(i15 + 8);
502 const v128_t vk15x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 368 * sizeof(uint8_t)));
503 const v128_t vi15xGHIJKLMN = wasm_u16x8_load8x8(i15 + 16);
504 const v128_t vk15xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 376 * sizeof(uint8_t)));
505 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi15x01234567);
506 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi15x89ABCDEF);
507 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi15xGHIJKLMN);
508 i15 += 24;
509
510 vprod01234567 = wasm_i16x8_mul(vi15x01234567, vk15x01234567);
511 vprod89ABCDEF = wasm_i16x8_mul(vi15x89ABCDEF, vk15x89ABCDEF);
512 vprodGHIJKLMN = wasm_i16x8_mul(vi15xGHIJKLMN, vk15xGHIJKLMN);
513
514 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
515 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
516 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
517 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
518 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
519 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
520
521 const v128_t vi16x01234567 = wasm_u16x8_load8x8(i16);
522 const v128_t vk16x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 384 * sizeof(uint8_t)));
523 const v128_t vi16x89ABCDEF = wasm_u16x8_load8x8(i16 + 8);
524 const v128_t vk16x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 392 * sizeof(uint8_t)));
525 const v128_t vi16xGHIJKLMN = wasm_u16x8_load8x8(i16 + 16);
526 const v128_t vk16xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 400 * sizeof(uint8_t)));
527 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi16x01234567);
528 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi16x89ABCDEF);
529 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi16xGHIJKLMN);
530 i16 += 24;
531
532 vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567);
533 vprod89ABCDEF = wasm_i16x8_mul(vi16x89ABCDEF, vk16x89ABCDEF);
534 vprodGHIJKLMN = wasm_i16x8_mul(vi16xGHIJKLMN, vk16xGHIJKLMN);
535
536 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
537 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
538 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
539 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
540 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
541 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
542
543 const v128_t vi17x01234567 = wasm_u16x8_load8x8(i17);
544 const v128_t vk17x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 408 * sizeof(uint8_t)));
545 const v128_t vi17x89ABCDEF = wasm_u16x8_load8x8(i17 + 8);
546 const v128_t vk17x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 416 * sizeof(uint8_t)));
547 const v128_t vi17xGHIJKLMN = wasm_u16x8_load8x8(i17 + 16);
548 const v128_t vk17xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 424 * sizeof(uint8_t)));
549 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi17x01234567);
550 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi17x89ABCDEF);
551 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi17xGHIJKLMN);
552 i17 += 24;
553
554 vprod01234567 = wasm_i16x8_mul(vi17x01234567, vk17x01234567);
555 vprod89ABCDEF = wasm_i16x8_mul(vi17x89ABCDEF, vk17x89ABCDEF);
556 vprodGHIJKLMN = wasm_i16x8_mul(vi17xGHIJKLMN, vk17xGHIJKLMN);
557
558 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
559 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
560 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
561 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
562 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
563 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
564
565 const v128_t vi18x01234567 = wasm_u16x8_load8x8(i18);
566 const v128_t vk18x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 432 * sizeof(uint8_t)));
567 const v128_t vi18x89ABCDEF = wasm_u16x8_load8x8(i18 + 8);
568 const v128_t vk18x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 440 * sizeof(uint8_t)));
569 const v128_t vi18xGHIJKLMN = wasm_u16x8_load8x8(i18 + 16);
570 const v128_t vk18xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 448 * sizeof(uint8_t)));
571 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi18x01234567);
572 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi18x89ABCDEF);
573 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi18xGHIJKLMN);
574 i18 += 24;
575
576 vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567);
577 vprod89ABCDEF = wasm_i16x8_mul(vi18x89ABCDEF, vk18x89ABCDEF);
578 vprodGHIJKLMN = wasm_i16x8_mul(vi18xGHIJKLMN, vk18xGHIJKLMN);
579
580 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
581 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
582 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
583 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
584 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
585 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
586
587 const v128_t vi19x01234567 = wasm_u16x8_load8x8(i19);
588 const v128_t vk19x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 456 * sizeof(uint8_t)));
589 const v128_t vi19x89ABCDEF = wasm_u16x8_load8x8(i19 + 8);
590 const v128_t vk19x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 464 * sizeof(uint8_t)));
591 const v128_t vi19xGHIJKLMN = wasm_u16x8_load8x8(i19 + 16);
592 const v128_t vk19xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 472 * sizeof(uint8_t)));
593 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi19x01234567);
594 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi19x89ABCDEF);
595 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi19xGHIJKLMN);
596 i19 += 24;
597
598 vprod01234567 = wasm_i16x8_mul(vi19x01234567, vk19x01234567);
599 vprod89ABCDEF = wasm_i16x8_mul(vi19x89ABCDEF, vk19x89ABCDEF);
600 vprodGHIJKLMN = wasm_i16x8_mul(vi19xGHIJKLMN, vk19xGHIJKLMN);
601
602 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
603 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
604 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
605 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
606 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
607 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
608
609 const v128_t vi20x01234567 = wasm_u16x8_load8x8(i20);
610 const v128_t vk20x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 480 * sizeof(uint8_t)));
611 const v128_t vi20x89ABCDEF = wasm_u16x8_load8x8(i20 + 8);
612 const v128_t vk20x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 488 * sizeof(uint8_t)));
613 const v128_t vi20xGHIJKLMN = wasm_u16x8_load8x8(i20 + 16);
614 const v128_t vk20xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 496 * sizeof(uint8_t)));
615 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi20x01234567);
616 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi20x89ABCDEF);
617 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi20xGHIJKLMN);
618 i20 += 24;
619
620 vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567);
621 vprod89ABCDEF = wasm_i16x8_mul(vi20x89ABCDEF, vk20x89ABCDEF);
622 vprodGHIJKLMN = wasm_i16x8_mul(vi20xGHIJKLMN, vk20xGHIJKLMN);
623
624 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
625 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
626 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
627 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
628 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
629 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
630
631 const v128_t vi21x01234567 = wasm_u16x8_load8x8(i21);
632 const v128_t vk21x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 504 * sizeof(uint8_t)));
633 const v128_t vi21x89ABCDEF = wasm_u16x8_load8x8(i21 + 8);
634 const v128_t vk21x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 512 * sizeof(uint8_t)));
635 const v128_t vi21xGHIJKLMN = wasm_u16x8_load8x8(i21 + 16);
636 const v128_t vk21xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 520 * sizeof(uint8_t)));
637 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi21x01234567);
638 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi21x89ABCDEF);
639 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi21xGHIJKLMN);
640 i21 += 24;
641
642 vprod01234567 = wasm_i16x8_mul(vi21x01234567, vk21x01234567);
643 vprod89ABCDEF = wasm_i16x8_mul(vi21x89ABCDEF, vk21x89ABCDEF);
644 vprodGHIJKLMN = wasm_i16x8_mul(vi21xGHIJKLMN, vk21xGHIJKLMN);
645
646 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
647 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
648 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
649 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
650 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
651 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
652
653 const v128_t vi22x01234567 = wasm_u16x8_load8x8(i22);
654 const v128_t vk22x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 528 * sizeof(uint8_t)));
655 const v128_t vi22x89ABCDEF = wasm_u16x8_load8x8(i22 + 8);
656 const v128_t vk22x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 536 * sizeof(uint8_t)));
657 const v128_t vi22xGHIJKLMN = wasm_u16x8_load8x8(i22 + 16);
658 const v128_t vk22xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 544 * sizeof(uint8_t)));
659 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi22x01234567);
660 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi22x89ABCDEF);
661 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi22xGHIJKLMN);
662 i22 += 24;
663
664 vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567);
665 vprod89ABCDEF = wasm_i16x8_mul(vi22x89ABCDEF, vk22x89ABCDEF);
666 vprodGHIJKLMN = wasm_i16x8_mul(vi22xGHIJKLMN, vk22xGHIJKLMN);
667
668 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
669 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
670 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
671 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
672 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
673 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
674
675 const v128_t vi23x01234567 = wasm_u16x8_load8x8(i23);
676 const v128_t vk23x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 552 * sizeof(uint8_t)));
677 const v128_t vi23x89ABCDEF = wasm_u16x8_load8x8(i23 + 8);
678 const v128_t vk23x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 560 * sizeof(uint8_t)));
679 const v128_t vi23xGHIJKLMN = wasm_u16x8_load8x8(i23 + 16);
680 const v128_t vk23xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 568 * sizeof(uint8_t)));
681 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi23x01234567);
682 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi23x89ABCDEF);
683 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi23xGHIJKLMN);
684 i23 += 24;
685
686 vprod01234567 = wasm_i16x8_mul(vi23x01234567, vk23x01234567);
687 vprod89ABCDEF = wasm_i16x8_mul(vi23x89ABCDEF, vk23x89ABCDEF);
688 vprodGHIJKLMN = wasm_i16x8_mul(vi23xGHIJKLMN, vk23xGHIJKLMN);
689
690 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
691 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
692 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
693 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
694 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
695 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
696
697 const v128_t vi24x01234567 = wasm_u16x8_load8x8(i24);
698 const v128_t vk24x01234567 = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 576 * sizeof(uint8_t)));
699 const v128_t vi24x89ABCDEF = wasm_u16x8_load8x8(i24 + 8);
700 const v128_t vk24x89ABCDEF = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 584 * sizeof(uint8_t)));
701 const v128_t vi24xGHIJKLMN = wasm_u16x8_load8x8(i24 + 16);
702 const v128_t vk24xGHIJKLMN = wasm_u16x8_load8x8((const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 592 * sizeof(uint8_t)));
703 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi24x01234567);
704 vsumx89ABCDEF = wasm_i16x8_add(vsumx89ABCDEF, vi24x89ABCDEF);
705 vsumxGHIJKLMN = wasm_i16x8_add(vsumxGHIJKLMN, vi24xGHIJKLMN);
706 i24 += 24;
707
708 vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567);
709 vprod89ABCDEF = wasm_i16x8_mul(vi24x89ABCDEF, vk24x89ABCDEF);
710 vprodGHIJKLMN = wasm_i16x8_mul(vi24xGHIJKLMN, vk24xGHIJKLMN);
711
712 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
713 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
714 vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vprod89ABCDEF));
715 vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vprod89ABCDEF));
716 vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vprodGHIJKLMN));
717 vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vprodGHIJKLMN));
718
719 vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point));
720 vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point));
721 vacc89AB = wasm_i32x4_sub(vacc89AB, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx89ABCDEF), vkernel_zero_point));
722 vaccCDEF = wasm_i32x4_sub(vaccCDEF, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx89ABCDEF), vkernel_zero_point));
723 vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumxGHIJKLMN), vkernel_zero_point));
724 vaccKLMN = wasm_i32x4_sub(vaccKLMN, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumxGHIJKLMN), vkernel_zero_point));
725
726 w = (const void*) ((uintptr_t) w + 24 * sizeof(int32_t) + 600 * sizeof(uint8_t));
727
728 vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
729 vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
730 vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB);
731 vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF);
732 vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ);
733 vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN);
734
735 const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
736 vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
737 vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
738 vacc89AB = wasm_f32x4_mul(vacc89AB, vscale);
739 vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale);
740 vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale);
741 vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale);
742
743 const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
744 vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
745 vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
746 vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias);
747 vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias);
748 vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias);
749 vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias);
750
751 const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
752 vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
753 vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
754 vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min);
755 vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min);
756 vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min);
757 vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min);
758
759 const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
760 vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
761 vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
762 vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point);
763 vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point);
764 vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point);
765 vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point);
766
767 v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
768 v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF);
769 v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN);
770
771 v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
772 v128_t voutGHIJKLMNGHIJKLMN = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutGHIJKLMN);
773
774 const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
775 vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
776 voutGHIJKLMNGHIJKLMN = wasm_u8x16_min(voutGHIJKLMNGHIJKLMN, voutput_max);
777
778 wasm_v128_store(output, vout0123456789ABCDEF);
779 *((double*) (output + 16)) = wasm_f64x2_extract_lane(voutGHIJKLMNGHIJKLMN, 0);
780 output += 24;
781 }
782 if XNN_UNLIKELY(c != 0) {
783 const uint8_t* k = (const uint8_t*) ((uintptr_t) w + 24 * sizeof(int32_t));
784 do {
785 v128_t vacc0123 = wasm_v128_load(w);
786 v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
787
788
789 const v128_t vi0x01234567 = wasm_u16x8_load8x8(i0);
790 const v128_t vk0x01234567 = wasm_u16x8_load8x8(k);
791 i0 += 8;
792
793 v128_t vprod01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567);
794
795 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
796 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
797
798 const v128_t vi1x01234567 = wasm_u16x8_load8x8(i1);
799 const v128_t vk1x01234567 = wasm_u16x8_load8x8((const void*) (k + 24));
800 v128_t vsumx01234567 = wasm_i16x8_add(vi0x01234567, vi1x01234567);
801 i1 += 8;
802
803 vprod01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567);
804
805 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
806 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
807
808 const v128_t vi2x01234567 = wasm_u16x8_load8x8(i2);
809 const v128_t vk2x01234567 = wasm_u16x8_load8x8((const void*) (k + 48));
810 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi2x01234567);
811 i2 += 8;
812
813 vprod01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567);
814
815 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
816 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
817
818 const v128_t vi3x01234567 = wasm_u16x8_load8x8(i3);
819 const v128_t vk3x01234567 = wasm_u16x8_load8x8((const void*) (k + 72));
820 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi3x01234567);
821 i3 += 8;
822
823 vprod01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567);
824
825 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
826 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
827
828 const v128_t vi4x01234567 = wasm_u16x8_load8x8(i4);
829 const v128_t vk4x01234567 = wasm_u16x8_load8x8((const void*) (k + 96));
830 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi4x01234567);
831 i4 += 8;
832
833 vprod01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567);
834
835 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
836 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
837
838 const v128_t vi5x01234567 = wasm_u16x8_load8x8(i5);
839 const v128_t vk5x01234567 = wasm_u16x8_load8x8((const void*) (k + 120));
840 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi5x01234567);
841 i5 += 8;
842
843 vprod01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567);
844
845 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
846 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
847
848 const v128_t vi6x01234567 = wasm_u16x8_load8x8(i6);
849 const v128_t vk6x01234567 = wasm_u16x8_load8x8((const void*) (k + 144));
850 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi6x01234567);
851 i6 += 8;
852
853 vprod01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567);
854
855 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
856 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
857
858 const v128_t vi7x01234567 = wasm_u16x8_load8x8(i7);
859 const v128_t vk7x01234567 = wasm_u16x8_load8x8((const void*) (k + 168));
860 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi7x01234567);
861 i7 += 8;
862
863 vprod01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567);
864
865 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
866 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
867
868 const v128_t vi8x01234567 = wasm_u16x8_load8x8(i8);
869 const v128_t vk8x01234567 = wasm_u16x8_load8x8((const void*) (k + 192));
870 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi8x01234567);
871 i8 += 8;
872
873 vprod01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567);
874
875 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
876 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
877
878 const v128_t vi9x01234567 = wasm_u16x8_load8x8(i9);
879 const v128_t vk9x01234567 = wasm_u16x8_load8x8((const void*) (k + 216));
880 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi9x01234567);
881 i9 += 8;
882
883 vprod01234567 = wasm_i16x8_mul(vi9x01234567, vk9x01234567);
884
885 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
886 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
887
888 const v128_t vi10x01234567 = wasm_u16x8_load8x8(i10);
889 const v128_t vk10x01234567 = wasm_u16x8_load8x8((const void*) (k + 240));
890 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi10x01234567);
891 i10 += 8;
892
893 vprod01234567 = wasm_i16x8_mul(vi10x01234567, vk10x01234567);
894
895 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
896 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
897
898 const v128_t vi11x01234567 = wasm_u16x8_load8x8(i11);
899 const v128_t vk11x01234567 = wasm_u16x8_load8x8((const void*) (k + 264));
900 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi11x01234567);
901 i11 += 8;
902
903 vprod01234567 = wasm_i16x8_mul(vi11x01234567, vk11x01234567);
904
905 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
906 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
907
908 const v128_t vi12x01234567 = wasm_u16x8_load8x8(i12);
909 const v128_t vk12x01234567 = wasm_u16x8_load8x8((const void*) (k + 288));
910 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi12x01234567);
911 i12 += 8;
912
913 vprod01234567 = wasm_i16x8_mul(vi12x01234567, vk12x01234567);
914
915 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
916 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
917
918 const v128_t vi13x01234567 = wasm_u16x8_load8x8(i13);
919 const v128_t vk13x01234567 = wasm_u16x8_load8x8((const void*) (k + 312));
920 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi13x01234567);
921 i13 += 8;
922
923 vprod01234567 = wasm_i16x8_mul(vi13x01234567, vk13x01234567);
924
925 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
926 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
927
928 const v128_t vi14x01234567 = wasm_u16x8_load8x8(i14);
929 const v128_t vk14x01234567 = wasm_u16x8_load8x8((const void*) (k + 336));
930 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi14x01234567);
931 i14 += 8;
932
933 vprod01234567 = wasm_i16x8_mul(vi14x01234567, vk14x01234567);
934
935 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
936 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
937
938 const v128_t vi15x01234567 = wasm_u16x8_load8x8(i15);
939 const v128_t vk15x01234567 = wasm_u16x8_load8x8((const void*) (k + 360));
940 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi15x01234567);
941 i15 += 8;
942
943 vprod01234567 = wasm_i16x8_mul(vi15x01234567, vk15x01234567);
944
945 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
946 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
947
948 const v128_t vi16x01234567 = wasm_u16x8_load8x8(i16);
949 const v128_t vk16x01234567 = wasm_u16x8_load8x8((const void*) (k + 384));
950 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi16x01234567);
951 i16 += 8;
952
953 vprod01234567 = wasm_i16x8_mul(vi16x01234567, vk16x01234567);
954
955 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
956 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
957
958 const v128_t vi17x01234567 = wasm_u16x8_load8x8(i17);
959 const v128_t vk17x01234567 = wasm_u16x8_load8x8((const void*) (k + 408));
960 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi17x01234567);
961 i17 += 8;
962
963 vprod01234567 = wasm_i16x8_mul(vi17x01234567, vk17x01234567);
964
965 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
966 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
967
968 const v128_t vi18x01234567 = wasm_u16x8_load8x8(i18);
969 const v128_t vk18x01234567 = wasm_u16x8_load8x8((const void*) (k + 432));
970 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi18x01234567);
971 i18 += 8;
972
973 vprod01234567 = wasm_i16x8_mul(vi18x01234567, vk18x01234567);
974
975 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
976 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
977
978 const v128_t vi19x01234567 = wasm_u16x8_load8x8(i19);
979 const v128_t vk19x01234567 = wasm_u16x8_load8x8((const void*) (k + 456));
980 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi19x01234567);
981 i19 += 8;
982
983 vprod01234567 = wasm_i16x8_mul(vi19x01234567, vk19x01234567);
984
985 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
986 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
987
988 const v128_t vi20x01234567 = wasm_u16x8_load8x8(i20);
989 const v128_t vk20x01234567 = wasm_u16x8_load8x8((const void*) (k + 480));
990 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi20x01234567);
991 i20 += 8;
992
993 vprod01234567 = wasm_i16x8_mul(vi20x01234567, vk20x01234567);
994
995 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
996 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
997
998 const v128_t vi21x01234567 = wasm_u16x8_load8x8(i21);
999 const v128_t vk21x01234567 = wasm_u16x8_load8x8((const void*) (k + 504));
1000 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi21x01234567);
1001 i21 += 8;
1002
1003 vprod01234567 = wasm_i16x8_mul(vi21x01234567, vk21x01234567);
1004
1005 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
1006 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
1007
1008 const v128_t vi22x01234567 = wasm_u16x8_load8x8(i22);
1009 const v128_t vk22x01234567 = wasm_u16x8_load8x8((const void*) (k + 528));
1010 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi22x01234567);
1011 i22 += 8;
1012
1013 vprod01234567 = wasm_i16x8_mul(vi22x01234567, vk22x01234567);
1014
1015 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
1016 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
1017
1018 const v128_t vi23x01234567 = wasm_u16x8_load8x8(i23);
1019 const v128_t vk23x01234567 = wasm_u16x8_load8x8((const void*) (k + 552));
1020 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi23x01234567);
1021 i23 += 8;
1022
1023 vprod01234567 = wasm_i16x8_mul(vi23x01234567, vk23x01234567);
1024
1025 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
1026 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
1027
1028 const v128_t vi24x01234567 = wasm_u16x8_load8x8(i24);
1029 const v128_t vk24x01234567 = wasm_u16x8_load8x8((const void*) (k + 576));
1030 vsumx01234567 = wasm_i16x8_add(vsumx01234567, vi24x01234567);
1031 i24 += 8;
1032
1033 vprod01234567 = wasm_i16x8_mul(vi24x01234567, vk24x01234567);
1034
1035 vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vprod01234567));
1036 vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vprod01234567));
1037
1038 k += 8;
1039
1040 vacc0123 = wasm_i32x4_sub(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vsumx01234567), vkernel_zero_point));
1041 vacc4567 = wasm_i32x4_sub(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vsumx01234567), vkernel_zero_point));
1042
1043 vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
1044 vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
1045
1046 const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
1047 vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
1048 vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
1049
1050 const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
1051 vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
1052 vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
1053
1054 const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
1055 vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
1056 vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
1057
1058 const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
1059 vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
1060 vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
1061
1062 v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
1063 v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
1064
1065 const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
1066 vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
1067
1068 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
1069
1070 if XNN_LIKELY(c >= 8) {
1071 *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
1072 output += 8;
1073 c -= 8;
1074 } else {
1075 if (c & 4) {
1076 *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
1077 vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
1078 output += 4;
1079 }
1080 uint32_t vout0123 = wasm_i32x4_extract_lane(vout0123456701234567, 0);
1081 if (c & 2) {
1082 *((uint16_t*) output) = (uint16_t) vout0123;
1083 vout0123 >>= 16;
1084 output += 2;
1085 }
1086 if (c & 1) {
1087 *output = (uint8_t) vout0123;
1088 output += 1;
1089 }
1090 c = 0;
1091 }
1092 } while (c != 0);
1093 }
1094
1095 output = (uint8_t*) ((uintptr_t) output + output_increment);
1096 } while (--output_width != 0);
1097 }
1098