1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <immintrin.h>
9
10 #include <xnnpack/argmaxpool.h>
11 #include <xnnpack/avgpool.h>
12 #include <xnnpack/common.h>
13 #include <xnnpack/dwconv.h>
14 #include <xnnpack/fill.h>
15 #include <xnnpack/gavgpool.h>
16 #include <xnnpack/gemm.h>
17 #include <xnnpack/ibilinear.h>
18 #include <xnnpack/igemm.h>
19 #include <xnnpack/intrinsics-polyfill.h>
20 #include <xnnpack/math.h>
21 #include <xnnpack/maxpool.h>
22 #include <xnnpack/pad.h>
23 #include <xnnpack/prelu.h>
24 #include <xnnpack/raddstoreexpminusmax.h>
25 #include <xnnpack/rmax.h>
26 #include <xnnpack/transpose.h>
27 #include <xnnpack/unaligned.h>
28 #include <xnnpack/unpool.h>
29 #include <xnnpack/vadd.h>
30 #include <xnnpack/vcvt.h>
31 #include <xnnpack/vlrelu.h>
32 #include <xnnpack/vmul.h>
33 #include <xnnpack/vunary.h>
34 #include <xnnpack/zip.h>
35
36
37 void xnn_f16_f32_vcvt_ukernel__sse2_int16_x32(
38 size_t n,
39 const void* input,
40 float* output,
41 const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
42 {
43 assert(n != 0);
44 assert(n % sizeof(uint16_t) == 0);
45 assert(input != NULL);
46 assert(output != NULL);
47
48 const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
49 const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
50 const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
51 const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
52 const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
53 const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
54
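  // Each f16 value is converted by clearing its sign bit and building two candidates:
  //  - normalized path: the non-sign bits are shifted into f32 position (low halfword << 13,
  //    high halfword >> 3 plus exp_offset), reinterpreted as f32, and multiplied by exp_scale
  //    to rebias the exponent;
  //  - denormal path: the non-sign bits are paired with magic_mask, reinterpreted as f32,
  //    and magic_bias is subtracted, letting float arithmetic normalize subnormal inputs.
  // A per-lane compare against denorm_cutoff selects between the two, and the sign is
  // ORed back in last.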
55 const uint16_t* i = (const uint16_t*) input;
56 for (; n >= 32 * sizeof(uint16_t); n -= 32 * sizeof(uint16_t)) {
57 const __m128i vh0 = _mm_loadu_si128((const __m128i*) i);
58 const __m128i vh1 = _mm_loadu_si128((const __m128i*) (i + 8));
59 const __m128i vh2 = _mm_loadu_si128((const __m128i*) (i + 16));
60 const __m128i vh3 = _mm_loadu_si128((const __m128i*) (i + 24));
61 i += 32;
62
63 const __m128i vsign0 = _mm_and_si128(vh0, vsign_mask);
64 const __m128i vsign1 = _mm_and_si128(vh1, vsign_mask);
65 const __m128i vsign2 = _mm_and_si128(vh2, vsign_mask);
66 const __m128i vsign3 = _mm_and_si128(vh3, vsign_mask);
67
68 const __m128i vnonsign0 = _mm_xor_si128(vh0, vsign0);
69 const __m128i vnonsign1 = _mm_xor_si128(vh1, vsign1);
70 const __m128i vnonsign2 = _mm_xor_si128(vh2, vsign2);
71 const __m128i vnonsign3 = _mm_xor_si128(vh3, vsign3);
72
73 const __m128i vprenorm0 = _mm_slli_epi16(vnonsign0, 13);
74 const __m128i vprenorm1 = _mm_add_epi16(_mm_srli_epi16(vnonsign0, 3), vexp_offset);
75 const __m128i vprenorm2 = _mm_slli_epi16(vnonsign1, 13);
76 const __m128i vprenorm3 = _mm_add_epi16(_mm_srli_epi16(vnonsign1, 3), vexp_offset);
77 const __m128i vprenorm4 = _mm_slli_epi16(vnonsign2, 13);
78 const __m128i vprenorm5 = _mm_add_epi16(_mm_srli_epi16(vnonsign2, 3), vexp_offset);
79 const __m128i vprenorm6 = _mm_slli_epi16(vnonsign3, 13);
80 const __m128i vprenorm7 = _mm_add_epi16(_mm_srli_epi16(vnonsign3, 3), vexp_offset);
81
82 const __m128i vnorm0 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm0, vprenorm1)), vexp_scale));
83 const __m128i vnorm1 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm0, vprenorm1)), vexp_scale));
84 const __m128i vnorm2 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm2, vprenorm3)), vexp_scale));
85 const __m128i vnorm3 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm2, vprenorm3)), vexp_scale));
86 const __m128i vnorm4 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm4, vprenorm5)), vexp_scale));
87 const __m128i vnorm5 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm4, vprenorm5)), vexp_scale));
88 const __m128i vnorm6 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm6, vprenorm7)), vexp_scale));
89 const __m128i vnorm7 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm6, vprenorm7)), vexp_scale));
90
91 const __m128i vdenorm0 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
92 const __m128i vdenorm1 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
93 const __m128i vdenorm2 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
94 const __m128i vdenorm3 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
95 const __m128i vdenorm4 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign2, vmagic_mask)), vmagic_bias));
96 const __m128i vdenorm5 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign2, vmagic_mask)), vmagic_bias));
97 const __m128i vdenorm6 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign3, vmagic_mask)), vmagic_bias));
98 const __m128i vdenorm7 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign3, vmagic_mask)), vmagic_bias));
99
100 const __m128i vmask0 = _mm_cmpgt_epi16(vnonsign0, vdenorm_cutoff);
101 const __m128i vmask1 = _mm_cmpgt_epi16(vnonsign1, vdenorm_cutoff);
102 const __m128i vmask2 = _mm_cmpgt_epi16(vnonsign2, vdenorm_cutoff);
103 const __m128i vmask3 = _mm_cmpgt_epi16(vnonsign3, vdenorm_cutoff);
104
105 const __m128i vxmask0 = _mm_unpacklo_epi16(vmask0, vmask0);
106 const __m128i vf0 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign0),
107 _mm_or_si128(_mm_and_si128(vxmask0, vnorm0), _mm_andnot_si128(vxmask0, vdenorm0)));
108 const __m128i vxmask1 = _mm_unpackhi_epi16(vmask0, vmask0);
109 const __m128i vf1 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign0),
110 _mm_or_si128(_mm_and_si128(vxmask1, vnorm1), _mm_andnot_si128(vxmask1, vdenorm1)));
111 const __m128i vxmask2 = _mm_unpacklo_epi16(vmask1, vmask1);
112 const __m128i vf2 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign1),
113 _mm_or_si128(_mm_and_si128(vxmask2, vnorm2), _mm_andnot_si128(vxmask2, vdenorm2)));
114 const __m128i vxmask3 = _mm_unpackhi_epi16(vmask1, vmask1);
115 const __m128i vf3 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign1),
116 _mm_or_si128(_mm_and_si128(vxmask3, vnorm3), _mm_andnot_si128(vxmask3, vdenorm3)));
117 const __m128i vxmask4 = _mm_unpacklo_epi16(vmask2, vmask2);
118 const __m128i vf4 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign2),
119 _mm_or_si128(_mm_and_si128(vxmask4, vnorm4), _mm_andnot_si128(vxmask4, vdenorm4)));
120 const __m128i vxmask5 = _mm_unpackhi_epi16(vmask2, vmask2);
121 const __m128i vf5 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign2),
122 _mm_or_si128(_mm_and_si128(vxmask5, vnorm5), _mm_andnot_si128(vxmask5, vdenorm5)));
123 const __m128i vxmask6 = _mm_unpacklo_epi16(vmask3, vmask3);
124 const __m128i vf6 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign3),
125 _mm_or_si128(_mm_and_si128(vxmask6, vnorm6), _mm_andnot_si128(vxmask6, vdenorm6)));
126 const __m128i vxmask7 = _mm_unpackhi_epi16(vmask3, vmask3);
127 const __m128i vf7 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign3),
128 _mm_or_si128(_mm_and_si128(vxmask7, vnorm7), _mm_andnot_si128(vxmask7, vdenorm7)));
129
130 _mm_storeu_ps(output, _mm_castsi128_ps(vf0));
131 _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf1));
132 _mm_storeu_ps(output + 8, _mm_castsi128_ps(vf2));
133 _mm_storeu_ps(output + 12, _mm_castsi128_ps(vf3));
134 _mm_storeu_ps(output + 16, _mm_castsi128_ps(vf4));
135 _mm_storeu_ps(output + 20, _mm_castsi128_ps(vf5));
136 _mm_storeu_ps(output + 24, _mm_castsi128_ps(vf6));
137 _mm_storeu_ps(output + 28, _mm_castsi128_ps(vf7));
138 output += 32;
139 }
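  // Remainder handling: process 8 halfwords at a time, then one final partial vector of
  // 1-7 halfwords. The partial load may read past the end of the input buffer, which is
  // why the kernel is annotated with XNN_OOB_READS.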
140 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
141 const __m128i vh = _mm_loadu_si128((const __m128i*) i);
142 i += 8;
143
144 const __m128i vsign = _mm_and_si128(vh, vsign_mask);
145
146 const __m128i vnonsign = _mm_xor_si128(vh, vsign);
147
148 const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
149 const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);
150
151 const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
152 const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
153
154 const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
155 const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));
156
157 const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);
158
159 const __m128i vxmask_lo = _mm_unpacklo_epi16(vmask, vmask);
160 const __m128i vf_lo = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
161 _mm_or_si128(_mm_and_si128(vxmask_lo, vnorm_lo), _mm_andnot_si128(vxmask_lo, vdenorm_lo)));
162
163 const __m128i vxmask_hi = _mm_unpackhi_epi16(vmask, vmask);
164 const __m128i vf_hi = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
165 _mm_or_si128(_mm_and_si128(vxmask_hi, vnorm_hi), _mm_andnot_si128(vxmask_hi, vdenorm_hi)));
166
167 _mm_storeu_ps(output, _mm_castsi128_ps(vf_lo));
168 _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf_hi));
169 output += 8;
170 }
171 if XNN_UNPREDICTABLE(n != 0) {
172 const __m128i vh = _mm_loadu_si128((const __m128i*) i);
173
174 const __m128i vsign = _mm_and_si128(vh, vsign_mask);
175
176 const __m128i vnonsign = _mm_xor_si128(vh, vsign);
177
178 const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
179 const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);
180
181 const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
182 const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
183
184 const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
185 const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));
186
187 const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);
188
189 const __m128i vxmask_lo = _mm_unpacklo_epi16(vmask, vmask);
190 __m128i vf = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
191 _mm_or_si128(_mm_and_si128(vxmask_lo, vnorm_lo), _mm_andnot_si128(vxmask_lo, vdenorm_lo)));
192
193 if (n & (4 * sizeof(uint16_t))) {
194 _mm_storeu_ps(output, _mm_castsi128_ps(vf));
195 output += 4;
196
197 const __m128i vxmask_hi = _mm_unpackhi_epi16(vmask, vmask);
198 vf = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
199 _mm_or_si128(_mm_and_si128(vxmask_hi, vnorm_hi), _mm_andnot_si128(vxmask_hi, vdenorm_hi)));
200 }
201 if (n & (2 * sizeof(uint16_t))) {
202 _mm_storel_pi((__m64*) output, _mm_castsi128_ps(vf));
203 output += 2;
204
205 vf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(vf), _mm_castsi128_ps(vf)));
206 }
207 if (n & (1 * sizeof(uint16_t))) {
208 _mm_store_ss(output, _mm_castsi128_ps(vf));
209 }
210 }
211 }
212
213 void xnn_f16_vabs_ukernel__sse2_x16(
214 size_t n,
215 const void* input,
216 void* output,
217 const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
218 {
219 assert(n != 0);
220 assert(n % sizeof(uint16_t) == 0);
221 assert(input != NULL);
222 assert(output != NULL);
223
224 const uint16_t* i = (const uint16_t*) input;
225 uint16_t* o = (uint16_t*) output;
226 const __m128i vnonsign_mask = _mm_load_si128((const __m128i*) params->sse.nonsign_mask);
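  // |x| for IEEE half-precision is computed bitwise: AND with nonsign_mask clears the
  // sign bit, so the kernel needs no native f16 arithmetic.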
227 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
228 __m128i vacc0 = _mm_loadu_si128((const __m128i*) i);
229 __m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8));
230 i += 16;
231
232 vacc0 = _mm_and_si128(vacc0, vnonsign_mask);
233 vacc1 = _mm_and_si128(vacc1, vnonsign_mask);
234
235 _mm_storeu_si128((__m128i*) o, vacc0);
236 _mm_storeu_si128((__m128i*) (o + 8), vacc1);
237 o += 16;
238 }
239 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
240 __m128i vacc = _mm_loadu_si128((const __m128i*) i);
241 i += 8;
242 vacc = _mm_and_si128(vacc, vnonsign_mask);
243 _mm_storeu_si128((__m128i*) o, vacc);
244 o += 8;
245 }
246 if XNN_UNLIKELY(n != 0) {
247 __m128i vacc = _mm_loadu_si128((const __m128i*) i);
248 vacc = _mm_and_si128(vacc, vnonsign_mask);
249 if (n & (4 * sizeof(uint16_t))) {
250 _mm_storel_epi64((__m128i*) o, vacc);
251 o += 4;
252 vacc = _mm_unpackhi_epi64(vacc, vacc);
253 }
254 if (n & (2 * sizeof(uint16_t))) {
255 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vacc));
256 o += 2;
257 vacc = _mm_srli_epi64(vacc, 32);
258 }
259 if (n & (1 * sizeof(uint16_t))) {
260 *o = (uint16_t) _mm_extract_epi16(vacc, 0);
261 }
262 }
263 }
264
265 void xnn_f16_vneg_ukernel__sse2_x16(
266 size_t n,
267 const void* input,
268 void* output,
269 const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
270 {
271 assert(n != 0);
272 assert(n % sizeof(uint16_t) == 0);
273 assert(input != NULL);
274 assert(output != NULL);
275
276 const uint16_t* i = (const uint16_t*) input;
277 uint16_t* o = (uint16_t*) output;
278 const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse.sign_mask);
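  // -x for IEEE half-precision is computed bitwise: XOR with sign_mask flips the sign
  // bit, again without native f16 arithmetic.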
279 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
280 __m128i vacc0 = _mm_loadu_si128((const __m128i*) i);
281 __m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8));
282 i += 16;
283
284 vacc0 = _mm_xor_si128(vacc0, vsign_mask);
285 vacc1 = _mm_xor_si128(vacc1, vsign_mask);
286
287 _mm_storeu_si128((__m128i*) o, vacc0);
288 _mm_storeu_si128((__m128i*) (o + 8), vacc1);
289 o += 16;
290 }
291 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
292 __m128i vacc = _mm_loadu_si128((const __m128i*) i);
293 i += 8;
294 vacc = _mm_xor_si128(vacc, vsign_mask);
295 _mm_storeu_si128((__m128i*) o, vacc);
296 o += 8;
297 }
298 if XNN_UNLIKELY(n != 0) {
299 __m128i vacc = _mm_loadu_si128((const __m128i*) i);
300 vacc = _mm_xor_si128(vacc, vsign_mask);
301 if (n & (4 * sizeof(uint16_t))) {
302 _mm_storel_epi64((__m128i*) o, vacc);
303 o += 4;
304 vacc = _mm_unpackhi_epi64(vacc, vacc);
305 }
306 if (n & (2 * sizeof(uint16_t))) {
307 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vacc));
308 o += 2;
309 vacc = _mm_srli_epi64(vacc, 32);
310 }
311 if (n & (1 * sizeof(uint16_t))) {
312 *o = (uint16_t) _mm_extract_epi16(vacc, 0);
313 }
314 }
315 }
316
317 void xnn_f32_argmaxpool_ukernel_4x__sse2_c4(
318 size_t output_pixels,
319 size_t pooling_elements,
320 size_t channels,
321 const float** input,
322 size_t input_offset,
323 float* output,
324 uint32_t* index,
325 size_t input_increment,
326 size_t output_increment) XNN_OOB_READS
327 {
328 assert(output_pixels != 0);
329 assert(pooling_elements != 0);
330 assert(pooling_elements <= 4);
331 assert(channels != 0);
332
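  // For each output pixel, up to 4 pooling elements are compared lane-wise: vmax keeps the
  // running maximum and vidx keeps the index of the element that produced it, blended with
  // compare masks. Missing pooling elements alias i0, so their repeated values never compare
  // strictly greater and cannot replace the index.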
333 do {
334 const float* i0 = input[0];
335 const float* i1 = input[1];
336 const float* i2 = input[2];
337 const float* i3 = input[3];
338 i0 = (const float*) ((uintptr_t) i0 + input_offset);
339 i1 = (const float*) ((uintptr_t) i1 + input_offset);
340 i2 = (const float*) ((uintptr_t) i2 + input_offset);
341 i3 = (const float*) ((uintptr_t) i3 + input_offset);
342 if (pooling_elements < 2) {
343 i1 = i0;
344 }
345 if (pooling_elements <= 2) {
346 i2 = i0;
347 }
348 if (pooling_elements != 4) {
349 i3 = i0;
350 }
351
352 size_t c = channels;
353 for (; c >= 4; c -= 4) {
354 const __m128 vi0 = _mm_loadu_ps(i0);
355 i0 += 4;
356 const __m128 vi1 = _mm_loadu_ps(i1);
357 i1 += 4;
358 const __m128 vi2 = _mm_loadu_ps(i2);
359 i2 += 4;
360 const __m128 vi3 = _mm_loadu_ps(i3);
361 i3 += 4;
362
363 __m128 vmax = vi0;
364 __m128i vidx = _mm_setzero_si128();
365
366 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
367 vmax = _mm_max_ps(vi1, vmax);
368 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, _mm_set1_epi32(1)));
369
370 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
371 vmax = _mm_max_ps(vi2, vmax);
372 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, _mm_set1_epi32(2)));
373
374 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
375 vmax = _mm_max_ps(vi3, vmax);
376 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, _mm_set1_epi32(3)));
377
378 _mm_storeu_ps(output, vmax);
379 output += 4;
380 _mm_storeu_si128((__m128i*) index, vidx);
381 index += 4;
382 }
383 if (c != 0) {
384 const __m128 vi0 = _mm_loadu_ps(i0);
385 const __m128 vi1 = _mm_loadu_ps(i1);
386 const __m128 vi2 = _mm_loadu_ps(i2);
387 const __m128 vi3 = _mm_loadu_ps(i3);
388
389 __m128 vmax = vi0;
390 __m128i vidx = _mm_setzero_si128();
391
392 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
393 vmax = _mm_max_ps(vi1, vmax);
394 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, _mm_set1_epi32(1)));
395
396 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
397 vmax = _mm_max_ps(vi2, vmax);
398 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, _mm_set1_epi32(2)));
399
400 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
401 vmax = _mm_max_ps(vi3, vmax);
402 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, _mm_set1_epi32(3)));
403
404 if (c & 2) {
405 _mm_storel_pi((__m64*) output, vmax);
406 _mm_storel_epi64((__m128i*) index, vidx);
407 vmax = _mm_movehl_ps(vmax, vmax);
408 vidx = _mm_unpackhi_epi64(vidx, vidx);
409 output += 2;
410 index += 2;
411 }
412 if (c & 1) {
413 _mm_store_ss(output, vmax);
414 *index = (uint32_t) _mm_cvtsi128_si32(vidx);
415 output += 1;
416 index += 1;
417 }
418 }
419 input = (const float**) ((uintptr_t) input + input_increment);
420 output = (float*) ((uintptr_t) output + output_increment);
421 } while (--output_pixels != 0);
422 }
423
424 void xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4(
425 size_t output_pixels,
426 size_t pooling_elements,
427 size_t channels,
428 const float** input,
429 size_t input_offset,
430 float* accumulation_buffer,
431 uint32_t* index_buffer,
432 float* output,
433 uint32_t* index,
434 size_t input_increment,
435 size_t output_increment) XNN_OOB_READS
436 {
437 assert(output_pixels != 0);
438 assert(pooling_elements != 0);
439 assert(pooling_elements > 9);
440 assert(channels != 0);
441
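  // Multipass argmax pooling: the first pass reduces pooling elements 0-8 into
  // accumulation_buffer/index_buffer, each middle pass folds in 8 more elements
  // (vidx0 tracks the base index of the current pass), and the final pass of up to
  // 8 elements writes the pooled values and their indices to output/index.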
442 do {
443 {
444 float* ab = accumulation_buffer;
445 uint32_t* ib = index_buffer;
446
447 const float* i0 = *input++;
448 const float* i1 = *input++;
449 const float* i2 = *input++;
450 const float* i3 = *input++;
451 const float* i4 = *input++;
452 const float* i5 = *input++;
453 const float* i6 = *input++;
454 const float* i7 = *input++;
455 const float* i8 = *input++;
456 i0 = (const float*) ((uintptr_t) i0 + input_offset);
457 i1 = (const float*) ((uintptr_t) i1 + input_offset);
458 i2 = (const float*) ((uintptr_t) i2 + input_offset);
459 i3 = (const float*) ((uintptr_t) i3 + input_offset);
460 i4 = (const float*) ((uintptr_t) i4 + input_offset);
461 i5 = (const float*) ((uintptr_t) i5 + input_offset);
462 i6 = (const float*) ((uintptr_t) i6 + input_offset);
463 i7 = (const float*) ((uintptr_t) i7 + input_offset);
464 i8 = (const float*) ((uintptr_t) i8 + input_offset);
465
466 for (size_t c = 0; c < channels; c += 4) {
467 const __m128 vi0 = _mm_loadu_ps(i0);
468 i0 += 4;
469 const __m128 vi1 = _mm_loadu_ps(i1);
470 i1 += 4;
471 const __m128 vi2 = _mm_loadu_ps(i2);
472 i2 += 4;
473 const __m128 vi3 = _mm_loadu_ps(i3);
474 i3 += 4;
475 const __m128 vi4 = _mm_loadu_ps(i4);
476 i4 += 4;
477 const __m128 vi5 = _mm_loadu_ps(i5);
478 i5 += 4;
479 const __m128 vi6 = _mm_loadu_ps(i6);
480 i6 += 4;
481 const __m128 vi7 = _mm_loadu_ps(i7);
482 i7 += 4;
483 const __m128 vi8 = _mm_loadu_ps(i8);
484 i8 += 4;
485
486 __m128 vmax = vi0;
487 __m128i vidx = _mm_setzero_si128();
488
489 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
490 vmax = _mm_max_ps(vi1, vmax);
491 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, _mm_set1_epi32(1)));
492
493 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
494 vmax = _mm_max_ps(vi2, vmax);
495 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, _mm_set1_epi32(2)));
496
497 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
498 vmax = _mm_max_ps(vi3, vmax);
499 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, _mm_set1_epi32(3)));
500
501 const __m128i vm4 = _mm_castps_si128(_mm_cmpgt_ps(vi4, vmax));
502 vmax = _mm_max_ps(vi4, vmax);
503 vidx = _mm_or_si128(_mm_andnot_si128(vm4, vidx), _mm_and_si128(vm4, _mm_set1_epi32(4)));
504
505 const __m128i vm5 = _mm_castps_si128(_mm_cmpgt_ps(vi5, vmax));
506 vmax = _mm_max_ps(vi5, vmax);
507 vidx = _mm_or_si128(_mm_andnot_si128(vm5, vidx), _mm_and_si128(vm5, _mm_set1_epi32(5)));
508
509 const __m128i vm6 = _mm_castps_si128(_mm_cmpgt_ps(vi6, vmax));
510 vmax = _mm_max_ps(vi6, vmax);
511 vidx = _mm_or_si128(_mm_andnot_si128(vm6, vidx), _mm_and_si128(vm6, _mm_set1_epi32(6)));
512
513 const __m128i vm7 = _mm_castps_si128(_mm_cmpgt_ps(vi7, vmax));
514 vmax = _mm_max_ps(vi7, vmax);
515 vidx = _mm_or_si128(_mm_andnot_si128(vm7, vidx), _mm_and_si128(vm7, _mm_set1_epi32(7)));
516
517 const __m128i vm8 = _mm_castps_si128(_mm_cmpgt_ps(vi8, vmax));
518 vmax = _mm_max_ps(vi8, vmax);
519 vidx = _mm_or_si128(_mm_andnot_si128(vm8, vidx), _mm_and_si128(vm8, _mm_set1_epi32(8)));
520
521 _mm_store_ps(ab, vmax);
522 ab += 4;
523 _mm_store_si128((__m128i*) ib, vidx);
524 ib += 4;
525 }
526 }
527 const __m128i v1 = _mm_set1_epi32(1);
528 const __m128i v8 = _mm_set1_epi32(8);
529 __m128i vidx0 = _mm_add_epi32(v1, v8);
530
531 size_t k = pooling_elements;
532 for (k -= 9; k > 8; k -= 8) {
533 const float* i0 = *input++;
534 const float* i1 = *input++;
535 const float* i2 = *input++;
536 const float* i3 = *input++;
537 const float* i4 = *input++;
538 const float* i5 = *input++;
539 const float* i6 = *input++;
540 const float* i7 = *input++;
541 i0 = (const float*) ((uintptr_t) i0 + input_offset);
542 i1 = (const float*) ((uintptr_t) i1 + input_offset);
543 i2 = (const float*) ((uintptr_t) i2 + input_offset);
544 i3 = (const float*) ((uintptr_t) i3 + input_offset);
545 i4 = (const float*) ((uintptr_t) i4 + input_offset);
546 i5 = (const float*) ((uintptr_t) i5 + input_offset);
547 i6 = (const float*) ((uintptr_t) i6 + input_offset);
548 i7 = (const float*) ((uintptr_t) i7 + input_offset);
549
550 float* ab = accumulation_buffer;
551 uint32_t* ib = index_buffer;
552
553 for (size_t c = 0; c < channels; c += 4) {
554 const __m128 vi0 = _mm_loadu_ps(i0);
555 i0 += 4;
556 const __m128 vi1 = _mm_loadu_ps(i1);
557 i1 += 4;
558 const __m128 vi2 = _mm_loadu_ps(i2);
559 i2 += 4;
560 const __m128 vi3 = _mm_loadu_ps(i3);
561 i3 += 4;
562 const __m128 vi4 = _mm_loadu_ps(i4);
563 i4 += 4;
564 const __m128 vi5 = _mm_loadu_ps(i5);
565 i5 += 4;
566 const __m128 vi6 = _mm_loadu_ps(i6);
567 i6 += 4;
568 const __m128 vi7 = _mm_loadu_ps(i7);
569 i7 += 4;
570
571 __m128 vmax = _mm_load_ps(ab);
572 __m128i vidx = _mm_load_si128((const __m128i*) ib);
573
574 const __m128i vm0 = _mm_castps_si128(_mm_cmpgt_ps(vi0, vmax));
575 vmax = _mm_max_ps(vi0, vmax);
576 vidx = _mm_or_si128(_mm_andnot_si128(vm0, vidx), _mm_and_si128(vm0, vidx0));
577
578 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
579 const __m128i vidx1 = _mm_add_epi32(vidx0, v1);
580 vmax = _mm_max_ps(vi1, vmax);
581 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, vidx1));
582
583 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
584 const __m128i vidx2 = _mm_add_epi32(vidx1, v1);
585 vmax = _mm_max_ps(vi2, vmax);
586 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, vidx2));
587
588 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
589 const __m128i vidx3 = _mm_add_epi32(vidx2, v1);
590 vmax = _mm_max_ps(vi3, vmax);
591 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, vidx3));
592
593 const __m128i vm4 = _mm_castps_si128(_mm_cmpgt_ps(vi4, vmax));
594 const __m128i vidx4 = _mm_add_epi32(vidx3, v1);
595 vmax = _mm_max_ps(vi4, vmax);
596 vidx = _mm_or_si128(_mm_andnot_si128(vm4, vidx), _mm_and_si128(vm4, vidx4));
597
598 const __m128i vm5 = _mm_castps_si128(_mm_cmpgt_ps(vi5, vmax));
599 const __m128i vidx5 = _mm_add_epi32(vidx4, v1);
600 vmax = _mm_max_ps(vi5, vmax);
601 vidx = _mm_or_si128(_mm_andnot_si128(vm5, vidx), _mm_and_si128(vm5, vidx5));
602
603 const __m128i vm6 = _mm_castps_si128(_mm_cmpgt_ps(vi6, vmax));
604 const __m128i vidx6 = _mm_add_epi32(vidx5, v1);
605 vmax = _mm_max_ps(vi6, vmax);
606 vidx = _mm_or_si128(_mm_andnot_si128(vm6, vidx), _mm_and_si128(vm6, vidx6));
607
608 const __m128i vm7 = _mm_castps_si128(_mm_cmpgt_ps(vi7, vmax));
609 const __m128i vidx7 = _mm_add_epi32(vidx6, v1);
610 vmax = _mm_max_ps(vi7, vmax);
611 vidx = _mm_or_si128(_mm_andnot_si128(vm7, vidx), _mm_and_si128(vm7, vidx7));
612
613 _mm_store_ps(ab, vmax);
614 ab += 4;
615 _mm_store_si128((__m128i*) ib, vidx);
616 ib += 4;
617 }
618 vidx0 = _mm_add_epi32(vidx0, v8);
619 }
620
621 float* o = output;
622 uint32_t* i = index;
623 {
624 const float* i0 = input[0];
625 const float* i1 = input[1];
626 const float* i2 = input[2];
627 const float* i3 = input[3];
628 const float* i4 = input[4];
629 const float* i5 = input[5];
630 const float* i6 = input[6];
631 const float* i7 = input[7];
632 i0 = (const float*) ((uintptr_t) i0 + input_offset);
633 i1 = (const float*) ((uintptr_t) i1 + input_offset);
634 i2 = (const float*) ((uintptr_t) i2 + input_offset);
635 i3 = (const float*) ((uintptr_t) i3 + input_offset);
636 i4 = (const float*) ((uintptr_t) i4 + input_offset);
637 i5 = (const float*) ((uintptr_t) i5 + input_offset);
638 i6 = (const float*) ((uintptr_t) i6 + input_offset);
639 i7 = (const float*) ((uintptr_t) i7 + input_offset);
640 input = (const float**) ((uintptr_t) input + input_increment);
641 if (k < 2) {
642 i1 = i0;
643 }
644 if (k <= 2) {
645 i2 = i0;
646 }
647 if (k < 4) {
648 i3 = i0;
649 }
650 if (k <= 4) {
651 i4 = i0;
652 }
653 if (k < 6) {
654 i5 = i0;
655 }
656 if (k <= 6) {
657 i6 = i0;
658 }
659 if (k != 8) {
660 i7 = i0;
661 }
662
663 size_t c = channels;
664 float* ab = accumulation_buffer;
665 uint32_t* ib = index_buffer;
666 for (; c >= 4; c -= 4) {
667 const __m128 vi0 = _mm_loadu_ps(i0);
668 i0 += 4;
669 const __m128 vi1 = _mm_loadu_ps(i1);
670 i1 += 4;
671 const __m128 vi2 = _mm_loadu_ps(i2);
672 i2 += 4;
673 const __m128 vi3 = _mm_loadu_ps(i3);
674 i3 += 4;
675 const __m128 vi4 = _mm_loadu_ps(i4);
676 i4 += 4;
677 const __m128 vi5 = _mm_loadu_ps(i5);
678 i5 += 4;
679 const __m128 vi6 = _mm_loadu_ps(i6);
680 i6 += 4;
681 const __m128 vi7 = _mm_loadu_ps(i7);
682 i7 += 4;
683
684 __m128 vmax = _mm_load_ps(ab);
685 ab += 4;
686 __m128i vidx = _mm_load_si128((const __m128i*) ib);
687 ib += 4;
688
689 const __m128i vm0 = _mm_castps_si128(_mm_cmpgt_ps(vi0, vmax));
690 vmax = _mm_max_ps(vi0, vmax);
691 vidx = _mm_or_si128(_mm_andnot_si128(vm0, vidx), _mm_and_si128(vm0, vidx0));
692
693 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
694 const __m128i vidx1 = _mm_add_epi32(vidx0, v1);
695 vmax = _mm_max_ps(vi1, vmax);
696 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, vidx1));
697
698 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
699 const __m128i vidx2 = _mm_add_epi32(vidx1, v1);
700 vmax = _mm_max_ps(vi2, vmax);
701 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, vidx2));
702
703 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
704 const __m128i vidx3 = _mm_add_epi32(vidx2, v1);
705 vmax = _mm_max_ps(vi3, vmax);
706 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, vidx3));
707
708 const __m128i vm4 = _mm_castps_si128(_mm_cmpgt_ps(vi4, vmax));
709 const __m128i vidx4 = _mm_add_epi32(vidx3, v1);
710 vmax = _mm_max_ps(vi4, vmax);
711 vidx = _mm_or_si128(_mm_andnot_si128(vm4, vidx), _mm_and_si128(vm4, vidx4));
712
713 const __m128i vm5 = _mm_castps_si128(_mm_cmpgt_ps(vi5, vmax));
714 const __m128i vidx5 = _mm_add_epi32(vidx4, v1);
715 vmax = _mm_max_ps(vi5, vmax);
716 vidx = _mm_or_si128(_mm_andnot_si128(vm5, vidx), _mm_and_si128(vm5, vidx5));
717
718 const __m128i vm6 = _mm_castps_si128(_mm_cmpgt_ps(vi6, vmax));
719 const __m128i vidx6 = _mm_add_epi32(vidx5, v1);
720 vmax = _mm_max_ps(vi6, vmax);
721 vidx = _mm_or_si128(_mm_andnot_si128(vm6, vidx), _mm_and_si128(vm6, vidx6));
722
723 const __m128i vm7 = _mm_castps_si128(_mm_cmpgt_ps(vi7, vmax));
724 const __m128i vidx7 = _mm_add_epi32(vidx6, v1);
725 vmax = _mm_max_ps(vi7, vmax);
726 vidx = _mm_or_si128(_mm_andnot_si128(vm7, vidx), _mm_and_si128(vm7, vidx7));
727
728 _mm_storeu_ps(o, vmax);
729 o += 4;
730 _mm_storeu_si128((__m128i*) i, vidx);
731 i += 4;
732 }
733 if (c != 0) {
734 const __m128 vi0 = _mm_loadu_ps(i0);
735 const __m128 vi1 = _mm_loadu_ps(i1);
736 const __m128 vi2 = _mm_loadu_ps(i2);
737 const __m128 vi3 = _mm_loadu_ps(i3);
738 const __m128 vi4 = _mm_loadu_ps(i4);
739 const __m128 vi5 = _mm_loadu_ps(i5);
740 const __m128 vi6 = _mm_loadu_ps(i6);
741 const __m128 vi7 = _mm_loadu_ps(i7);
742
743 __m128 vmax = _mm_load_ps(ab);
744 __m128i vidx = _mm_load_si128((const __m128i*) ib);
745
746 const __m128i vm0 = _mm_castps_si128(_mm_cmpgt_ps(vi0, vmax));
747 vmax = _mm_max_ps(vi0, vmax);
748 vidx = _mm_or_si128(_mm_andnot_si128(vm0, vidx), _mm_and_si128(vm0, vidx0));
749
750 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
751 const __m128i vidx1 = _mm_add_epi32(vidx0, v1);
752 vmax = _mm_max_ps(vi1, vmax);
753 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, vidx1));
754
755 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
756 const __m128i vidx2 = _mm_add_epi32(vidx1, v1);
757 vmax = _mm_max_ps(vi2, vmax);
758 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, vidx2));
759
760 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
761 const __m128i vidx3 = _mm_add_epi32(vidx2, v1);
762 vmax = _mm_max_ps(vi3, vmax);
763 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, vidx3));
764
765 const __m128i vm4 = _mm_castps_si128(_mm_cmpgt_ps(vi4, vmax));
766 const __m128i vidx4 = _mm_add_epi32(vidx3, v1);
767 vmax = _mm_max_ps(vi4, vmax);
768 vidx = _mm_or_si128(_mm_andnot_si128(vm4, vidx), _mm_and_si128(vm4, vidx4));
769
770 const __m128i vm5 = _mm_castps_si128(_mm_cmpgt_ps(vi5, vmax));
771 const __m128i vidx5 = _mm_add_epi32(vidx4, v1);
772 vmax = _mm_max_ps(vi5, vmax);
773 vidx = _mm_or_si128(_mm_andnot_si128(vm5, vidx), _mm_and_si128(vm5, vidx5));
774
775 const __m128i vm6 = _mm_castps_si128(_mm_cmpgt_ps(vi6, vmax));
776 const __m128i vidx6 = _mm_add_epi32(vidx5, v1);
777 vmax = _mm_max_ps(vi6, vmax);
778 vidx = _mm_or_si128(_mm_andnot_si128(vm6, vidx), _mm_and_si128(vm6, vidx6));
779
780 const __m128i vm7 = _mm_castps_si128(_mm_cmpgt_ps(vi7, vmax));
781 const __m128i vidx7 = _mm_add_epi32(vidx6, v1);
782 vmax = _mm_max_ps(vi7, vmax);
783 vidx = _mm_or_si128(_mm_andnot_si128(vm7, vidx), _mm_and_si128(vm7, vidx7));
784
785 if (c & 2) {
786 _mm_storel_pi((__m64*) o, vmax);
787 _mm_storel_epi64((__m128i*) i, vidx);
788 vmax = _mm_movehl_ps(vmax, vmax);
789 vidx = _mm_unpackhi_epi64(vidx, vidx);
790 o += 2;
791 i += 2;
792 }
793 if (c & 1) {
794 _mm_store_ss(o, vmax);
795 *i = (uint32_t) _mm_cvtsi128_si32(vidx);
796 o += 1;
797 i += 1;
798 }
799 }
800 }
801
802 output = (float*) ((uintptr_t) o + output_increment);
803 index = (uint32_t*) i;
804 } while (--output_pixels != 0);
805 }
806
807 void xnn_f32_argmaxpool_ukernel_9x__sse2_c4(
808 size_t output_pixels,
809 size_t pooling_elements,
810 size_t channels,
811 const float** input,
812 size_t input_offset,
813 float* output,
814 uint32_t* index,
815 size_t input_increment,
816 size_t output_increment) XNN_OOB_READS
817 {
818 assert(output_pixels != 0);
819 assert(pooling_elements != 0);
820 assert(pooling_elements <= 9);
821 assert(channels != 0);
822
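  // Single-pass variant for up to 9 pooling elements: unused element pointers alias i0,
  // so their (equal) values never replace the running maximum or its index.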
823 do {
824 const float* i0 = input[0];
825 const float* i1 = input[1];
826 const float* i2 = input[2];
827 const float* i3 = input[3];
828 const float* i4 = input[4];
829 const float* i5 = input[5];
830 const float* i6 = input[6];
831 const float* i7 = input[7];
832 const float* i8 = input[8];
833 i0 = (const float*) ((uintptr_t) i0 + input_offset);
834 i1 = (const float*) ((uintptr_t) i1 + input_offset);
835 i2 = (const float*) ((uintptr_t) i2 + input_offset);
836 i3 = (const float*) ((uintptr_t) i3 + input_offset);
837 i4 = (const float*) ((uintptr_t) i4 + input_offset);
838 i5 = (const float*) ((uintptr_t) i5 + input_offset);
839 i6 = (const float*) ((uintptr_t) i6 + input_offset);
840 i7 = (const float*) ((uintptr_t) i7 + input_offset);
841 i8 = (const float*) ((uintptr_t) i8 + input_offset);
842 if (pooling_elements < 2) {
843 i1 = i0;
844 }
845 if (pooling_elements <= 2) {
846 i2 = i0;
847 }
848 if (pooling_elements < 4) {
849 i3 = i0;
850 }
851 if (pooling_elements <= 4) {
852 i4 = i0;
853 }
854 if (pooling_elements < 6) {
855 i5 = i0;
856 }
857 if (pooling_elements <= 6) {
858 i6 = i0;
859 }
860 if (pooling_elements < 8) {
861 i7 = i0;
862 }
863 if (pooling_elements <= 8) {
864 i8 = i0;
865 }
866
867 size_t c = channels;
868 for (; c >= 4; c -= 4) {
869 const __m128 vi0 = _mm_loadu_ps(i0);
870 i0 += 4;
871 const __m128 vi1 = _mm_loadu_ps(i1);
872 i1 += 4;
873 const __m128 vi2 = _mm_loadu_ps(i2);
874 i2 += 4;
875 const __m128 vi3 = _mm_loadu_ps(i3);
876 i3 += 4;
877 const __m128 vi4 = _mm_loadu_ps(i4);
878 i4 += 4;
879 const __m128 vi5 = _mm_loadu_ps(i5);
880 i5 += 4;
881 const __m128 vi6 = _mm_loadu_ps(i6);
882 i6 += 4;
883 const __m128 vi7 = _mm_loadu_ps(i7);
884 i7 += 4;
885 const __m128 vi8 = _mm_loadu_ps(i8);
886 i8 += 4;
887
888 __m128 vmax = vi0;
889 __m128i vidx = _mm_setzero_si128();
890
891 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
892 vmax = _mm_max_ps(vi1, vmax);
893 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, _mm_set1_epi32(1)));
894
895 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
896 vmax = _mm_max_ps(vi2, vmax);
897 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, _mm_set1_epi32(2)));
898
899 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
900 vmax = _mm_max_ps(vi3, vmax);
901 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, _mm_set1_epi32(3)));
902
903 const __m128i vm4 = _mm_castps_si128(_mm_cmpgt_ps(vi4, vmax));
904 vmax = _mm_max_ps(vi4, vmax);
905 vidx = _mm_or_si128(_mm_andnot_si128(vm4, vidx), _mm_and_si128(vm4, _mm_set1_epi32(4)));
906
907 const __m128i vm5 = _mm_castps_si128(_mm_cmpgt_ps(vi5, vmax));
908 vmax = _mm_max_ps(vi5, vmax);
909 vidx = _mm_or_si128(_mm_andnot_si128(vm5, vidx), _mm_and_si128(vm5, _mm_set1_epi32(5)));
910
911 const __m128i vm6 = _mm_castps_si128(_mm_cmpgt_ps(vi6, vmax));
912 vmax = _mm_max_ps(vi6, vmax);
913 vidx = _mm_or_si128(_mm_andnot_si128(vm6, vidx), _mm_and_si128(vm6, _mm_set1_epi32(6)));
914
915 const __m128i vm7 = _mm_castps_si128(_mm_cmpgt_ps(vi7, vmax));
916 vmax = _mm_max_ps(vi7, vmax);
917 vidx = _mm_or_si128(_mm_andnot_si128(vm7, vidx), _mm_and_si128(vm7, _mm_set1_epi32(7)));
918
919 const __m128i vm8 = _mm_castps_si128(_mm_cmpgt_ps(vi8, vmax));
920 vmax = _mm_max_ps(vi8, vmax);
921 vidx = _mm_or_si128(_mm_andnot_si128(vm8, vidx), _mm_and_si128(vm8, _mm_set1_epi32(8)));
922
923 _mm_storeu_ps(output, vmax);
924 output += 4;
925 _mm_storeu_si128((__m128i*) index, vidx);
926 index += 4;
927 }
928 if (c != 0) {
929 const __m128 vi0 = _mm_loadu_ps(i0);
930 const __m128 vi1 = _mm_loadu_ps(i1);
931 const __m128 vi2 = _mm_loadu_ps(i2);
932 const __m128 vi3 = _mm_loadu_ps(i3);
933 const __m128 vi4 = _mm_loadu_ps(i4);
934 const __m128 vi5 = _mm_loadu_ps(i5);
935 const __m128 vi6 = _mm_loadu_ps(i6);
936 const __m128 vi7 = _mm_loadu_ps(i7);
937 const __m128 vi8 = _mm_loadu_ps(i8);
938
939 __m128 vmax = vi0;
940 __m128i vidx = _mm_setzero_si128();
941
942 const __m128i vm1 = _mm_castps_si128(_mm_cmpgt_ps(vi1, vmax));
943 vmax = _mm_max_ps(vi1, vmax);
944 vidx = _mm_or_si128(_mm_andnot_si128(vm1, vidx), _mm_and_si128(vm1, _mm_set1_epi32(1)));
945
946 const __m128i vm2 = _mm_castps_si128(_mm_cmpgt_ps(vi2, vmax));
947 vmax = _mm_max_ps(vi2, vmax);
948 vidx = _mm_or_si128(_mm_andnot_si128(vm2, vidx), _mm_and_si128(vm2, _mm_set1_epi32(2)));
949
950 const __m128i vm3 = _mm_castps_si128(_mm_cmpgt_ps(vi3, vmax));
951 vmax = _mm_max_ps(vi3, vmax);
952 vidx = _mm_or_si128(_mm_andnot_si128(vm3, vidx), _mm_and_si128(vm3, _mm_set1_epi32(3)));
953
954 const __m128i vm4 = _mm_castps_si128(_mm_cmpgt_ps(vi4, vmax));
955 vmax = _mm_max_ps(vi4, vmax);
956 vidx = _mm_or_si128(_mm_andnot_si128(vm4, vidx), _mm_and_si128(vm4, _mm_set1_epi32(4)));
957
958 const __m128i vm5 = _mm_castps_si128(_mm_cmpgt_ps(vi5, vmax));
959 vmax = _mm_max_ps(vi5, vmax);
960 vidx = _mm_or_si128(_mm_andnot_si128(vm5, vidx), _mm_and_si128(vm5, _mm_set1_epi32(5)));
961
962 const __m128i vm6 = _mm_castps_si128(_mm_cmpgt_ps(vi6, vmax));
963 vmax = _mm_max_ps(vi6, vmax);
964 vidx = _mm_or_si128(_mm_andnot_si128(vm6, vidx), _mm_and_si128(vm6, _mm_set1_epi32(6)));
965
966 const __m128i vm7 = _mm_castps_si128(_mm_cmpgt_ps(vi7, vmax));
967 vmax = _mm_max_ps(vi7, vmax);
968 vidx = _mm_or_si128(_mm_andnot_si128(vm7, vidx), _mm_and_si128(vm7, _mm_set1_epi32(7)));
969
970 const __m128i vm8 = _mm_castps_si128(_mm_cmpgt_ps(vi8, vmax));
971 vmax = _mm_max_ps(vi8, vmax);
972 vidx = _mm_or_si128(_mm_andnot_si128(vm8, vidx), _mm_and_si128(vm8, _mm_set1_epi32(8)));
973
974 if (c & 2) {
975 _mm_storel_pi((__m64*) output, vmax);
976 _mm_storel_epi64((__m128i*) index, vidx);
977 vmax = _mm_movehl_ps(vmax, vmax);
978 vidx = _mm_unpackhi_epi64(vidx, vidx);
979 output += 2;
980 index += 2;
981 }
982 if (c & 1) {
983 _mm_store_ss(output, vmax);
984 *index = (uint32_t) _mm_cvtsi128_si32(vidx);
985 output += 1;
986 index += 1;
987 }
988 }
989 input = (const float**) ((uintptr_t) input + input_increment);
990 output = (float*) ((uintptr_t) output + output_increment);
991 index = (uint32_t*) index;
992 } while (--output_pixels != 0);
993 }
994
995 void xnn_f32_f16_vcvt_ukernel__sse2_x16(
996 size_t n,
997 const float* input,
998 void* output,
999 const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1000 {
1001 assert(n != 0);
1002 assert(n % sizeof(float) == 0);
1003 assert(input != NULL);
1004 assert(output != NULL);
1005
1006 const __m128 vnonsign_mask = _mm_load_ps((const float*) params->sse2.nonsign_mask);
1007 const __m128i vexp_bias = _mm_load_si128((const __m128i*) params->sse2.exp_bias);
1008 const __m128 vscale_to_inf = _mm_load_ps(params->sse2.scale_to_inf);
1009 const __m128i vexpw_max = _mm_load_si128((const __m128i*) params->sse2.expw_max);
1010 const __m128 vscale_to_zero = _mm_load_ps(params->sse2.scale_to_zero);
1011 const __m128i vbias_min = _mm_load_si128((const __m128i*) params->sse2.bias_min);
1012 const __m128i vmanth_mask = _mm_load_si128((const __m128i*) params->sse2.manth_mask);
1013 const __m128i vexph_mask = _mm_load_si128((const __m128i*) params->sse2.exph_mask);
1014 const __m128i vnanh = _mm_load_si128((const __m128i*) params->sse2.nanh);
1015
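  // f32 -> f16 conversion without native instructions: the input is split into sign and
  // magnitude; the magnitude is multiplied by scale_to_inf then scale_to_zero, and a bias
  // derived from the input's exponent (masked to expw_max, clamped to bias_min) is added
  // in float arithmetic so the mantissa rounds to half precision. The exponent and mantissa
  // halfwords are then reassembled with integer ops, NaN lanes (exponent bits above
  // expw_max) are replaced with nanh, and the sign halfword is ORed back in.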
1016 uint16_t* o = (uint16_t*) output;
1017 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
1018 const __m128 vx0 = _mm_loadu_ps(input);
1019 const __m128 vx1 = _mm_loadu_ps(input + 4);
1020 const __m128 vx2 = _mm_loadu_ps(input + 8);
1021 const __m128 vx3 = _mm_loadu_ps(input + 12);
1022 input += 16;
1023
1024 const __m128 vabsx0 = _mm_and_ps(vx0, vnonsign_mask);
1025 const __m128 vabsx1 = _mm_and_ps(vx1, vnonsign_mask);
1026 const __m128 vabsx2 = _mm_and_ps(vx2, vnonsign_mask);
1027 const __m128 vabsx3 = _mm_and_ps(vx3, vnonsign_mask);
1028
1029 const __m128 vsignx0 = _mm_xor_ps(vx0, vabsx0);
1030 const __m128 vsignx1 = _mm_xor_ps(vx1, vabsx1);
1031 const __m128 vsignx2 = _mm_xor_ps(vx2, vabsx2);
1032 const __m128 vsignx3 = _mm_xor_ps(vx3, vabsx3);
1033
1034 __m128i vbias0 = _mm_add_epi32(_mm_castps_si128(vabsx0), vexp_bias);
1035 __m128i vbias1 = _mm_add_epi32(_mm_castps_si128(vabsx1), vexp_bias);
1036 __m128i vbias2 = _mm_add_epi32(_mm_castps_si128(vabsx2), vexp_bias);
1037 __m128i vbias3 = _mm_add_epi32(_mm_castps_si128(vabsx3), vexp_bias);
1038
1039 __m128 vf0 = _mm_mul_ps(vabsx0, vscale_to_inf);
1040 __m128 vf1 = _mm_mul_ps(vabsx1, vscale_to_inf);
1041 __m128 vf2 = _mm_mul_ps(vabsx2, vscale_to_inf);
1042 __m128 vf3 = _mm_mul_ps(vabsx3, vscale_to_inf);
1043
1044 const __m128i vnanmaskw0 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx0), vexpw_max);
1045 const __m128i vnanmaskw1 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx1), vexpw_max);
1046 const __m128i vnanmaskw2 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx2), vexpw_max);
1047 const __m128i vnanmaskw3 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx3), vexpw_max);
1048
1049 vbias0 = _mm_and_si128(vbias0, vexpw_max);
1050 vbias1 = _mm_and_si128(vbias1, vexpw_max);
1051 vbias2 = _mm_and_si128(vbias2, vexpw_max);
1052 vbias3 = _mm_and_si128(vbias3, vexpw_max);
1053
1054 vf0 = _mm_mul_ps(vf0, vscale_to_zero);
1055 vf1 = _mm_mul_ps(vf1, vscale_to_zero);
1056 vf2 = _mm_mul_ps(vf2, vscale_to_zero);
1057 vf3 = _mm_mul_ps(vf3, vscale_to_zero);
1058
1059 const __m128i vnanmaskh0 = _mm_packs_epi32(vnanmaskw0, vnanmaskw1);
1060 const __m128i vnanmaskh1 = _mm_packs_epi32(vnanmaskw2, vnanmaskw3);
1061
1062 const __m128i vsignh0 = _mm_packs_epi32(_mm_castps_si128(vsignx0), _mm_castps_si128(vsignx1));
1063 const __m128i vsignh1 = _mm_packs_epi32(_mm_castps_si128(vsignx2), _mm_castps_si128(vsignx3));
1064
1065 vbias0 = _mm_max_epi16(vbias0, vbias_min);
1066 vbias1 = _mm_max_epi16(vbias1, vbias_min);
1067 vbias2 = _mm_max_epi16(vbias2, vbias_min);
1068 vbias3 = _mm_max_epi16(vbias3, vbias_min);
1069
1070 __m128i vh0 = _mm_and_si128(vnanh, vnanmaskh0);
1071 __m128i vh1 = _mm_and_si128(vnanh, vnanmaskh1);
1072
1073 vf0 = _mm_add_ps(vf0, _mm_castsi128_ps(vbias0));
1074 vf1 = _mm_add_ps(vf1, _mm_castsi128_ps(vbias1));
1075 vf2 = _mm_add_ps(vf2, _mm_castsi128_ps(vbias2));
1076 vf3 = _mm_add_ps(vf3, _mm_castsi128_ps(vbias3));
1077
1078 vh0 = _mm_or_si128(vh0, vsignh0);
1079 vh1 = _mm_or_si128(vh1, vsignh1);
1080
1081 __m128i vexpw0 = _mm_srli_epi32(_mm_castps_si128(vf0), 13);
1082 __m128i vexpw1 = _mm_srli_epi32(_mm_castps_si128(vf1), 13);
1083 __m128i vexpw2 = _mm_srli_epi32(_mm_castps_si128(vf2), 13);
1084 __m128i vexpw3 = _mm_srli_epi32(_mm_castps_si128(vf3), 13);
1085
1086 const __m128i vmantw0 = _mm_and_si128(_mm_castps_si128(vf0), vmanth_mask);
1087 const __m128i vmantw1 = _mm_and_si128(_mm_castps_si128(vf1), vmanth_mask);
1088 const __m128i vmantw2 = _mm_and_si128(_mm_castps_si128(vf2), vmanth_mask);
1089 const __m128i vmantw3 = _mm_and_si128(_mm_castps_si128(vf3), vmanth_mask);
1090
1091 vexpw0 = _mm_and_si128(vexpw0, vexph_mask);
1092 vexpw1 = _mm_and_si128(vexpw1, vexph_mask);
1093 vexpw2 = _mm_and_si128(vexpw2, vexph_mask);
1094 vexpw3 = _mm_and_si128(vexpw3, vexph_mask);
1095
1096 const __m128i vnonsignw0 = _mm_add_epi32(vmantw0, vexpw0);
1097 const __m128i vnonsignw1 = _mm_add_epi32(vmantw1, vexpw1);
1098 const __m128i vnonsignw2 = _mm_add_epi32(vmantw2, vexpw2);
1099 const __m128i vnonsignw3 = _mm_add_epi32(vmantw3, vexpw3);
1100
1101 const __m128i vnonsignh0 = _mm_packs_epi32(vnonsignw0, vnonsignw1);
1102 const __m128i vnonsignh1 = _mm_packs_epi32(vnonsignw2, vnonsignw3);
1103
1104 vh0 = _mm_or_si128(vh0, _mm_andnot_si128(vnanmaskh0, vnonsignh0));
1105 vh1 = _mm_or_si128(vh1, _mm_andnot_si128(vnanmaskh1, vnonsignh1));
1106
1107 _mm_storeu_si128((__m128i*) o, vh0);
1108 _mm_storeu_si128((__m128i*) (o + 8), vh1);
1109 o += 16;
1110 }
1111 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1112 const __m128 vx_lo = _mm_loadu_ps(input);
1113 const __m128 vx_hi = _mm_loadu_ps(input + 4);
1114 input += 8;
1115
1116 const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
1117 const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);
1118
1119 const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
1120 const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
1121 __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
1122 __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
1123 __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
1124 __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
1125 const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
1126 const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);
1127
1128 vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
1129 vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
1130 vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
1131 vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
1132 const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
1133 const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));
1134
1135 vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
1136 vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);
1137 __m128i vh = _mm_and_si128(vnanh, vnanmaskh);
1138
1139 vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
1140 vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));
1141 vh = _mm_or_si128(vh, vsignh);
1142
1143 __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
1144 __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
1145 const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
1146 const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);
1147
1148 vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
1149 vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);
1150
1151 const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
1152 const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);
1153
1154 const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);
1155
1156 vh = _mm_or_si128(vh, _mm_andnot_si128(vnanmaskh, vnonsignh));
1157
1158 _mm_storeu_si128((__m128i*) o, vh);
1159 o += 8;
1160 }
1161 if XNN_UNPREDICTABLE(n != 0) {
1162 const __m128 vx_lo = _mm_loadu_ps(input);
1163 const float* input_hi = (const float*) ((uintptr_t) input + (n & (4 * sizeof(float))));
1164 const __m128 vx_hi = _mm_loadu_ps(input_hi);
1165
1166 const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
1167 const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);
1168
1169 const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
1170 const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
1171 __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
1172 __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
1173 __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
1174 __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
1175 const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
1176 const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);
1177
1178 vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
1179 vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
1180 vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
1181 vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
1182 const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
1183 const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));
1184
1185 vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
1186 vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);
1187 __m128i vh = _mm_and_si128(vnanh, vnanmaskh);
1188
1189 vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
1190 vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));
1191 vh = _mm_or_si128(vh, vsignh);
1192
1193 __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
1194 __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
1195 const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
1196 const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);
1197
1198 vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
1199 vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);
1200
1201 const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
1202 const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);
1203
1204 const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);
1205
1206 vh = _mm_or_si128(vh, _mm_andnot_si128(vnanmaskh, vnonsignh));
1207
1208 if (n & (4 * sizeof(float))) {
1209 _mm_storel_epi64((__m128i*) o, vh);
1210 vh = _mm_unpackhi_epi64(vh, vh);
1211 o += 4;
1212 }
1213 if (n & (2 * sizeof(float))) {
1214 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vh));
1215 vh = _mm_srli_epi64(vh, 32);
1216 o += 2;
1217 }
1218 if (n & (1 * sizeof(float))) {
1219 *o = (uint16_t) _mm_cvtsi128_si32(vh);
1220 }
1221 }
1222 }
1223
1224 void xnn_f32_prelu_ukernel__sse2_2x8(
1225 size_t rows,
1226 size_t channels,
1227 const float*restrict input,
1228 size_t input_stride,
1229 const float*restrict weights,
1230 float*restrict output,
1231 size_t output_stride) XNN_OOB_READS
1232 {
1233 assert(rows != 0);
1234 assert(channels != 0);
1235 assert(channels % sizeof(float) == 0);
1236
1237 const float* i0 = input;
1238 float* o0 = output;
1239 const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
1240 float* o1 = (float*) ((uintptr_t) o0 + output_stride);
1241
1242 const size_t input_increment = input_stride * 2 - channels;
1243 const size_t output_increment = output_stride * 2 - channels;
1244
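  // Two rows are processed per iteration (the second row aliases the first when rows < 2).
  // PReLU is applied with a bitwise select: lanes whose sign bit is set (x < 0) take x * w,
  // all other lanes pass x through unchanged.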
1245 do {
1246 if XNN_UNPREDICTABLE(rows < 2) {
1247 i1 = i0;
1248 o1 = o0;
1249 }
1250
1251 const float* w = weights;
1252 size_t c = channels;
1253 for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
1254 const __m128 vw0123 = _mm_load_ps(w);
1255 const __m128 vw4567 = _mm_load_ps(w + 4);
1256 w += 8;
1257
1258 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1259 const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
1260 i0 += 8;
1261 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1262 const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
1263 i1 += 8;
1264
1265 const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
1266 const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123)));
1267 const __m128 vprod0x4567 = _mm_mul_ps(vi0x4567, vw4567);
1268 const __m128 vmask0x4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x4567)));
1269 const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);
1270 const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123)));
1271 const __m128 vprod1x4567 = _mm_mul_ps(vi1x4567, vw4567);
1272 const __m128 vmask1x4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x4567)));
1273
1274 const __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123));
1275 const __m128 vacc0x4567 = _mm_or_ps(_mm_and_ps(vprod0x4567, vmask0x4567), _mm_andnot_ps(vmask0x4567, vi0x4567));
1276 const __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123));
1277 const __m128 vacc1x4567 = _mm_or_ps(_mm_and_ps(vprod1x4567, vmask1x4567), _mm_andnot_ps(vmask1x4567, vi1x4567));
1278
1279 _mm_storeu_ps(o0, vacc0x0123);
1280 _mm_storeu_ps(o0 + 4, vacc0x4567);
1281 o0 += 8;
1282 _mm_storeu_ps(o1, vacc1x0123);
1283 _mm_storeu_ps(o1 + 4, vacc1x4567);
1284 o1 += 8;
1285 }
1286 for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {
1287 const __m128 vw0123 = _mm_load_ps(w);
1288 w += 4;
1289
1290 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1291 i0 += 4;
1292 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1293 i1 += 4;
1294
1295 const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
1296 const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123)));
1297 const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);
1298 const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123)));
1299
1300 __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123));
1301 __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123));
1302
1303 _mm_storeu_ps(o0, vacc0x0123);
1304 o0 += 4;
1305 _mm_storeu_ps(o1, vacc1x0123);
1306 o1 += 4;
1307 }
1308 if XNN_UNLIKELY(c != 0) {
1309 const __m128 vw0123 = _mm_load_ps(w);
1310 w = (const float*) ((uintptr_t) w + c);
1311
1312 const __m128 vi0x0123 = _mm_loadu_ps(i0);
1313 i0 = (const float*) ((uintptr_t) i0 + c);
1314 const __m128 vi1x0123 = _mm_loadu_ps(i1);
1315 i1 = (const float*) ((uintptr_t) i1 + c);
1316
1317 const __m128 vprod0x0123 = _mm_mul_ps(vi0x0123, vw0123);
1318 const __m128 vmask0x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi0x0123)));
1319 const __m128 vprod1x0123 = _mm_mul_ps(vi1x0123, vw0123);
1320 const __m128 vmask1x0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vi1x0123)));
1321
1322 __m128 vacc0x0123 = _mm_or_ps(_mm_and_ps(vprod0x0123, vmask0x0123), _mm_andnot_ps(vmask0x0123, vi0x0123));
1323 __m128 vacc1x0123 = _mm_or_ps(_mm_and_ps(vprod1x0123, vmask1x0123), _mm_andnot_ps(vmask1x0123, vi1x0123));
1324
1325 if (c & (2 * sizeof(float))) {
1326 _mm_storel_pi((__m64*) o0, vacc0x0123);
1327 _mm_storel_pi((__m64*) o1, vacc1x0123);
1328
1329 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
1330 vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
1331
1332 o0 += 2;
1333 o1 += 2;
1334 }
1335 if (c & (1 * sizeof(float))) {
1336 _mm_store_ss(o0, vacc0x0123);
1337 _mm_store_ss(o1, vacc1x0123);
1338
1339 o0 += 1;
1340 o1 += 1;
1341 }
1342 }
1343 i0 = (const float*) ((uintptr_t) i0 + input_increment);
1344 o0 = (float*) ((uintptr_t) o0 + output_increment);
1345 i1 = (const float*) ((uintptr_t) i1 + input_increment);
1346 o1 = (float*) ((uintptr_t) o1 + output_increment);
1347 rows = doz(rows, 2);
1348 } while (rows != 0);
1349 }
1350
1351 void xnn_f32_qs8_vcvt_ukernel__sse2_x32(
1352 size_t n,
1353 const float* x,
1354 int8_t* y,
1355 const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1356 {
1357 assert(n != 0);
1358 assert(n % sizeof(float) == 0);
1359 assert(x != NULL);
1360 assert(y != NULL);
1361
1362 const __m128 vscale = _mm_load_ps(params->sse2.scale);
1363 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->sse2.output_max_less_zero_point);
1364 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
1365 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
1366
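  // Quantization pipeline: scale the input, clamp against output_max (expressed relative to
  // the zero point), convert to int32 with rounding, pack to int16 with saturation, add the
  // zero point, clamp against output_min, and finally pack to int8.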
1367 for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
1368 __m128 vx0123 = _mm_loadu_ps(x);
1369 __m128 vx4567 = _mm_loadu_ps(x + 4);
1370 __m128 vx89AB = _mm_loadu_ps(x + 8);
1371 __m128 vxCDEF = _mm_loadu_ps(x + 12);
1372 __m128 vxGHIJ = _mm_loadu_ps(x + 16);
1373 __m128 vxKLMN = _mm_loadu_ps(x + 20);
1374 __m128 vxOPQR = _mm_loadu_ps(x + 24);
1375 __m128 vxSTUV = _mm_loadu_ps(x + 28);
1376 x += 32;
1377
1378 vx0123 = _mm_mul_ps(vx0123, vscale);
1379 vx4567 = _mm_mul_ps(vx4567, vscale);
1380 vx89AB = _mm_mul_ps(vx89AB, vscale);
1381 vxCDEF = _mm_mul_ps(vxCDEF, vscale);
1382 vxGHIJ = _mm_mul_ps(vxGHIJ, vscale);
1383 vxKLMN = _mm_mul_ps(vxKLMN, vscale);
1384 vxOPQR = _mm_mul_ps(vxOPQR, vscale);
1385 vxSTUV = _mm_mul_ps(vxSTUV, vscale);
1386
1387 vx0123 = _mm_min_ps(vx0123, voutput_max_less_zero_point);
1388 vx4567 = _mm_min_ps(vx4567, voutput_max_less_zero_point);
1389 vx89AB = _mm_min_ps(vx89AB, voutput_max_less_zero_point);
1390 vxCDEF = _mm_min_ps(vxCDEF, voutput_max_less_zero_point);
1391 vxGHIJ = _mm_min_ps(vxGHIJ, voutput_max_less_zero_point);
1392 vxKLMN = _mm_min_ps(vxKLMN, voutput_max_less_zero_point);
1393 vxOPQR = _mm_min_ps(vxOPQR, voutput_max_less_zero_point);
1394 vxSTUV = _mm_min_ps(vxSTUV, voutput_max_less_zero_point);
1395
1396 const __m128i vy0123 = _mm_cvtps_epi32(vx0123);
1397 const __m128i vy4567 = _mm_cvtps_epi32(vx4567);
1398 const __m128i vy89AB = _mm_cvtps_epi32(vx89AB);
1399 const __m128i vyCDEF = _mm_cvtps_epi32(vxCDEF);
1400 const __m128i vyGHIJ = _mm_cvtps_epi32(vxGHIJ);
1401 const __m128i vyKLMN = _mm_cvtps_epi32(vxKLMN);
1402 const __m128i vyOPQR = _mm_cvtps_epi32(vxOPQR);
1403 const __m128i vySTUV = _mm_cvtps_epi32(vxSTUV);
1404
1405 __m128i vy01234567 = _mm_packs_epi32(vy0123, vy4567);
1406 __m128i vy89ABCDEF = _mm_packs_epi32(vy89AB, vyCDEF);
1407 __m128i vyGHIJKLMN = _mm_packs_epi32(vyGHIJ, vyKLMN);
1408 __m128i vyOPQRSTUV = _mm_packs_epi32(vyOPQR, vySTUV);
1409
1410 vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
1411 vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
1412 vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
1413 vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
1414
1415 vy01234567 = _mm_max_epi16(vy01234567, voutput_min);
1416 vy89ABCDEF = _mm_max_epi16(vy89ABCDEF, voutput_min);
1417 vyGHIJKLMN = _mm_max_epi16(vyGHIJKLMN, voutput_min);
1418 vyOPQRSTUV = _mm_max_epi16(vyOPQRSTUV, voutput_min);
1419
1420 __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
1421 __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV);
1422
1423
1424 _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
1425 _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
1426 y += 32;
1427 }
1428 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1429 __m128 vx_lo = _mm_loadu_ps(x);
1430 __m128 vx_hi = _mm_loadu_ps(x + 4);
1431 x += 8;
1432
1433 vx_lo = _mm_mul_ps(vx_lo, vscale);
1434 vx_hi = _mm_mul_ps(vx_hi, vscale);
1435
1436 vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
1437 vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);
1438
1439 const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
1440 const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);
1441
1442 __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
1443 vy = _mm_adds_epi16(vy, voutput_zero_point);
1444 vy = _mm_max_epi16(vy, voutput_min);
1445 vy = _mm_packs_epi16(vy, vy);
1446
1447 _mm_storel_epi64((__m128i*) y, vy);
1448 y += 8;
1449 }
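  // Remainder of 1-7 elements: the second 4-element load starts at x + (n & 4 floats), so for
  // fewer than 4 leftover elements it simply re-reads the first block; reading a few floats past
  // the end is permitted because the kernel is declared XNN_OOB_READS. The converted bytes are
  // then stored 4, 2 and 1 at a time below.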
1450 if XNN_UNLIKELY(n != 0) {
1451 __m128 vx_lo = _mm_loadu_ps(x);
1452 const float* x_hi = (const float*) ((uintptr_t) x + (n & (4 * sizeof(float))));
1453 __m128 vx_hi = _mm_loadu_ps(x_hi);
1454
1455 vx_lo = _mm_mul_ps(vx_lo, vscale);
1456 vx_hi = _mm_mul_ps(vx_hi, vscale);
1457
1458 vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
1459 vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);
1460
1461 const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
1462 const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);
1463
1464 __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
1465 vy = _mm_adds_epi16(vy, voutput_zero_point);
1466 vy = _mm_max_epi16(vy, voutput_min);
1467 vy = _mm_packs_epi16(vy, vy);
1468
1469 if (n & (4 * sizeof(float))) {
1470 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
1471 y += 4;
1472 vy = _mm_srli_epi64(vy, 32);
1473 }
1474 {
1475 uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
1476 if (n & (2 * sizeof(float))) {
1477 unaligned_store_u16(y, (uint16_t) vy_lo);
1478 y += 2;
1479 vy_lo >>= 16;
1480 }
1481 if (n & (1 * sizeof(float))) {
1482 *y = (int8_t) vy_lo;
1483 }
1484 }
1485 }
1486 }
1487
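// Unsigned variant of the conversion above: same scaling, upper clamp and rounding, but the
// 16-bit intermediates are narrowed with the unsigned-saturating PACKUSWB pack and the lower
// clamp is applied after packing with an unsigned byte maximum (PMAXUB).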
1488 void xnn_f32_qu8_vcvt_ukernel__sse2_x32(
1489 size_t n,
1490 const float* x,
1491 uint8_t* y,
1492 const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1493 {
1494 assert(n != 0);
1495 assert(n % sizeof(float) == 0);
1496 assert(x != NULL);
1497 assert(y != NULL);
1498
1499 const __m128 vscale = _mm_load_ps(params->sse2.scale);
1500 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->sse2.output_max_less_zero_point);
1501 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
1502 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
1503
1504 for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
1505 __m128 vx0123 = _mm_loadu_ps(x);
1506 __m128 vx4567 = _mm_loadu_ps(x + 4);
1507 __m128 vx89AB = _mm_loadu_ps(x + 8);
1508 __m128 vxCDEF = _mm_loadu_ps(x + 12);
1509 __m128 vxGHIJ = _mm_loadu_ps(x + 16);
1510 __m128 vxKLMN = _mm_loadu_ps(x + 20);
1511 __m128 vxOPQR = _mm_loadu_ps(x + 24);
1512 __m128 vxSTUV = _mm_loadu_ps(x + 28);
1513 x += 32;
1514
1515 vx0123 = _mm_mul_ps(vx0123, vscale);
1516 vx4567 = _mm_mul_ps(vx4567, vscale);
1517 vx89AB = _mm_mul_ps(vx89AB, vscale);
1518 vxCDEF = _mm_mul_ps(vxCDEF, vscale);
1519 vxGHIJ = _mm_mul_ps(vxGHIJ, vscale);
1520 vxKLMN = _mm_mul_ps(vxKLMN, vscale);
1521 vxOPQR = _mm_mul_ps(vxOPQR, vscale);
1522 vxSTUV = _mm_mul_ps(vxSTUV, vscale);
1523
1524 vx0123 = _mm_min_ps(vx0123, voutput_max_less_zero_point);
1525 vx4567 = _mm_min_ps(vx4567, voutput_max_less_zero_point);
1526 vx89AB = _mm_min_ps(vx89AB, voutput_max_less_zero_point);
1527 vxCDEF = _mm_min_ps(vxCDEF, voutput_max_less_zero_point);
1528 vxGHIJ = _mm_min_ps(vxGHIJ, voutput_max_less_zero_point);
1529 vxKLMN = _mm_min_ps(vxKLMN, voutput_max_less_zero_point);
1530 vxOPQR = _mm_min_ps(vxOPQR, voutput_max_less_zero_point);
1531 vxSTUV = _mm_min_ps(vxSTUV, voutput_max_less_zero_point);
1532
1533 const __m128i vy0123 = _mm_cvtps_epi32(vx0123);
1534 const __m128i vy4567 = _mm_cvtps_epi32(vx4567);
1535 const __m128i vy89AB = _mm_cvtps_epi32(vx89AB);
1536 const __m128i vyCDEF = _mm_cvtps_epi32(vxCDEF);
1537 const __m128i vyGHIJ = _mm_cvtps_epi32(vxGHIJ);
1538 const __m128i vyKLMN = _mm_cvtps_epi32(vxKLMN);
1539 const __m128i vyOPQR = _mm_cvtps_epi32(vxOPQR);
1540 const __m128i vySTUV = _mm_cvtps_epi32(vxSTUV);
1541
1542 __m128i vy01234567 = _mm_packs_epi32(vy0123, vy4567);
1543 __m128i vy89ABCDEF = _mm_packs_epi32(vy89AB, vyCDEF);
1544 __m128i vyGHIJKLMN = _mm_packs_epi32(vyGHIJ, vyKLMN);
1545 __m128i vyOPQRSTUV = _mm_packs_epi32(vyOPQR, vySTUV);
1546
1547 vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
1548 vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
1549 vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
1550 vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
1551
1552
1553 __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
1554 __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV);
1555
1556 vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
1557 vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min);
1558
1559 _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
1560 _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
1561 y += 32;
1562 }
1563 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1564 __m128 vx_lo = _mm_loadu_ps(x);
1565 __m128 vx_hi = _mm_loadu_ps(x + 4);
1566 x += 8;
1567
1568 vx_lo = _mm_mul_ps(vx_lo, vscale);
1569 vx_hi = _mm_mul_ps(vx_hi, vscale);
1570
1571 vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
1572 vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);
1573
1574 const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
1575 const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);
1576
1577 __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
1578 vy = _mm_adds_epi16(vy, voutput_zero_point);
1579 vy = _mm_packus_epi16(vy, vy);
1580 vy = _mm_max_epu8(vy, voutput_min);
1581
1582 _mm_storel_epi64((__m128i*) y, vy);
1583 y += 8;
1584 }
1585 if XNN_UNLIKELY(n != 0) {
1586 __m128 vx_lo = _mm_loadu_ps(x);
1587 const float* x_hi = (const float*) ((uintptr_t) x + (n & (4 * sizeof(float))));
1588 __m128 vx_hi = _mm_loadu_ps(x_hi);
1589
1590 vx_lo = _mm_mul_ps(vx_lo, vscale);
1591 vx_hi = _mm_mul_ps(vx_hi, vscale);
1592
1593 vx_lo = _mm_min_ps(vx_lo, voutput_max_less_zero_point);
1594 vx_hi = _mm_min_ps(vx_hi, voutput_max_less_zero_point);
1595
1596 const __m128i vy_lo = _mm_cvtps_epi32(vx_lo);
1597 const __m128i vy_hi = _mm_cvtps_epi32(vx_hi);
1598
1599 __m128i vy = _mm_packs_epi32(vy_lo, vy_hi);
1600 vy = _mm_adds_epi16(vy, voutput_zero_point);
1601 vy = _mm_packus_epi16(vy, vy);
1602 vy = _mm_max_epu8(vy, voutput_min);
1603
1604 if (n & (4 * sizeof(float))) {
1605 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
1606 y += 4;
1607 vy = _mm_srli_epi64(vy, 32);
1608 }
1609 {
1610 uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
1611 if (n & (2 * sizeof(float))) {
1612 unaligned_store_u16(y, (uint16_t) vy_lo);
1613 y += 2;
1614 vy_lo >>= 16;
1615 }
1616 if (n & (1 * sizeof(float))) {
1617 *y = (uint8_t) vy_lo;
1618 }
1619 }
1620 }
1621 }
1622
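// Computes out[i] = exp(in[i] - max) and accumulates the sum of all outputs into *sum; this is
// the shifted-exponentials pass of a softmax. exp() follows the rr2_p5 scheme: n = round(x * log2(e))
// is extracted with the magic-bias trick, the scale s = 2**n is built by shifting n into the float
// exponent field, the residual t = x - n * ln(2) uses two-constant (Cody-Waite) range reduction,
// exp(t) is a degree-5 polynomial p, and the result is reconstructed as f = s + (t * s) * p.
// Inputs far enough below the running maximum are flushed to +0.0f.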
1623 void xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2(
1624 size_t elements,
1625 const float* input,
1626 const float* max,
1627 float* output,
1628 float* sum,
1629 const union xnn_f32_expminus_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1630 {
1631 assert(elements % sizeof(float) == 0);
1632
1633 const __m128 vi_max = _mm_load1_ps(max);
1634 const __m128 vlog2e = _mm_load_ps(params->sse2_rr2_p5.log2e);
1635 const __m128 vmagic_bias = _mm_load_ps(params->sse2_rr2_p5.magic_bias);
1636 const __m128 vminus_ln2_hi = _mm_load_ps(params->sse2_rr2_p5.minus_ln2_hi);
1637 const __m128 vminus_ln2_lo = _mm_load_ps(params->sse2_rr2_p5.minus_ln2_lo);
1638 const __m128 vc5 = _mm_load_ps(params->sse2_rr2_p5.c5);
1639 const __m128 vc4 = _mm_load_ps(params->sse2_rr2_p5.c4);
1640 const __m128 vc3 = _mm_load_ps(params->sse2_rr2_p5.c3);
1641 const __m128 vc2 = _mm_load_ps(params->sse2_rr2_p5.c2);
1642 const __m128 vc1 = _mm_load_ps(params->sse2_rr2_p5.c1);
1643 const __m128 vdenorm_cutoff = _mm_load_ps(params->sse2_rr2_p5.denorm_cutoff);
1644
1645 __m128 vacc0 = _mm_setzero_ps();
1646 __m128 vacc1 = _mm_setzero_ps();
1647 for (; elements >= 20 * sizeof(float); elements -= 20 * sizeof(float)) {
1648 // Load 20 (5x4) inputs at a time.
1649 const __m128 vi0123 = _mm_loadu_ps(input);
1650 const __m128 vi4567 = _mm_loadu_ps(input + 4);
1651 const __m128 vi89AB = _mm_loadu_ps(input + 8);
1652 const __m128 viCDEF = _mm_loadu_ps(input + 12);
1653 const __m128 viGHIJ = _mm_loadu_ps(input + 16);
1654 input += 20;
1655
1656 // Subtract maximum input x := i - i_max. This implies x <= 0.
1657 const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
1658 const __m128 vx4567 = _mm_sub_ps(vi4567, vi_max);
1659 const __m128 vx89AB = _mm_sub_ps(vi89AB, vi_max);
1660 const __m128 vxCDEF = _mm_sub_ps(viCDEF, vi_max);
1661 const __m128 vxGHIJ = _mm_sub_ps(viGHIJ, vi_max);
1662
1663 // Compute reduced argument n := round(x / log(2)).
1664 __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
1665 __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vx4567, vlog2e), vmagic_bias);
1666 __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vx89AB, vlog2e), vmagic_bias);
1667 __m128 vnCDEF = _mm_add_ps(_mm_mul_ps(vxCDEF, vlog2e), vmagic_bias);
1668 __m128 vnGHIJ = _mm_add_ps(_mm_mul_ps(vxGHIJ, vlog2e), vmagic_bias);
1669
1670 // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
1671 // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
1672 const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
1673 const __m128 vs4567 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn4567), 23));
1674 const __m128 vs89AB = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn89AB), 23));
1675 const __m128 vsCDEF = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnCDEF), 23));
1676 const __m128 vsGHIJ = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vnGHIJ), 23));
1677
1678 // Subtract the large number back to get final n := round(x / log(2)).
1679 vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
1680 vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
1681 vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
1682 vnCDEF = _mm_sub_ps(vnCDEF, vmagic_bias);
1683 vnGHIJ = _mm_sub_ps(vnGHIJ, vmagic_bias);
1684
1685 // Compute reduced argument t := x - n * log(2).
1686 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
1687 __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
1688 __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vx4567);
1689 __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vx89AB);
1690 __m128 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_hi), vxCDEF);
1691 __m128 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_hi), vxGHIJ);
1692
1693 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
1694 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
1695 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
1696 vtCDEF = _mm_add_ps(_mm_mul_ps(vnCDEF, vminus_ln2_lo), vtCDEF);
1697 vtGHIJ = _mm_add_ps(_mm_mul_ps(vnGHIJ, vminus_ln2_lo), vtGHIJ);
1698
1699 // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
1700 __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
1701 __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc5, vt4567), vc4);
1702 __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc5, vt89AB), vc4);
1703 __m128 vpCDEF = _mm_add_ps(_mm_mul_ps(vc5, vtCDEF), vc4);
1704 __m128 vpGHIJ = _mm_add_ps(_mm_mul_ps(vc5, vtGHIJ), vc4);
1705
1706 vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
1707 vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc3);
1708 vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc3);
1709 vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc3);
1710 vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc3);
1711
1712 vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
1713 vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc2);
1714 vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc2);
1715 vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc2);
1716 vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc2);
1717
1718 vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
1719 vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vc1);
1720 vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vc1);
1721 vpCDEF = _mm_add_ps(_mm_mul_ps(vpCDEF, vtCDEF), vc1);
1722 vpGHIJ = _mm_add_ps(_mm_mul_ps(vpGHIJ, vtGHIJ), vc1);
1723
1724 // Reconstruct the final f value:
1725 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
1726 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
1727 // = s + (t * s) * p
1728 vt0123 = _mm_mul_ps(vt0123, vs0123);
1729 vt4567 = _mm_mul_ps(vt4567, vs4567);
1730 vt89AB = _mm_mul_ps(vt89AB, vs89AB);
1731 vtCDEF = _mm_mul_ps(vtCDEF, vsCDEF);
1732 vtGHIJ = _mm_mul_ps(vtGHIJ, vsGHIJ);
1733
1734 __m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
1735 __m128 vf4567 = _mm_add_ps(_mm_mul_ps(vt4567, vp4567), vs4567);
1736 __m128 vf89AB = _mm_add_ps(_mm_mul_ps(vt89AB, vp89AB), vs89AB);
1737 __m128 vfCDEF = _mm_add_ps(_mm_mul_ps(vtCDEF, vpCDEF), vsCDEF);
1738 __m128 vfGHIJ = _mm_add_ps(_mm_mul_ps(vtGHIJ, vpGHIJ), vsGHIJ);
1739
1740 // For inputs below zero cutoff, replace output with +0.0f.
1741 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
1742 vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
1743 vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vx4567, vdenorm_cutoff), vf4567);
1744 vf89AB = _mm_andnot_ps(_mm_cmplt_ps(vx89AB, vdenorm_cutoff), vf89AB);
1745 vfCDEF = _mm_andnot_ps(_mm_cmplt_ps(vxCDEF, vdenorm_cutoff), vfCDEF);
1746 vfGHIJ = _mm_andnot_ps(_mm_cmplt_ps(vxGHIJ, vdenorm_cutoff), vfGHIJ);
1747
1748 // Store 20 (5x4) outputs at a time.
1749 _mm_storeu_ps(output, vf0123);
1750 _mm_storeu_ps(output + 4, vf4567);
1751 _mm_storeu_ps(output + 8, vf89AB);
1752 _mm_storeu_ps(output + 12, vfCDEF);
1753 _mm_storeu_ps(output + 16, vfGHIJ);
1754 output += 20;
1755
1756 // Accumulate computed exponents.
1757 vacc0 = _mm_add_ps(vacc0, vf0123);
1758 vacc0 = _mm_add_ps(vacc0, vf4567);
1759 vacc0 = _mm_add_ps(vacc0, vf89AB);
1760 vacc0 = _mm_add_ps(vacc0, vfCDEF);
1761 vacc0 = _mm_add_ps(vacc0, vfGHIJ);
1762 }
1763 // Add up all accumulators to vacc0
1764 vacc0 = _mm_add_ps(vacc0, vacc1);
1765
1766 __m128 vacc = vacc0;
1767 for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
1768 // Load 4 inputs at a time.
1769 const __m128 vi = _mm_loadu_ps(input);
1770 input += 4;
1771
1772 // Subtract maximum input x := i - i_max. This implies x <= 0.
1773 const __m128 vx = _mm_sub_ps(vi, vi_max);
1774
1775 // Compute reduced argument n := round(x / log(2)).
1776 __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
1777
1778 // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
1779 // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
1780 const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
1781
1782 // Subtract the large number back to get final n := round(x / log(2)).
1783 vn = _mm_sub_ps(vn, vmagic_bias);
1784
1785 // Compute reduced argument t := x - n * log(2).
1786 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
1787 __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
1788 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
1789
1790 // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
1791 __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
1792 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
1793 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
1794 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
1795
1796 // Reconstruct the final f value:
1797 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
1798 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
1799 // = s + (t * s) * p
1800 vt = _mm_mul_ps(vt, vs);
1801 __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
1802
1803 // For inputs below zero cutoff, replace output with +0.0f.
1804 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
1805 vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
1806
1807 // Store 4 outputs at a time.
1808 _mm_storeu_ps(output, vf);
1809 output += 4;
1810
1811 // Accumulate computed exponents.
1812 vacc = _mm_add_ps(vacc, vf);
1813 }
1814 if (elements != 0) {
1815 assert(elements >= 1 * sizeof(float));
1816 assert(elements <= 3 * sizeof(float));
1817 // Load 4 inputs at a time, of which only the first 1-3 are valid (reading past the end is allowed for this kernel).
1818 const __m128 vi = _mm_loadu_ps(input);
1819
1820 // Subtract maximum input x := i - i_max. This implies x <= 0.
1821 const __m128 vx = _mm_sub_ps(vi, vi_max);
1822
1823 // Compute reduced argument n := round(x / log(2)).
1824 __m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
1825
1826 // Create a floating-point number s (scale) such that s == 2**n for inputs which don't cause underflow, i.e.
1827 // -87.33642 <= x <= 0.0, and -126 <= n <= 0 accordingly.
1828 const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
1829
1830 // Subtract the large number back to get final n := round(x / log(2)).
1831 vn = _mm_sub_ps(vn, vmagic_bias);
1832
1833 // Compute reduced argument t := x - n * log(2).
1834 // Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
1835 __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
1836 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
1837
1838 // Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
1839 __m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
1840 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
1841 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
1842 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
1843
1844 // Reconstruct the final f value:
1845 // f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
1846 // = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
1847 // = s + (t * s) * p
1848 vt = _mm_mul_ps(vt, vs);
1849 __m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
1850
1851 // For inputs below zero cutoff, replace output with +0.0f.
1852 // Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
1853 vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
1854
1855 if (elements & (2 * sizeof(float))) {
1856 // Store 2 outputs at a time.
1857 _mm_storel_pi((__m64*) output, vf);
1858 output += 2;
1859
1860 // Accumulate 2 computed exponents.
1861 vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
1862
1863 vf = _mm_movehl_ps(vf, vf);
1864 }
1865 if (elements & (1 * sizeof(float))) {
1866 // Store 1 output at a time.
1867 _mm_store_ss(output, vf);
1868
1869 // Accumulate 1 computed exponent.
1870 vacc = _mm_add_ss(vacc, vf);
1871 }
1872 }
1873 // Reduce 4 elements in the SIMD register
1874 vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
1875 vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
1876 _mm_store_ss(sum, vacc);
1877 }
1878
1879 extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_16[16];
1880
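// ELU: y = alpha * (exp(x * prescale) - 1) for negative x, y = x * beta otherwise. exp() uses a
// 16-entry table of 2**(-k/16): the low 4 bits of the magic-bias-rounded n select a table entry,
// the remaining bits are shifted into the float exponent field (the << 19 below) and added to the
// table entry's bit pattern to form the scale s, so only a degree-3 polynomial in the reduced
// argument is needed. Clamping at sat_cutoff keeps exp() from underflowing for very negative inputs.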
1881 void xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12(
1882 size_t n,
1883 const float* x,
1884 float* y,
1885 const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1886 {
1887 assert(n != 0);
1888 assert(n % sizeof(float) == 0);
1889 assert(x != NULL);
1890 assert(y != NULL);
1891
1892 const __m128 vprescale = _mm_load_ps(params->sse2_rr2_lut16_p3.prescale);
1893 const __m128 valpha = _mm_load_ps(params->sse2_rr2_lut16_p3.alpha);
1894 const __m128 vbeta = _mm_load_ps(params->sse2_rr2_lut16_p3.beta);
1895 const __m128 vsat_cutoff = _mm_load_ps(params->sse2_rr2_lut16_p3.sat_cutoff);
1896 const __m128 vmagic_bias = _mm_load_ps(params->sse2_rr2_lut16_p3.magic_bias);
1897 const __m128 vlog2e = _mm_load_ps(params->sse2_rr2_lut16_p3.log2e);
1898 const __m128i vindex_mask = _mm_load_si128((const __m128i*) params->sse2_rr2_lut16_p3.index_mask);
1899 const __m128 vminus_ln2_hi = _mm_load_ps(params->sse2_rr2_lut16_p3.minus_ln2_hi);
1900 const __m128 vminus_ln2_lo = _mm_load_ps(params->sse2_rr2_lut16_p3.minus_ln2_lo);
1901 const __m128 vc3 = _mm_load_ps(params->sse2_rr2_lut16_p3.c3);
1902 const __m128 vc2 = _mm_load_ps(params->sse2_rr2_lut16_p3.c2);
1903 const __m128 vone = _mm_load_ps(params->sse2_rr2_lut16_p3.one);
1904
1905 for (; n >= 12 * sizeof(float); n -= 12 * sizeof(float)) {
1906 __m128 vx0123 = _mm_loadu_ps(x);
1907 __m128 vx4567 = _mm_loadu_ps(x + 4);
1908 __m128 vx89AB = _mm_loadu_ps(x + 8);
1909 x += 12;
1910
1911 const __m128 vz0123 = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vx0123, vprescale));
1912 const __m128 vz4567 = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vx4567, vprescale));
1913 const __m128 vz89AB = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vx89AB, vprescale));
1914
1915 __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
1916 __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
1917 __m128 vn89AB = _mm_add_ps(_mm_mul_ps(vz89AB, vlog2e), vmagic_bias);
1918
1919 const __m128i vidx0123 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn0123), vindex_mask), 2);
1920 const __m128i ven0123 = _mm_slli_epi32(_mm_castps_si128(vn0123), 19);
1921 const __m128i vidx4567 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn4567), vindex_mask), 2);
1922 const __m128i ven4567 = _mm_slli_epi32(_mm_castps_si128(vn4567), 19);
1923 const __m128i vidx89AB = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn89AB), vindex_mask), 2);
1924 const __m128i ven89AB = _mm_slli_epi32(_mm_castps_si128(vn89AB), 19);
1925
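    // SSE2 has no gather instruction, so the per-lane table lookups are done with scalar loads:
    // on x86-64 two 64-bit extractions yield two indices each, on 32-bit x86 the indices are pulled
    // out with PEXTRW, and the loaded 32-bit entries are recombined with unpack instructions.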
1926 #if XNN_ARCH_X86_64
1927 const uint64_t vidx01 = (uint64_t) _mm_cvtsi128_si64(vidx0123);
1928 const uint64_t vidx23 = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx0123, vidx0123));
1929 const __m128i vl0 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx01)));
1930 const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx23)));
1931 const __m128i vl1 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx01 >> 32))));
1932 const __m128i vl01 = _mm_unpacklo_epi32(vl0, vl1);
1933 const __m128i vl3 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx23 >> 32))));
1934 const __m128i vl23 = _mm_unpacklo_epi32(vl2, vl3);
1935 const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
1936 const uint64_t vidx45 = (uint64_t) _mm_cvtsi128_si64(vidx4567);
1937 const uint64_t vidx67 = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx4567, vidx4567));
1938 const __m128i vl4 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx45)));
1939 const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx67)));
1940 const __m128i vl5 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx45 >> 32))));
1941 const __m128i vl45 = _mm_unpacklo_epi32(vl4, vl5);
1942 const __m128i vl7 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx67 >> 32))));
1943 const __m128i vl67 = _mm_unpacklo_epi32(vl6, vl7);
1944 const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
1945 const uint64_t vidx89 = (uint64_t) _mm_cvtsi128_si64(vidx89AB);
1946 const uint64_t vidxAB = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx89AB, vidx89AB));
1947 const __m128i vl8 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx89)));
1948 const __m128i vlA = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidxAB)));
1949 const __m128i vl9 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx89 >> 32))));
1950 const __m128i vl89 = _mm_unpacklo_epi32(vl8, vl9);
1951 const __m128i vlB = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidxAB >> 32))));
1952 const __m128i vlAB = _mm_unpacklo_epi32(vlA, vlB);
1953 const __m128i vl89AB = _mm_unpacklo_epi64(vl89, vlAB);
1954 #else // !XNN_ARCH_X86_64
1955 const uint32_t vidx0 = (uint32_t) _mm_cvtsi128_si32(vidx0123);
1956 const uint32_t vidx1 = (uint32_t) _mm_extract_epi16(vidx0123, 2);
1957 const uint32_t vidx2 = (uint32_t) _mm_extract_epi16(vidx0123, 4);
1958 const uint32_t vidx3 = (uint32_t) _mm_extract_epi16(vidx0123, 6);
1959 const __m128i vl0 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx0)));
1960 const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx2)));
1961 const __m128i vl1 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx1)));
1962 const __m128i vl01 = _mm_unpacklo_epi32(vl0, vl1);
1963 const __m128i vl3 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx3)));
1964 const __m128i vl23 = _mm_unpacklo_epi32(vl2, vl3);
1965 const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
1966 const uint32_t vidx4 = (uint32_t) _mm_cvtsi128_si32(vidx4567);
1967 const uint32_t vidx5 = (uint32_t) _mm_extract_epi16(vidx4567, 2);
1968 const uint32_t vidx6 = (uint32_t) _mm_extract_epi16(vidx4567, 4);
1969 const uint32_t vidx7 = (uint32_t) _mm_extract_epi16(vidx4567, 6);
1970 const __m128i vl4 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx4)));
1971 const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx6)));
1972 const __m128i vl5 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx5)));
1973 const __m128i vl45 = _mm_unpacklo_epi32(vl4, vl5);
1974 const __m128i vl7 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx7)));
1975 const __m128i vl67 = _mm_unpacklo_epi32(vl6, vl7);
1976 const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
1977 const uint32_t vidx8 = (uint32_t) _mm_cvtsi128_si32(vidx89AB);
1978 const uint32_t vidx9 = (uint32_t) _mm_extract_epi16(vidx89AB, 2);
1979 const uint32_t vidxA = (uint32_t) _mm_extract_epi16(vidx89AB, 4);
1980 const uint32_t vidxB = (uint32_t) _mm_extract_epi16(vidx89AB, 6);
1981 const __m128i vl8 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx8)));
1982 const __m128i vlA = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidxA)));
1983 const __m128i vl9 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidx9)));
1984 const __m128i vl89 = _mm_unpacklo_epi32(vl8, vl9);
1985 const __m128i vlB = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + vidxB)));
1986 const __m128i vlAB = _mm_unpacklo_epi32(vlA, vlB);
1987 const __m128i vl89AB = _mm_unpacklo_epi64(vl89, vlAB);
1988 #endif // XNN_ARCH_X86_64
1989
1990 vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
1991 __m128 vs0123 = _mm_castsi128_ps(_mm_add_epi32(vl0123, ven0123));
1992 vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
1993 __m128 vs4567 = _mm_castsi128_ps(_mm_add_epi32(vl4567, ven4567));
1994 vn89AB = _mm_sub_ps(vn89AB, vmagic_bias);
1995 __m128 vs89AB = _mm_castsi128_ps(_mm_add_epi32(vl89AB, ven89AB));
1996
1997 __m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vz0123);
1998 __m128 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_hi), vz4567);
1999 __m128 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_hi), vz89AB);
2000
2001 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
2002 vt4567 = _mm_add_ps(_mm_mul_ps(vn4567, vminus_ln2_lo), vt4567);
2003 vt89AB = _mm_add_ps(_mm_mul_ps(vn89AB, vminus_ln2_lo), vt89AB);
2004
2005 __m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc3, vt0123), vc2);
2006 __m128 vp4567 = _mm_add_ps(_mm_mul_ps(vc3, vt4567), vc2);
2007 __m128 vp89AB = _mm_add_ps(_mm_mul_ps(vc3, vt89AB), vc2);
2008
2009 vp0123 = _mm_mul_ps(vp0123, vt0123);
2010 vp4567 = _mm_mul_ps(vp4567, vt4567);
2011 vp89AB = _mm_mul_ps(vp89AB, vt89AB);
2012
2013 vt0123 = _mm_mul_ps(vt0123, vs0123);
2014 vs0123 = _mm_sub_ps(vs0123, vone);
2015 vt4567 = _mm_mul_ps(vt4567, vs4567);
2016 vs4567 = _mm_sub_ps(vs4567, vone);
2017 vt89AB = _mm_mul_ps(vt89AB, vs89AB);
2018 vs89AB = _mm_sub_ps(vs89AB, vone);
2019
2020 vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vt0123);
2021 vp4567 = _mm_add_ps(_mm_mul_ps(vp4567, vt4567), vt4567);
2022 vp89AB = _mm_add_ps(_mm_mul_ps(vp89AB, vt89AB), vt89AB);
2023
2024 const __m128 ve0123 = _mm_mul_ps(_mm_add_ps(vp0123, vs0123), valpha);
2025 const __m128 ve4567 = _mm_mul_ps(_mm_add_ps(vp4567, vs4567), valpha);
2026 const __m128 ve89AB = _mm_mul_ps(_mm_add_ps(vp89AB, vs89AB), valpha);
2027
2028 const __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
2029 vx0123 = _mm_mul_ps(vx0123, vbeta);
2030 const __m128 vm4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
2031 vx4567 = _mm_mul_ps(vx4567, vbeta);
2032 const __m128 vm89AB = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx89AB)));
2033 vx89AB = _mm_mul_ps(vx89AB, vbeta);
2034
2035 const __m128 vy0123 = _mm_or_ps(_mm_and_ps(ve0123, vm0123), _mm_andnot_ps(vm0123, vx0123));
2036 const __m128 vy4567 = _mm_or_ps(_mm_and_ps(ve4567, vm4567), _mm_andnot_ps(vm4567, vx4567));
2037 const __m128 vy89AB = _mm_or_ps(_mm_and_ps(ve89AB, vm89AB), _mm_andnot_ps(vm89AB, vx89AB));
2038
2039 _mm_storeu_ps(y, vy0123);
2040 _mm_storeu_ps(y + 4, vy4567);
2041 _mm_storeu_ps(y + 8, vy89AB);
2042 y += 12;
2043 }
2044 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
2045 __m128 vx = _mm_loadu_ps(x);
2046 x += 4;
2047
2048 const __m128 vz = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vx, vprescale));
2049
2050 __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
2051
2052 const __m128i ven = _mm_slli_epi32(_mm_castps_si128(vn), 19);
2053 const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
2054 #if XNN_ARCH_X86_64
2055 const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
2056 const uint64_t vidx_hi = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx, vidx));
2057 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)));
2058 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)));
2059 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32))));
2060 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2061 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32))));
2062 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2063 #else // !XNN_ARCH_X86_64
2064 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_cvtsi128_si32(vidx))));
2065 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi16(vidx, 4))));
2066 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi16(vidx, 2))));
2067 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2068 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi16(vidx, 6))));
2069 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2070 #endif // XNN_ARCH_X86_64
2071 const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);
2072 __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ven));
2073 vn = _mm_sub_ps(vn, vmagic_bias);
2074
2075 __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
2076 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
2077
2078 __m128 vp = _mm_add_ps(_mm_mul_ps(vc3, vt), vc2);
2079 vp = _mm_mul_ps(vp, vt);
2080
2081 vt = _mm_mul_ps(vt, vs);
2082 vs = _mm_sub_ps(vs, vone);
2083 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vt);
2084 const __m128 ve = _mm_mul_ps(_mm_add_ps(vp, vs), valpha);
2085
2086 const __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
2087 vx = _mm_mul_ps(vx, vbeta);
2088 const __m128 vy = _mm_or_ps(_mm_and_ps(ve, vm), _mm_andnot_ps(vm, vx));
2089
2090 _mm_storeu_ps(y, vy);
2091 y += 4;
2092 }
2093 if XNN_UNLIKELY(n != 0) {
2094 __m128 vx = _mm_loadu_ps(x);
2095
2096 const __m128 vz = _mm_max_ps(vsat_cutoff, _mm_mul_ps(vx, vprescale));
2097
2098 __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
2099
2100 const __m128i ven = _mm_slli_epi32(_mm_castps_si128(vn), 19);
2101 const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
2102 #if XNN_ARCH_X86_64
2103 const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
2104 const uint64_t vidx_hi = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx, vidx));
2105 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo)));
2106 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi)));
2107 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32))));
2108 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2109 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32))));
2110 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2111 #else // !XNN_ARCH_X86_64
2112 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_cvtsi128_si32(vidx))));
2113 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi16(vidx, 4))));
2114 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi16(vidx, 2))));
2115 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2116 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) _mm_extract_epi16(vidx, 6))));
2117 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2118 #endif // XNN_ARCH_X86_64
2119 const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);
2120 __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ven));
2121 vn = _mm_sub_ps(vn, vmagic_bias);
2122
2123 __m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vz);
2124 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
2125
2126 __m128 vp = _mm_add_ps(_mm_mul_ps(vc3, vt), vc2);
2127 vp = _mm_mul_ps(vp, vt);
2128
2129 vt = _mm_mul_ps(vt, vs);
2130 vs = _mm_sub_ps(vs, vone);
2131 vp = _mm_add_ps(_mm_mul_ps(vp, vt), vt);
2132 const __m128 ve = _mm_mul_ps(_mm_add_ps(vp, vs), valpha);
2133
2134 const __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
2135 vx = _mm_mul_ps(vx, vbeta);
2136 __m128 vy = _mm_or_ps(_mm_and_ps(ve, vm), _mm_andnot_ps(vm, vx));
2137
2138 if (n & (2 * sizeof(float))) {
2139 _mm_storel_pi((__m64*) y, vy);
2140 vy = _mm_movehl_ps(vy, vy);
2141 y += 2;
2142 }
2143 if (n & (1 * sizeof(float))) {
2144 _mm_store_ss(y, vy);
2145 }
2146 }
2147 }
2148
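// LeakyReLU: y = x for x >= 0, y = x * slope for x < 0. SSE2 has no variable blend instruction,
// so the negative-lane mask is built by a signed integer compare of the float bit pattern against
// zero (true for any value with the sign bit set) and the two candidates are merged with
// AND/ANDNOT/OR.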
2149 void xnn_f32_vlrelu_ukernel__sse2_x8(
2150 size_t n,
2151 const float* x,
2152 float* y,
2153 const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2154 {
2155 assert(n != 0);
2156 assert(n % sizeof(float) == 0);
2157
2158 const __m128 vslope = _mm_load_ps(params->sse.slope);
2159 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2160 const __m128 vx0123 = _mm_loadu_ps(x);
2161 const __m128 vx4567 = _mm_loadu_ps(x + 4);
2162 x += 8;
2163
2164 __m128 vacc0123 = _mm_mul_ps(vx0123, vslope);
2165 const __m128 vmask0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
2166 __m128 vacc4567 = _mm_mul_ps(vx4567, vslope);
2167 const __m128 vmask4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
2168
2169 vacc0123 = _mm_or_ps(_mm_and_ps(vacc0123, vmask0123), _mm_andnot_ps(vmask0123, vx0123));
2170 vacc4567 = _mm_or_ps(_mm_and_ps(vacc4567, vmask4567), _mm_andnot_ps(vmask4567, vx4567));
2171
2172 _mm_storeu_ps(y, vacc0123);
2173 _mm_storeu_ps(y + 4, vacc4567);
2174 y += 8;
2175 }
2176 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
2177 const __m128 vx = _mm_loadu_ps(x);
2178 x += 4;
2179
2180 __m128 vacc = _mm_mul_ps(vx, vslope);
2181 const __m128 vmask = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
2182 vacc = _mm_or_ps(_mm_and_ps(vacc, vmask), _mm_andnot_ps(vmask, vx));
2183
2184 _mm_storeu_ps(y, vacc);
2185 y += 4;
2186 }
2187 if XNN_UNLIKELY(n != 0) {
2188 const __m128 vx = _mm_loadu_ps(x);
2189
2190 __m128 vacc = _mm_mul_ps(vx, vslope);
2191 const __m128 vmask = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
2192 vacc = _mm_or_ps(_mm_and_ps(vacc, vmask), _mm_andnot_ps(vmask, vx));
2193
2194 if (n & (2 * sizeof(float))) {
2195 _mm_storel_pi((__m64*) y, vacc);
2196 vacc = _mm_movehl_ps(vacc, vacc);
2197 y += 2;
2198 }
2199 if (n & (1 * sizeof(float))) {
2200 _mm_store_ss(y, vacc);
2201 }
2202 }
2203 }
2204
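// Rounds towards negative infinity (floor). The input is truncated with CVTTPS2DQ; lanes where
// |x| >= 2**31 or x is NaN produce the integer indefinite value 0x80000000, which the compare
// against the magic constant detects so those lanes pass through unchanged. For the remaining
// lanes the sign of x is copied back onto the truncated value (preserving -0.0f) and 1.0f is
// subtracted wherever truncation rounded up, i.e. for negative non-integers.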
2205 void xnn_f32_vrndd_ukernel__sse2_x8(
2206 size_t n,
2207 const float* x,
2208 float* y,
2209 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2210 {
2211 assert(n != 0);
2212 assert(n % sizeof(float) == 0);
2213
2214 const __m128i vmagic = _mm_load_si128((const __m128i*) params->sse2.sign_mask);
2215 const __m128 vone = _mm_load_ps(params->sse2.one);
2216 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2217 const __m128 vx0123 = _mm_loadu_ps(x);
2218 const __m128 vx4567 = _mm_loadu_ps(x + 4);
2219 x += 8;
2220
2221 const __m128i vintx0123 = _mm_cvttps_epi32(vx0123);
2222 const __m128i vintx4567 = _mm_cvttps_epi32(vx4567);
2223
2224 const __m128 vrndmask0123 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx0123, vmagic)));
2225 const __m128 vrndmask4567 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx4567, vmagic)));
2226
2227 const __m128 vprerndx0123 = _mm_cvtepi32_ps(vintx0123);
2228 const __m128 vprerndx4567 = _mm_cvtepi32_ps(vintx4567);
2229
2230 const __m128 vrndx0123 = _mm_or_ps(_mm_and_ps(vx0123, vrndmask0123), _mm_andnot_ps(vrndmask0123, vprerndx0123));
2231 const __m128 vrndx4567 = _mm_or_ps(_mm_and_ps(vx4567, vrndmask4567), _mm_andnot_ps(vrndmask4567, vprerndx4567));
2232
2233 const __m128 vy0123 = _mm_sub_ps(vrndx0123, _mm_and_ps(_mm_cmpgt_ps(vrndx0123, vx0123), vone));
2234 const __m128 vy4567 = _mm_sub_ps(vrndx4567, _mm_and_ps(_mm_cmpgt_ps(vrndx4567, vx4567), vone));
2235
2236 _mm_storeu_ps(y, vy0123);
2237 _mm_storeu_ps(y + 4, vy4567);
2238 y += 8;
2239 }
2240 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
2241 const __m128 vx = _mm_loadu_ps(x);
2242 x += 4;
2243
2244 const __m128i vintx = _mm_cvttps_epi32(vx);
2245 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2246 const __m128 vprerndx = _mm_cvtepi32_ps(vintx);
2247 const __m128 vrndx = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vprerndx));
2248 const __m128 vy = _mm_sub_ps(vrndx, _mm_and_ps(_mm_cmpgt_ps(vrndx, vx), vone));
2249
2250 _mm_storeu_ps(y, vy);
2251 y += 4;
2252 }
2253 if XNN_UNLIKELY(n != 0) {
2254 const __m128 vx = _mm_loadu_ps(x);
2255 const __m128i vintx = _mm_cvttps_epi32(vx);
2256 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2257 const __m128 vprerndx = _mm_cvtepi32_ps(vintx);
2258 const __m128 vrndx = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vprerndx));
2259 __m128 vy = _mm_sub_ps(vrndx, _mm_and_ps(_mm_cmpgt_ps(vrndx, vx), vone));
2260 if (n & (2 * sizeof(float))) {
2261 _mm_storel_pi((__m64*) y, vy);
2262 vy = _mm_movehl_ps(vy, vy);
2263 y += 2;
2264 }
2265 if (n & (1 * sizeof(float))) {
2266 _mm_store_ss(y, vy);
2267 }
2268 }
2269 }
2270
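// Rounds to the nearest integer with ties to even. CVTPS2DQ already rounds this way under the
// default MXCSR mode, so the kernel only needs the 0x80000000 compare to pass through lanes that
// are outside the 32-bit integer range or NaN.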
2271 void xnn_f32_vrndne_ukernel__sse2_x8(
2272 size_t n,
2273 const float* x,
2274 float* y,
2275 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2276 {
2277 assert(n != 0);
2278 assert(n % sizeof(float) == 0);
2279
2280 const __m128i vmagic = _mm_load_si128((const __m128i*) params->sse2.sign_mask);
2281 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2282 const __m128 vx0123 = _mm_loadu_ps(x);
2283 const __m128 vx4567 = _mm_loadu_ps(x + 4);
2284 x += 8;
2285
2286 const __m128i vintx0123 = _mm_cvtps_epi32(vx0123);
2287 const __m128i vintx4567 = _mm_cvtps_epi32(vx4567);
2288
2289 const __m128 vrndmask0123 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx0123, vmagic)));
2290 const __m128 vrndmask4567 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx4567, vmagic)));
2291
2292 const __m128 vrndx0123 = _mm_cvtepi32_ps(vintx0123);
2293 const __m128 vrndx4567 = _mm_cvtepi32_ps(vintx4567);
2294
2295 const __m128 vy0123 = _mm_or_ps(_mm_and_ps(vx0123, vrndmask0123), _mm_andnot_ps(vrndmask0123, vrndx0123));
2296 const __m128 vy4567 = _mm_or_ps(_mm_and_ps(vx4567, vrndmask4567), _mm_andnot_ps(vrndmask4567, vrndx4567));
2297
2298 _mm_storeu_ps(y, vy0123);
2299 _mm_storeu_ps(y + 4, vy4567);
2300 y += 8;
2301 }
2302 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
2303 const __m128 vx = _mm_loadu_ps(x);
2304 x += 4;
2305
2306 const __m128i vintx = _mm_cvtps_epi32(vx);
2307 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2308 const __m128 vrndx = _mm_cvtepi32_ps(vintx);
2309 const __m128 vy = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vrndx));
2310
2311 _mm_storeu_ps(y, vy);
2312 y += 4;
2313 }
2314 if XNN_UNLIKELY(n != 0) {
2315 const __m128 vx = _mm_loadu_ps(x);
2316 const __m128i vintx = _mm_cvtps_epi32(vx);
2317 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2318 const __m128 vrndx = _mm_cvtepi32_ps(vintx);
2319 __m128 vy = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vrndx));
2320 if (n & (2 * sizeof(float))) {
2321 _mm_storel_pi((__m64*) y, vy);
2322 vy = _mm_movehl_ps(vy, vy);
2323 y += 2;
2324 }
2325 if (n & (1 * sizeof(float))) {
2326 _mm_store_ss(y, vy);
2327 }
2328 }
2329 }
2330
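// Rounds towards positive infinity (ceil). As in the floor kernel, the input is truncated and
// out-of-range/NaN lanes are passed through; the sign of x is copied back onto the truncated value
// (so -0.0f is preserved) and 1.0f is added wherever the truncated value fell below x, i.e. for
// positive non-integers.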
2331 void xnn_f32_vrndu_ukernel__sse2_x8(
2332 size_t n,
2333 const float* x,
2334 float* y,
2335 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2336 {
2337 assert(n != 0);
2338 assert(n % sizeof(float) == 0);
2339
2340 const __m128i vmagic = _mm_load_si128((const __m128i*) params->sse2.sign_mask);
2341 const __m128 vone = _mm_load_ps(params->sse2.one);
2342 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2343 const __m128 vx0123 = _mm_loadu_ps(x);
2344 const __m128 vx4567 = _mm_loadu_ps(x + 4);
2345 x += 8;
2346
2347 const __m128i vintx0123 = _mm_cvttps_epi32(vx0123);
2348 const __m128i vintx4567 = _mm_cvttps_epi32(vx4567);
2349
2350 const __m128 vrndmask0123 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx0123, vmagic)));
2351 const __m128 vrndmask4567 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx4567, vmagic)));
2352
2353 const __m128 vprerndx0123 = _mm_cvtepi32_ps(vintx0123);
2354 const __m128 vprerndx4567 = _mm_cvtepi32_ps(vintx4567);
2355
2356 const __m128 vrndx0123 = _mm_or_ps(_mm_and_ps(vx0123, vrndmask0123), _mm_andnot_ps(vrndmask0123, vprerndx0123));
2357 const __m128 vrndx4567 = _mm_or_ps(_mm_and_ps(vx4567, vrndmask4567), _mm_andnot_ps(vrndmask4567, vprerndx4567));
2358
2359 const __m128 vadjmask0123 = _mm_or_ps(_mm_cmpge_ps(vrndx0123, vx0123), _mm_castsi128_ps(vmagic));
2360 const __m128 vadjmask4567 = _mm_or_ps(_mm_cmpge_ps(vrndx4567, vx4567), _mm_castsi128_ps(vmagic));
2361
2362 const __m128 vadjrndx0123 = _mm_add_ps(vrndx0123, vone);
2363 const __m128 vadjrndx4567 = _mm_add_ps(vrndx4567, vone);
2364
2365 const __m128 vy0123 = _mm_or_ps(_mm_and_ps(vrndx0123, vadjmask0123), _mm_andnot_ps(vadjmask0123, vadjrndx0123));
2366 const __m128 vy4567 = _mm_or_ps(_mm_and_ps(vrndx4567, vadjmask4567), _mm_andnot_ps(vadjmask4567, vadjrndx4567));
2367
2368 _mm_storeu_ps(y, vy0123);
2369 _mm_storeu_ps(y + 4, vy4567);
2370 y += 8;
2371 }
2372 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
2373 const __m128 vx = _mm_loadu_ps(x);
2374 x += 4;
2375
2376 const __m128i vintx = _mm_cvttps_epi32(vx);
2377 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2378 const __m128 vprerndx = _mm_cvtepi32_ps(vintx);
2379 const __m128 vrndx = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vprerndx));
2380 const __m128 vadjmask = _mm_or_ps(_mm_cmpge_ps(vrndx, vx), _mm_castsi128_ps(vmagic));
2381 const __m128 vadjrndx = _mm_add_ps(vrndx, vone);
2382 const __m128 vy = _mm_or_ps(_mm_and_ps(vrndx, vadjmask), _mm_andnot_ps(vadjmask, vadjrndx));
2383
2384 _mm_storeu_ps(y, vy);
2385 y += 4;
2386 }
2387 if XNN_UNLIKELY(n != 0) {
2388 const __m128 vx = _mm_loadu_ps(x);
2389 const __m128i vintx = _mm_cvttps_epi32(vx);
2390 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2391 const __m128 vprerndx = _mm_cvtepi32_ps(vintx);
2392 const __m128 vrndx = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vprerndx));
2393 const __m128 vadjmask = _mm_or_ps(_mm_cmpge_ps(vrndx, vx), _mm_castsi128_ps(vmagic));
2394 const __m128 vadjrndx = _mm_add_ps(vrndx, vone);
2395 __m128 vy = _mm_or_ps(_mm_and_ps(vrndx, vadjmask), _mm_andnot_ps(vadjmask, vadjrndx));
2396 if (n & (2 * sizeof(float))) {
2397 _mm_storel_pi((__m64*) y, vy);
2398 vy = _mm_movehl_ps(vy, vy);
2399 y += 2;
2400 }
2401 if (n & (1 * sizeof(float))) {
2402 _mm_store_ss(y, vy);
2403 }
2404 }
2405 }
2406
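// Rounds towards zero (truncation). CVTTPS2DQ performs the truncation directly; the only extra
// work is the 0x80000000 compare that passes lanes outside the 32-bit integer range and NaN lanes
// through unchanged.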
2407 void xnn_f32_vrndz_ukernel__sse2_x8(
2408 size_t n,
2409 const float* x,
2410 float* y,
2411 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2412 {
2413 assert(n != 0);
2414 assert(n % sizeof(float) == 0);
2415
2416 const __m128i vmagic = _mm_load_si128((const __m128i*) params->sse2.sign_mask);
2417 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2418 const __m128 vx0123 = _mm_loadu_ps(x);
2419 const __m128 vx4567 = _mm_loadu_ps(x + 4);
2420 x += 8;
2421
2422 const __m128i vintx0123 = _mm_cvttps_epi32(vx0123);
2423 const __m128i vintx4567 = _mm_cvttps_epi32(vx4567);
2424
2425 const __m128 vrndmask0123 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx0123, vmagic)));
2426 const __m128 vrndmask4567 = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx4567, vmagic)));
2427
2428 const __m128 vrndx0123 = _mm_cvtepi32_ps(vintx0123);
2429 const __m128 vrndx4567 = _mm_cvtepi32_ps(vintx4567);
2430
2431 const __m128 vy0123 = _mm_or_ps(_mm_and_ps(vx0123, vrndmask0123), _mm_andnot_ps(vrndmask0123, vrndx0123));
2432 const __m128 vy4567 = _mm_or_ps(_mm_and_ps(vx4567, vrndmask4567), _mm_andnot_ps(vrndmask4567, vrndx4567));
2433
2434 _mm_storeu_ps(y, vy0123);
2435 _mm_storeu_ps(y + 4, vy4567);
2436 y += 8;
2437 }
2438 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
2439 const __m128 vx = _mm_loadu_ps(x);
2440 x += 4;
2441
2442 const __m128i vintx = _mm_cvttps_epi32(vx);
2443 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2444 const __m128 vrndx = _mm_cvtepi32_ps(vintx);
2445 const __m128 vy = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vrndx));
2446
2447 _mm_storeu_ps(y, vy);
2448 y += 4;
2449 }
2450 if XNN_UNLIKELY(n != 0) {
2451 const __m128 vx = _mm_loadu_ps(x);
2452 const __m128i vintx = _mm_cvttps_epi32(vx);
2453 const __m128 vrndmask = _mm_castsi128_ps(_mm_or_si128(vmagic, _mm_cmpeq_epi32(vintx, vmagic)));
2454 const __m128 vrndx = _mm_cvtepi32_ps(vintx);
2455 __m128 vy = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vrndx));
2456 if (n & (2 * sizeof(float))) {
2457 _mm_storel_pi((__m64*) y, vy);
2458 vy = _mm_movehl_ps(vy, vy);
2459 y += 2;
2460 }
2461 if (n & (1 * sizeof(float))) {
2462 _mm_store_ss(y, vy);
2463 }
2464 }
2465 }
2466
2467 extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_64[64];
2468
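// Sigmoid: f(x) = 1 / (1 + exp(-x)). The kernel evaluates e = exp(z) for z = -|x| with the
// rr2_lut64_p2 scheme (64-entry 2**(-k/64) table, two-constant range reduction, degree-2
// polynomial), forms f = e / (e + 1) with a full division, flushes lanes below the denormal
// cutoff to zero, and finally uses the sign of x to select between f (negative x) and 1 - f
// (non-negative x).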
2469 void xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8(
2470 size_t n,
2471 const float* x,
2472 float* y,
2473 const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2474 {
2475 assert(n % sizeof(float) == 0);
2476
2477 const __m128 vsign_mask = _mm_load_ps(params->sse2_rr2_lut64_p2.sign_mask);
2478 const __m128 vmagic_bias = _mm_load_ps(params->sse2_rr2_lut64_p2.magic_bias);
2479 const __m128 vlog2e = _mm_load_ps(params->sse2_rr2_lut64_p2.log2e);
2480 const __m128i vindex_mask = _mm_load_si128((const __m128i*) params->sse2_rr2_lut64_p2.index_mask);
2481 const __m128 vminus_ln2_hi = _mm_load_ps(params->sse2_rr2_lut64_p2.minus_ln2_hi);
2482 const __m128 vminus_ln2_lo = _mm_load_ps(params->sse2_rr2_lut64_p2.minus_ln2_lo);
2483 const __m128 vc2 = _mm_load_ps(params->sse2_rr2_lut64_p2.c2);
2484 const __m128 vone = _mm_load_ps(params->sse2_rr2_lut64_p2.one);
2485 const __m128 vdenorm_cutoff = _mm_load_ps(params->sse2_rr2_lut64_p2.denorm_cutoff);
2486
2487 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2488 const __m128 vx0123 = _mm_loadu_ps(x);
2489 const __m128 vx4567 = _mm_loadu_ps(x + 4);
2490 x += 8;
2491
2492 const __m128 vz0123 = _mm_or_ps(vx0123, vsign_mask);
2493 const __m128 vz4567 = _mm_or_ps(vx4567, vsign_mask);
2494
2495 __m128 vn0123 = _mm_add_ps(_mm_mul_ps(vz0123, vlog2e), vmagic_bias);
2496 __m128 vn4567 = _mm_add_ps(_mm_mul_ps(vz4567, vlog2e), vmagic_bias);
2497
2498 const __m128i ve0123 = _mm_slli_epi32(_mm_castps_si128(vn0123), 17);
2499 const __m128i ve4567 = _mm_slli_epi32(_mm_castps_si128(vn4567), 17);
2500
2501 const __m128i vidx0123 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn0123), vindex_mask), 2);
2502 const __m128i vidx4567 = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn4567), vindex_mask), 2);
2503
2504 #if XNN_ARCH_X86_64
2505 const uint64_t vidx01 = (uint64_t) _mm_cvtsi128_si64(vidx0123);
2506 const uint64_t vidx23 = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx0123, vidx0123));
2507 const __m128i vl0 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx01)));
2508 const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx23)));
2509 const __m128i vl1 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx01 >> 32))));
2510 const __m128i vl01 = _mm_unpacklo_epi32(vl0, vl1);
2511 const __m128i vl3 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx23 >> 32))));
2512 const __m128i vl23 = _mm_unpacklo_epi32(vl2, vl3);
2513 const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
2514 const uint64_t vidx45 = (uint64_t) _mm_cvtsi128_si64(vidx4567);
2515 const uint64_t vidx67 = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx4567, vidx4567));
2516 const __m128i vl4 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx45)));
2517 const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx67)));
2518 const __m128i vl5 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx45 >> 32))));
2519 const __m128i vl45 = _mm_unpacklo_epi32(vl4, vl5);
2520 const __m128i vl7 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx67 >> 32))));
2521 const __m128i vl67 = _mm_unpacklo_epi32(vl6, vl7);
2522 const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
2523 #else // !XNN_ARCH_X86_64
2524 const uint32_t vidx0 = (uint32_t) _mm_cvtsi128_si32(vidx0123);
2525 const uint32_t vidx1 = (uint32_t) _mm_extract_epi16(vidx0123, 2);
2526 const uint32_t vidx2 = (uint32_t) _mm_extract_epi16(vidx0123, 4);
2527 const uint32_t vidx3 = (uint32_t) _mm_extract_epi16(vidx0123, 6);
2528 const __m128i vl0 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx0)));
2529 const __m128i vl2 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx2)));
2530 const __m128i vl1 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx1)));
2531 const __m128i vl01 = _mm_unpacklo_epi32(vl0, vl1);
2532 const __m128i vl3 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx3)));
2533 const __m128i vl23 = _mm_unpacklo_epi32(vl2, vl3);
2534 const __m128i vl0123 = _mm_unpacklo_epi64(vl01, vl23);
2535 const uint32_t vidx4 = (uint32_t) _mm_cvtsi128_si32(vidx4567);
2536 const uint32_t vidx5 = (uint32_t) _mm_extract_epi16(vidx4567, 2);
2537 const uint32_t vidx6 = (uint32_t) _mm_extract_epi16(vidx4567, 4);
2538 const uint32_t vidx7 = (uint32_t) _mm_extract_epi16(vidx4567, 6);
2539 const __m128i vl4 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx4)));
2540 const __m128i vl6 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx6)));
2541 const __m128i vl5 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx5)));
2542 const __m128i vl45 = _mm_unpacklo_epi32(vl4, vl5);
2543 const __m128i vl7 = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + vidx7)));
2544 const __m128i vl67 = _mm_unpacklo_epi32(vl6, vl7);
2545 const __m128i vl4567 = _mm_unpacklo_epi64(vl45, vl67);
2546 #endif // XNN_ARCH_X86_64
2547
2548 const __m128 vs0123 = _mm_castsi128_ps(_mm_add_epi32(vl0123, ve0123));
2549 const __m128 vs4567 = _mm_castsi128_ps(_mm_add_epi32(vl4567, ve4567));
2550
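    // Undo the magic bias to recover the rounded scale factor n, then form the
    // reduced argument t = z - n * ln(2)/64, subtracting the constant in hi and
    // lo parts for extra precision.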
2551 vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
2552 vn4567 = _mm_sub_ps(vn4567, vmagic_bias);
2553
2554 __m128 vt0123 = _mm_add_ps(vz0123, _mm_mul_ps(vn0123, vminus_ln2_hi));
2555 __m128 vt4567 = _mm_add_ps(vz4567, _mm_mul_ps(vn4567, vminus_ln2_hi));
2556
2557 vt0123 = _mm_add_ps(vt0123, _mm_mul_ps(vn0123, vminus_ln2_lo));
2558 vt4567 = _mm_add_ps(vt4567, _mm_mul_ps(vn4567, vminus_ln2_lo));
2559
2560 __m128 vp0123 = _mm_mul_ps(vt0123, vc2);
2561 __m128 vp4567 = _mm_mul_ps(vt4567, vc2);
2562
2563 vp0123 = _mm_add_ps(vt0123, _mm_mul_ps(vp0123, vt0123));
2564 vp4567 = _mm_add_ps(vt4567, _mm_mul_ps(vp4567, vt4567));
2565
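    // exp(z) ~ vs * (1 + p) with p = t + c2 * t^2; sigmoid(z) = exp(z) / (exp(z) + 1).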
2566 const __m128 vy0123 = _mm_add_ps(vs0123, _mm_mul_ps(vs0123, vp0123));
2567 const __m128 vy4567 = _mm_add_ps(vs4567, _mm_mul_ps(vs4567, vp4567));
2568
2569 __m128 vf0123 = _mm_div_ps(vy0123, _mm_add_ps(vy0123, vone));
2570 __m128 vf4567 = _mm_div_ps(vy4567, _mm_add_ps(vy4567, vone));
2571
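    // Flush the result to zero where z falls below the cutoff, i.e. where exp(z)
    // would underflow and the sigmoid is effectively 0.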
2572 vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vz0123, vdenorm_cutoff), vf0123);
2573 vf4567 = _mm_andnot_ps(_mm_cmplt_ps(vz4567, vdenorm_cutoff), vf4567);
2574
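    // Reconstruct sigmoid(x) from sigmoid(-|x|): keep f where x is negative,
    // use 1 - f otherwise (the mask tests the sign bit via an integer compare).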
2575 const __m128 vm0123 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx0123)));
2576 const __m128 vm4567 = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx4567)));
2577
2578 vf0123 = _mm_or_ps(_mm_and_ps(vf0123, vm0123), _mm_andnot_ps(vm0123, _mm_sub_ps(vone, vf0123)));
2579 vf4567 = _mm_or_ps(_mm_and_ps(vf4567, vm4567), _mm_andnot_ps(vm4567, _mm_sub_ps(vone, vf4567)));
2580
2581 _mm_storeu_ps(y, vf0123);
2582 _mm_storeu_ps(y + 4, vf4567);
2583 y += 8;
2584 }
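  // Remainder loop: one 4-float vector at a time, same computation as above.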
2585 for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
2586 const __m128 vx = _mm_loadu_ps(x);
2587 x += 4;
2588
2589 const __m128 vz = _mm_or_ps(vx, vsign_mask);
2590
2591 __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
2592 const __m128i ve = _mm_slli_epi32(_mm_castps_si128(vn), 17);
2593
2594 const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
2595 #if XNN_ARCH_X86_64
2596 const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
2597 const uint64_t vidx_hi = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx, vidx));
2598 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
2599 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)));
2600 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))));
2601 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2602 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))));
2603 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2604 #else // !XNN_ARCH_X86_64
2605 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_cvtsi128_si32(vidx))));
2606 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 4))));
2607 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 2))));
2608 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2609 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 6))));
2610 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2611 #endif // XNN_ARCH_X86_64
2612 const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);
2613
2614 const __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ve));
2615 vn = _mm_sub_ps(vn, vmagic_bias);
2616
2617 __m128 vt = _mm_add_ps(vz, _mm_mul_ps(vn, vminus_ln2_hi));
2618 vt = _mm_add_ps(vt, _mm_mul_ps(vn, vminus_ln2_lo));
2619
2620 __m128 vp = _mm_mul_ps(vt, vc2);
2621 vp = _mm_add_ps(vt, _mm_mul_ps(vp, vt));
2622
2623 const __m128 vy = _mm_add_ps(vs, _mm_mul_ps(vs, vp));
2624
2625 __m128 vf = _mm_div_ps(vy, _mm_add_ps(vy, vone));
2626 vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
2627 const __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
2628 vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
2629
2630 _mm_storeu_ps(y, vf);
2631 y += 4;
2632 }
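  // Final 1-3 elements: compute a full vector (out-of-bounds reads are permitted
  // by XNN_OOB_READS) and store only the low 2 and/or 1 lanes.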
2633 if XNN_UNLIKELY(n != 0) {
2634 const __m128 vx = _mm_loadu_ps(x);
2635
2636 const __m128 vz = _mm_or_ps(vx, vsign_mask);
2637
2638 __m128 vn = _mm_add_ps(_mm_mul_ps(vz, vlog2e), vmagic_bias);
2639 const __m128i ve = _mm_slli_epi32(_mm_castps_si128(vn), 17);
2640
2641 const __m128i vidx = _mm_slli_epi32(_mm_and_si128(_mm_castps_si128(vn), vindex_mask), 2);
2642 #if XNN_ARCH_X86_64
2643 const uint64_t vidx_lo = (uint64_t) _mm_cvtsi128_si64(vidx);
2644 const uint64_t vidx_hi = (uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vidx, vidx));
2645 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_lo)));
2646 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) vidx_hi)));
2647 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_lo >> 32))));
2648 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2649 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) (vidx_hi >> 32))));
2650 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2651 #else // !XNN_ARCH_X86_64
2652 const __m128i vl_ll = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_cvtsi128_si32(vidx))));
2653 const __m128i vl_hl = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 4))));
2654 const __m128i vl_lh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 2))));
2655 const __m128i vl_lo = _mm_unpacklo_epi32(vl_ll, vl_lh);
2656 const __m128i vl_hh = _mm_cvtsi32_si128(*((const int*) ((uintptr_t) xnn_table_exp2minus_k_over_64 + (uint32_t) _mm_extract_epi16(vidx, 6))));
2657 const __m128i vl_hi = _mm_unpacklo_epi32(vl_hl, vl_hh);
2658 #endif // XNN_ARCH_X86_64
2659 const __m128i vl = _mm_unpacklo_epi64(vl_lo, vl_hi);
2660
2661 const __m128 vs = _mm_castsi128_ps(_mm_add_epi32(vl, ve));
2662 vn = _mm_sub_ps(vn, vmagic_bias);
2663
2664 __m128 vt = _mm_add_ps(vz, _mm_mul_ps(vn, vminus_ln2_hi));
2665 vt = _mm_add_ps(vt, _mm_mul_ps(vn, vminus_ln2_lo));
2666
2667 __m128 vp = _mm_mul_ps(vt, vc2);
2668 vp = _mm_add_ps(vt, _mm_mul_ps(vp, vt));
2669
2670 const __m128 vy = _mm_add_ps(vs, _mm_mul_ps(vs, vp));
2671
2672 __m128 vf = _mm_div_ps(vy, _mm_add_ps(vy, vone));
2673 vf = _mm_andnot_ps(_mm_cmplt_ps(vz, vdenorm_cutoff), vf);
2674 const __m128 vm = _mm_castsi128_ps(_mm_cmpgt_epi32(_mm_setzero_si128(), _mm_castps_si128(vx)));
2675 vf = _mm_or_ps(_mm_and_ps(vf, vm), _mm_andnot_ps(vm, _mm_sub_ps(vone, vf)));
2676
2677 if (n & (2 * sizeof(float))) {
2678 _mm_storel_pi((__m64*) y, vf);
2679 vf = _mm_movehl_ps(vf, vf);
2680 y += 2;
2681 }
2682 if (n & (1 * sizeof(float))) {
2683 _mm_store_ss(y, vf);
2684 }
2685 }
2686 }
2687
2688 void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16(
2689 size_t channels,
2690 size_t output_width,
2691 const int8_t** input,
2692 const void* weights,
2693 int8_t* output,
2694 size_t input_stride,
2695 size_t output_increment,
2696 size_t input_offset,
2697 const int8_t* zero,
2698 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2699 {
2700 assert(channels != 0);
2701 assert(output_width != 0);
2702
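  // One iteration per output pixel: fetch the 25 input-row pointers for this
  // pixel, offsetting each real row by input_offset while leaving the shared
  // `zero` padding row untouched.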
2703 do {
2704 const int8_t* i0 = input[0];
2705 assert(i0 != NULL);
2706 if XNN_UNPREDICTABLE(i0 != zero) {
2707 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
2708 }
2709 const int8_t* i1 = input[1];
2710 assert(i1 != NULL);
2711 if XNN_UNPREDICTABLE(i1 != zero) {
2712 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
2713 }
2714 const int8_t* i2 = input[2];
2715 assert(i2 != NULL);
2716 if XNN_UNPREDICTABLE(i2 != zero) {
2717 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
2718 }
2719 const int8_t* i3 = input[3];
2720 assert(i3 != NULL);
2721 if XNN_UNPREDICTABLE(i3 != zero) {
2722 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
2723 }
2724 const int8_t* i4 = input[4];
2725 assert(i4 != NULL);
2726 if XNN_UNPREDICTABLE(i4 != zero) {
2727 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
2728 }
2729 const int8_t* i5 = input[5];
2730 assert(i5 != NULL);
2731 if XNN_UNPREDICTABLE(i5 != zero) {
2732 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
2733 }
2734 const int8_t* i6 = input[6];
2735 assert(i6 != NULL);
2736 if XNN_UNPREDICTABLE(i6 != zero) {
2737 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
2738 }
2739 const int8_t* i7 = input[7];
2740 assert(i7 != NULL);
2741 if XNN_UNPREDICTABLE(i7 != zero) {
2742 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
2743 }
2744 const int8_t* i8 = input[8];
2745 assert(i8 != NULL);
2746 if XNN_UNPREDICTABLE(i8 != zero) {
2747 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
2748 }
2749 const int8_t* i9 = input[9];
2750 assert(i9 != NULL);
2751 if XNN_UNPREDICTABLE(i9 != zero) {
2752 i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
2753 }
2754 const int8_t* i10 = input[10];
2755 assert(i10 != NULL);
2756 if XNN_UNPREDICTABLE(i10 != zero) {
2757 i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
2758 }
2759 const int8_t* i11 = input[11];
2760 assert(i11 != NULL);
2761 if XNN_UNPREDICTABLE(i11 != zero) {
2762 i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
2763 }
2764 const int8_t* i12 = input[12];
2765 assert(i12 != NULL);
2766 if XNN_UNPREDICTABLE(i12 != zero) {
2767 i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
2768 }
2769 const int8_t* i13 = input[13];
2770 assert(i13 != NULL);
2771 if XNN_UNPREDICTABLE(i13 != zero) {
2772 i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
2773 }
2774 const int8_t* i14 = input[14];
2775 assert(i14 != NULL);
2776 if XNN_UNPREDICTABLE(i14 != zero) {
2777 i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
2778 }
2779 const int8_t* i15 = input[15];
2780 assert(i15 != NULL);
2781 if XNN_UNPREDICTABLE(i15 != zero) {
2782 i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
2783 }
2784 const int8_t* i16 = input[16];
2785 assert(i16 != NULL);
2786 if XNN_UNPREDICTABLE(i16 != zero) {
2787 i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
2788 }
2789 const int8_t* i17 = input[17];
2790 assert(i17 != NULL);
2791 if XNN_UNPREDICTABLE(i17 != zero) {
2792 i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
2793 }
2794 const int8_t* i18 = input[18];
2795 assert(i18 != NULL);
2796 if XNN_UNPREDICTABLE(i18 != zero) {
2797 i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
2798 }
2799 const int8_t* i19 = input[19];
2800 assert(i19 != NULL);
2801 if XNN_UNPREDICTABLE(i19 != zero) {
2802 i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
2803 }
2804 const int8_t* i20 = input[20];
2805 assert(i20 != NULL);
2806 if XNN_UNPREDICTABLE(i20 != zero) {
2807 i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
2808 }
2809 const int8_t* i21 = input[21];
2810 assert(i21 != NULL);
2811 if XNN_UNPREDICTABLE(i21 != zero) {
2812 i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
2813 }
2814 const int8_t* i22 = input[22];
2815 assert(i22 != NULL);
2816 if XNN_UNPREDICTABLE(i22 != zero) {
2817 i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
2818 }
2819 const int8_t* i23 = input[23];
2820 assert(i23 != NULL);
2821 if XNN_UNPREDICTABLE(i23 != zero) {
2822 i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
2823 }
2824 const int8_t* i24 = input[24];
2825 assert(i24 != NULL);
2826 if XNN_UNPREDICTABLE(i24 != zero) {
2827 i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
2828 }
2829 input = (const int8_t**) ((uintptr_t) input + input_stride);
2830
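    // Process 8 channels per iteration. The packed weights hold 8 int32 biases,
    // then 25 groups of 8 int8 filter taps, then 8 per-channel float scales.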
2831 size_t c = channels;
2832 const void* w = weights;
2833 for (; c >= 8; c -= 8) {
2834 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
2835 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
2836
2837
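      // Each tap: sign-extend 8 int8 inputs and weights to int16
      // (unpacklo_epi8 + srai_epi16), rebuild 32-bit products from the
      // mullo/mulhi halves, and accumulate into the two int32 vectors.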
2838 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
2839 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
2840 i0 += 8;
2841
2842 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
2843 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
2844
2845 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
2846 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
2847
2848 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
2849 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
2850
2851 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
2852 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
2853 i1 += 8;
2854
2855 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
2856 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
2857
2858 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
2859 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
2860
2861 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
2862 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
2863
2864 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
2865 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
2866 i2 += 8;
2867
2868 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
2869 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
2870
2871 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
2872 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
2873
2874 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
2875 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
2876
2877 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
2878 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
2879 i3 += 8;
2880
2881 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
2882 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
2883
2884 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
2885 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
2886
2887 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
2888 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
2889
2890 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
2891 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
2892 i4 += 8;
2893
2894 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
2895 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
2896
2897 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
2898 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
2899
2900 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
2901 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
2902
2903 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
2904 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
2905 i5 += 8;
2906
2907 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
2908 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
2909
2910 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
2911 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
2912
2913 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
2914 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
2915
2916 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
2917 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
2918 i6 += 8;
2919
2920 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
2921 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
2922
2923 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
2924 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
2925
2926 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
2927 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
2928
2929 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
2930 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
2931 i7 += 8;
2932
2933 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
2934 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
2935
2936 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
2937 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
2938
2939 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
2940 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
2941
2942 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
2943 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
2944 i8 += 8;
2945
2946 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
2947 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
2948
2949 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
2950 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
2951
2952 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
2953 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
2954
2955 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
2956 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
2957 i9 += 8;
2958
2959 const __m128i vxi9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi9x01234567, vi9x01234567), 8);
2960 const __m128i vxk9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk9x01234567, vk9x01234567), 8);
2961
2962 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
2963 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
2964
2965 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
2966 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
2967
2968 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
2969 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
2970 i10 += 8;
2971
2972 const __m128i vxi10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi10x01234567, vi10x01234567), 8);
2973 const __m128i vxk10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk10x01234567, vk10x01234567), 8);
2974
2975 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
2976 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
2977
2978 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
2979 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
2980
2981 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
2982 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
2983 i11 += 8;
2984
2985 const __m128i vxi11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi11x01234567, vi11x01234567), 8);
2986 const __m128i vxk11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk11x01234567, vk11x01234567), 8);
2987
2988 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
2989 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
2990
2991 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
2992 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
2993
2994 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
2995 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
2996 i12 += 8;
2997
2998 const __m128i vxi12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi12x01234567, vi12x01234567), 8);
2999 const __m128i vxk12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk12x01234567, vk12x01234567), 8);
3000
3001 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
3002 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
3003
3004 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
3005 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
3006
3007 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
3008 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
3009 i13 += 8;
3010
3011 const __m128i vxi13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi13x01234567, vi13x01234567), 8);
3012 const __m128i vxk13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk13x01234567, vk13x01234567), 8);
3013
3014 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
3015 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
3016
3017 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
3018 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
3019
3020 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
3021 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
3022 i14 += 8;
3023
3024 const __m128i vxi14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi14x01234567, vi14x01234567), 8);
3025 const __m128i vxk14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk14x01234567, vk14x01234567), 8);
3026
3027 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
3028 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
3029
3030 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
3031 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
3032
3033 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
3034 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
3035 i15 += 8;
3036
3037 const __m128i vxi15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi15x01234567, vi15x01234567), 8);
3038 const __m128i vxk15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk15x01234567, vk15x01234567), 8);
3039
3040 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
3041 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
3042
3043 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
3044 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
3045
3046 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
3047 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
3048 i16 += 8;
3049
3050 const __m128i vxi16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi16x01234567, vi16x01234567), 8);
3051 const __m128i vxk16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk16x01234567, vk16x01234567), 8);
3052
3053 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
3054 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
3055
3056 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
3057 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
3058
3059 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
3060 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
3061 i17 += 8;
3062
3063 const __m128i vxi17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi17x01234567, vi17x01234567), 8);
3064 const __m128i vxk17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk17x01234567, vk17x01234567), 8);
3065
3066 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
3067 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
3068
3069 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
3070 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
3071
3072 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
3073 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
3074 i18 += 8;
3075
3076 const __m128i vxi18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi18x01234567, vi18x01234567), 8);
3077 const __m128i vxk18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk18x01234567, vk18x01234567), 8);
3078
3079 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
3080 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
3081
3082 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
3083 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
3084
3085 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
3086 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
3087 i19 += 8;
3088
3089 const __m128i vxi19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi19x01234567, vi19x01234567), 8);
3090 const __m128i vxk19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk19x01234567, vk19x01234567), 8);
3091
3092 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
3093 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
3094
3095 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
3096 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
3097
3098 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
3099 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
3100 i20 += 8;
3101
3102 const __m128i vxi20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi20x01234567, vi20x01234567), 8);
3103 const __m128i vxk20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk20x01234567, vk20x01234567), 8);
3104
3105 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
3106 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
3107
3108 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
3109 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
3110
3111 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
3112 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
3113 i21 += 8;
3114
3115 const __m128i vxi21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi21x01234567, vi21x01234567), 8);
3116 const __m128i vxk21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk21x01234567, vk21x01234567), 8);
3117
3118 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
3119 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
3120
3121 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
3122 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
3123
3124 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
3125 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
3126 i22 += 8;
3127
3128 const __m128i vxi22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi22x01234567, vi22x01234567), 8);
3129 const __m128i vxk22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk22x01234567, vk22x01234567), 8);
3130
3131 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
3132 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
3133
3134 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
3135 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
3136
3137 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
3138 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
3139 i23 += 8;
3140
3141 const __m128i vxi23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi23x01234567, vi23x01234567), 8);
3142 const __m128i vxk23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk23x01234567, vk23x01234567), 8);
3143
3144 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
3145 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
3146
3147 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
3148 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
3149
3150 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
3151 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
3152 i24 += 8;
3153
3154 const __m128i vxi24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi24x01234567, vi24x01234567), 8);
3155 const __m128i vxk24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk24x01234567, vk24x01234567), 8);
3156
3157 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
3158 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
3159
3160 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
3161 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
3162
3163 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
3164
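      // Requantize: convert to float, apply the per-channel scales, clamp to the
      // output max, round back to int32, add the output zero point with
      // saturation, clamp to the output min, and pack down to int8.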
3165 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3166 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3167
3168 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
3169 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
3170 w = (const void*) ((const float*) w + 8);
3171 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
3172 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
3173
3174 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
3175 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3176 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3177
3178 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3179 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3180
3181 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
3182 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3183
3184 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
3185 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
3186
3187 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3188
3189
3190 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3191 output += 8;
3192 }
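    // Remaining 1-7 channels: same computation on a full 8-wide vector
    // (reads may run past the channel count, permitted by XNN_OOB_READS),
    // followed by partial stores of the valid lanes.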
3193 if XNN_UNLIKELY(c != 0) {
3194 {
3195 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3196 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3197
3198
3199 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3200 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3201
3202 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
3203 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
3204
3205 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3206 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
3207
3208 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
3209 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
3210
3211 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3212 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3213
3214 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
3215 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
3216
3217 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
3218 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
3219
3220 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
3221 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
3222
3223 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3224 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3225
3226 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
3227 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
3228
3229 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3230 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
3231
3232 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
3233 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
3234
3235 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3236 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3237
3238 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
3239 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
3240
3241 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
3242 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
3243
3244 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
3245 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
3246
3247 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3248 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3249
3250 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
3251 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
3252
3253 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3254 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
3255
3256 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
3257 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
3258
3259 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3260 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3261
3262 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
3263 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
3264
3265 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
3266 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
3267
3268 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
3269 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
3270
3271 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3272 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3273
3274 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
3275 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
3276
3277 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3278 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
3279
3280 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
3281 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
3282
3283 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3284 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3285
3286 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
3287 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
3288
3289 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
3290 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
3291
3292 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
3293 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
3294
3295 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3296 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3297
3298 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
3299 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
3300
3301 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3302 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
3303
3304 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
3305 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
3306
3307 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
3308 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
3309
3310 const __m128i vxi9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi9x01234567, vi9x01234567), 8);
3311 const __m128i vxk9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk9x01234567, vk9x01234567), 8);
3312
3313 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
3314 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
3315
3316 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
3317 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
3318
3319 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
3320 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
3321
3322 const __m128i vxi10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi10x01234567, vi10x01234567), 8);
3323 const __m128i vxk10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk10x01234567, vk10x01234567), 8);
3324
3325 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
3326 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
3327
3328 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
3329 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
3330
3331 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
3332 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
3333
3334 const __m128i vxi11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi11x01234567, vi11x01234567), 8);
3335 const __m128i vxk11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk11x01234567, vk11x01234567), 8);
3336
3337 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
3338 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
3339
3340 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
3341 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
3342
3343 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
3344 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
3345
3346 const __m128i vxi12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi12x01234567, vi12x01234567), 8);
3347 const __m128i vxk12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk12x01234567, vk12x01234567), 8);
3348
3349 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
3350 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
3351
3352 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
3353 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
3354
3355 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
3356 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
3357
3358 const __m128i vxi13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi13x01234567, vi13x01234567), 8);
3359 const __m128i vxk13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk13x01234567, vk13x01234567), 8);
3360
3361 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
3362 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
3363
3364 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
3365 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
3366
3367 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
3368 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
3369
3370 const __m128i vxi14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi14x01234567, vi14x01234567), 8);
3371 const __m128i vxk14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk14x01234567, vk14x01234567), 8);
3372
3373 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
3374 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
3375
3376 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
3377 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
3378
3379 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
3380 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
3381
3382 const __m128i vxi15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi15x01234567, vi15x01234567), 8);
3383 const __m128i vxk15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk15x01234567, vk15x01234567), 8);
3384
3385 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
3386 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
3387
3388 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
3389 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
3390
3391 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
3392 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
3393
3394 const __m128i vxi16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi16x01234567, vi16x01234567), 8);
3395 const __m128i vxk16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk16x01234567, vk16x01234567), 8);
3396
3397 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
3398 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
3399
3400 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
3401 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
3402
3403 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
3404 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
3405
3406 const __m128i vxi17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi17x01234567, vi17x01234567), 8);
3407 const __m128i vxk17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk17x01234567, vk17x01234567), 8);
3408
3409 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
3410 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
3411
3412 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
3413 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
3414
3415 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
3416 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
3417
3418 const __m128i vxi18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi18x01234567, vi18x01234567), 8);
3419 const __m128i vxk18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk18x01234567, vk18x01234567), 8);
3420
3421 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
3422 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
3423
3424 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
3425 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
3426
3427 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
3428 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
3429
3430 const __m128i vxi19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi19x01234567, vi19x01234567), 8);
3431 const __m128i vxk19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk19x01234567, vk19x01234567), 8);
3432
3433 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
3434 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
3435
3436 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
3437 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
3438
3439 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
3440 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
3441
3442 const __m128i vxi20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi20x01234567, vi20x01234567), 8);
3443 const __m128i vxk20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk20x01234567, vk20x01234567), 8);
3444
3445 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
3446 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
3447
3448 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
3449 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
3450
3451 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
3452 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
3453
3454 const __m128i vxi21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi21x01234567, vi21x01234567), 8);
3455 const __m128i vxk21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk21x01234567, vk21x01234567), 8);
3456
3457 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
3458 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
3459
3460 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
3461 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
3462
3463 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
3464 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
3465
3466 const __m128i vxi22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi22x01234567, vi22x01234567), 8);
3467 const __m128i vxk22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk22x01234567, vk22x01234567), 8);
3468
3469 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
3470 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
3471
3472 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
3473 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
3474
3475 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
3476 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
3477
3478 const __m128i vxi23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi23x01234567, vi23x01234567), 8);
3479 const __m128i vxk23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk23x01234567, vk23x01234567), 8);
3480
3481 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
3482 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
3483
3484 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
3485 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
3486
3487 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
3488 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
3489
3490 const __m128i vxi24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi24x01234567, vi24x01234567), 8);
3491 const __m128i vxk24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk24x01234567, vk24x01234567), 8);
3492
3493 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
3494 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
3495
3496 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
3497 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
3498
3499
3500 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3501 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3502
3503 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t)));
3504 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t) + 4 * sizeof(float)));
3505 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
3506 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
3507
3508 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
3509 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3510 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3511
3512 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3513 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3514
3515
3516 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
3517 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3518
3519 vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
3520
3521 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3522
3523
3524 if (c & 4) {
3525 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3526 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3527 output += 4;
3528 }
3529 if (c & 2) {
3530 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3531 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3532 output += 2;
3533 }
3534 if (c & 1) {
3535 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
3536 output += 1;
3537 }
3538 }
3539 }
3540
3541 output = (int8_t*) ((uintptr_t) output + output_increment);
3542 } while (--output_width != 0);
3543 }
3544
3545 void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__sse2_mul16(
3546 size_t channels,
3547 size_t output_width,
3548 const int8_t** input,
3549 const void* weights,
3550 int8_t* output,
3551 size_t input_stride,
3552 size_t output_increment,
3553 size_t input_offset,
3554 const int8_t* zero,
3555 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3556 {
3557 assert(channels != 0);
3558 assert(output_width != 0);
3559
3560 do {
3561 const int8_t* i0 = input[0];
3562 assert(i0 != NULL);
3563 if XNN_UNPREDICTABLE(i0 != zero) {
3564 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
3565 }
3566 const int8_t* i1 = input[1];
3567 assert(i1 != NULL);
3568 if XNN_UNPREDICTABLE(i1 != zero) {
3569 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
3570 }
3571 const int8_t* i2 = input[2];
3572 assert(i2 != NULL);
3573 if XNN_UNPREDICTABLE(i2 != zero) {
3574 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
3575 }
3576 input = (const int8_t**) ((uintptr_t) input + input_stride);
3577
3578 size_t c = channels;
3579 const void* w = weights;
3580 for (; c >= 8; c -= 8) {
3581 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3582 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3583
3584
3585 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3586 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3587 i0 += 8;
3588
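      // SSE2 has no int8 multiply, so each int8 lane is sign-extended to int16 by
      // unpacking the byte with itself and arithmetic-shifting right by 8; the
      // 16x16->32 products are then rebuilt from their low (mullo) and high (mulhi)
      // halves with 16-bit interleaves before being added to the int32 accumulators.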
3589 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
3590 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
3591
3592 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3593 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
3594
3595 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
3596 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
3597
3598 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3599 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3600 i1 += 8;
3601
3602 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
3603 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
3604
3605 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
3606 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
3607
3608 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
3609 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
3610
3611 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3612 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3613 i2 += 8;
3614
3615 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
3616 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
3617
3618 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3619 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
3620
3621 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
3622 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
3623
3624 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t));
3625
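      // fp32 requantization: convert the int32 accumulators to float, multiply by the
      // per-channel scales stored after the packed weights (QC8), clamp against the
      // output maximum, convert back to int32 (rounds to nearest under the default
      // rounding mode), add the output zero point, clamp the minimum, and pack to int8.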
3626 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3627 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3628
3629 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
3630 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
3631 w = (const void*) ((const float*) w + 8);
3632 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
3633 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
3634
3635 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
3636 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3637 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3638
3639 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3640 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3641
3642 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
3643 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3644
3645 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
3646 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
3647
3648 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3649
3650
3651 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3652 output += 8;
3653 }
3654 if XNN_UNLIKELY(c != 0) {
3655 {
3656 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3657 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3658
3659
3660 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3661 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3662
3663 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
3664 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
3665
3666 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3667 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
3668
3669 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
3670 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
3671
3672 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3673 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3674
3675 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
3676 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
3677
3678 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
3679 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
3680
3681 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
3682 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
3683
3684 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3685 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3686
3687 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
3688 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
3689
3690 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3691 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
3692
3693 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
3694 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
3695
3696
3697 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3698 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3699
3700 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3701 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t) + 4 * sizeof(float)));
3702 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
3703 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
3704
3705 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
3706 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3707 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3708
3709 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3710 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3711
3712
3713 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
3714 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3715
3716 vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
3717
3718 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3719
3720
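        // Store the 1-7 remaining channels with progressively narrower stores,
        // shifting already-written lanes out of the vector between steps.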
3721 if (c & 4) {
3722 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
3723 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
3724 output += 4;
3725 }
3726 if (c & 2) {
3727 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
3728 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
3729 output += 2;
3730 }
3731 if (c & 1) {
3732 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
3733 output += 1;
3734 }
3735 }
3736 }
3737
3738 output = (int8_t*) ((uintptr_t) output + output_increment);
3739 } while (--output_width != 0);
3740 }
3741
3742 void xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16(
3743 size_t channels,
3744 size_t output_width,
3745 const int8_t** input,
3746 const void* weights,
3747 int8_t* output,
3748 size_t input_stride,
3749 size_t output_increment,
3750 size_t input_offset,
3751 const int8_t* zero,
3752 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
3753 {
3754 assert(channels != 0);
3755 assert(output_width != 0);
3756
3757 do {
3758 const int8_t* i0 = input[0];
3759 assert(i0 != NULL);
3760 if XNN_UNPREDICTABLE(i0 != zero) {
3761 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
3762 }
3763 const int8_t* i1 = input[1];
3764 assert(i1 != NULL);
3765 if XNN_UNPREDICTABLE(i1 != zero) {
3766 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
3767 }
3768 const int8_t* i2 = input[2];
3769 assert(i2 != NULL);
3770 if XNN_UNPREDICTABLE(i2 != zero) {
3771 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
3772 }
3773 const int8_t* i3 = input[3];
3774 assert(i3 != NULL);
3775 if XNN_UNPREDICTABLE(i3 != zero) {
3776 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
3777 }
3778 const int8_t* i4 = input[4];
3779 assert(i4 != NULL);
3780 if XNN_UNPREDICTABLE(i4 != zero) {
3781 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
3782 }
3783 const int8_t* i5 = input[5];
3784 assert(i5 != NULL);
3785 if XNN_UNPREDICTABLE(i5 != zero) {
3786 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
3787 }
3788 const int8_t* i6 = input[6];
3789 assert(i6 != NULL);
3790 if XNN_UNPREDICTABLE(i6 != zero) {
3791 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
3792 }
3793 const int8_t* i7 = input[7];
3794 assert(i7 != NULL);
3795 if XNN_UNPREDICTABLE(i7 != zero) {
3796 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
3797 }
3798 const int8_t* i8 = input[8];
3799 assert(i8 != NULL);
3800 if XNN_UNPREDICTABLE(i8 != zero) {
3801 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
3802 }
3803 input = (const int8_t**) ((uintptr_t) input + input_stride);
3804
3805 size_t c = channels;
3806 const void* w = weights;
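    // Per 8-channel group the packed weights hold 8 int32 biases, then the 9 taps'
    // int8 weights (9 x 8 bytes), then 8 per-channel float scales.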
3807 for (; c >= 8; c -= 8) {
3808 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3809 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3810
3811
3812 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3813 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3814 i0 += 8;
3815
3816 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
3817 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
3818
3819 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3820 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
3821
3822 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
3823 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
3824
3825 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3826 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3827 i1 += 8;
3828
3829 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
3830 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
3831
3832 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
3833 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
3834
3835 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
3836 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
3837
3838 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3839 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3840 i2 += 8;
3841
3842 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
3843 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
3844
3845 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3846 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
3847
3848 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
3849 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
3850
3851 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
3852 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
3853 i3 += 8;
3854
3855 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
3856 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
3857
3858 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
3859 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
3860
3861 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
3862 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
3863
3864 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
3865 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
3866 i4 += 8;
3867
3868 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
3869 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
3870
3871 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
3872 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
3873
3874 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
3875 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
3876
3877 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
3878 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
3879 i5 += 8;
3880
3881 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
3882 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
3883
3884 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
3885 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
3886
3887 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
3888 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
3889
3890 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
3891 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
3892 i6 += 8;
3893
3894 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
3895 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
3896
3897 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
3898 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
3899
3900 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
3901 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
3902
3903 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
3904 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
3905 i7 += 8;
3906
3907 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
3908 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
3909
3910 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
3911 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
3912
3913 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
3914 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
3915
3916 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
3917 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
3918 i8 += 8;
3919
3920 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
3921 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
3922
3923 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
3924 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
3925
3926 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
3927 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
3928
3929 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
3930
3931 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
3932 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
3933
3934 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
3935 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
3936 w = (const void*) ((const float*) w + 8);
3937 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
3938 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
3939
3940 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
3941 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
3942 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
3943
3944 vacc0123 = _mm_cvtps_epi32(vscaled0123);
3945 vacc4567 = _mm_cvtps_epi32(vscaled4567);
3946
3947 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
3948 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
3949
3950 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
3951 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
3952
3953 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
3954
3955
3956 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
3957 output += 8;
3958 }
3959 if XNN_UNLIKELY(c != 0) {
3960 {
3961 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
3962 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
3963
3964
3965 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
3966 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
3967
3968 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
3969 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
3970
3971 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
3972 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
3973
3974 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
3975 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
3976
3977 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
3978 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
3979
3980 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
3981 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
3982
3983 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
3984 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
3985
3986 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
3987 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
3988
3989 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
3990 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
3991
3992 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
3993 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
3994
3995 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
3996 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
3997
3998 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
3999 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
4000
4001 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
4002 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
4003
4004 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
4005 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
4006
4007 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
4008 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
4009
4010 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
4011 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
4012
4013 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
4014 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
4015
4016 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
4017 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
4018
4019 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
4020 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
4021
4022 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
4023 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
4024
4025 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
4026 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
4027
4028 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
4029 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
4030
4031 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
4032 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
4033
4034 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
4035 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
4036
4037 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
4038 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
4039
4040 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
4041 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
4042
4043 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
4044 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
4045
4046 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
4047 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
4048
4049 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
4050 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
4051
4052 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
4053 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
4054
4055 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
4056 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
4057
4058 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
4059 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
4060
4061 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
4062 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
4063
4064 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
4065 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
4066
4067 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
4068 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
4069
4070 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
4071 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
4072
4073
4074 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
4075 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
4076
4077 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
4078 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t) + 4 * sizeof(float)));
4079 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
4080 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
4081
4082 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
4083 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
4084 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
4085
4086 vacc0123 = _mm_cvtps_epi32(vscaled0123);
4087 vacc4567 = _mm_cvtps_epi32(vscaled4567);
4088
4089
4090 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
4091 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
4092
4093 vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
4094
4095 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
4096
4097
4098 if (c & 4) {
4099 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
4100 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
4101 output += 4;
4102 }
4103 if (c & 2) {
4104 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
4105 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
4106 output += 2;
4107 }
4108 if (c & 1) {
4109 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
4110 output += 1;
4111 }
4112 }
4113 }
4114
4115 output = (int8_t*) ((uintptr_t) output + output_increment);
4116 } while (--output_width != 0);
4117 }
4118
4119 void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
4120 size_t mr,
4121 size_t nc,
4122 size_t kc,
4123 const int8_t* restrict a,
4124 size_t a_stride,
4125 const void* restrict w,
4126 int8_t* restrict c,
4127 size_t cm_stride,
4128 size_t cn_stride,
4129 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4130 {
4131 assert(mr != 0);
4132 assert(mr <= 1);
4133 assert(nc != 0);
4134 assert(kc != 0);
4135 assert(kc % sizeof(int8_t) == 0);
4136 assert(a != NULL);
4137 assert(w != NULL);
4138 assert(c != NULL);
4139
4140 kc = round_up_po2(kc, 8);
4141 const int8_t* a0 = a;
4142 int8_t* c0 = c;
4143
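  // Each pass over K produces 4 output channels. The packed weights interleave
  // 4 int32 biases, then 4 columns x 8 int8 values per 8-element K slice, and
  // 4 per-channel float scales after the last K slice.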
4144 do {
4145 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4146 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4147 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4148 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4149 w = (const int32_t*) w + 4;
4150
4151 size_t k = 0;
4152 while (k < kc) {
4153 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4154 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
4155 a0 += 8;
4156
4157 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4158 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
4159
4160 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4161 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4162 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
4163
4164 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4165 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4166 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
4167
4168 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4169 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4170 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
4171
4172 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4173
4174 w = (const void*) ((const int8_t*) w + 32);
4175 k += 8 * sizeof(int8_t);
4176 }
4177
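    // Each vacc0xN vector holds 4 partial sums for output column N (from _mm_madd_epi16);
    // the unpack/add sequence below transposes and reduces them so that lane n of
    // vacc0x0123 ends up with the complete dot product for column n.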
4178 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
4179 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
4180
4181 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
4182
4183 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4184
4185 const __m128 vscale0123 = _mm_load_ps((const float*) w);
4186 w = (const void*) ((const float*) w + 4);
4187 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
4188
4189 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
4190 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4191
4192 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4193
4194 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
4195 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
4196
4197 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
4198 vacc00x0123 = _mm_max_epi16(vacc00x0123, voutput_min);
4199
4200 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
4201
4202
4203 if (nc >= 4) {
4204 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4205
4206 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4207
4208 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4209
4210 nc -= 4;
4211 } else {
4212 if (nc & 2) {
4213 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4214 c0 += 2;
4215 vout = _mm_srli_epi32(vout, 16);
4216 }
4217 if (nc & 1) {
4218 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
4219 }
4220
4221 nc = 0;
4222 }
4223 } while (nc != 0);
4224 }
4225
4226 void xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
4227 size_t mr,
4228 size_t nc,
4229 size_t kc,
4230 const int8_t* restrict a,
4231 size_t a_stride,
4232 const void* restrict w,
4233 int8_t* restrict c,
4234 size_t cm_stride,
4235 size_t cn_stride,
4236 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4237 {
4238 assert(mr != 0);
4239 assert(mr <= 3);
4240 assert(nc != 0);
4241 assert(kc != 0);
4242 assert(kc % sizeof(int8_t) == 0);
4243 assert(a != NULL);
4244 assert(w != NULL);
4245 assert(c != NULL);
4246
4247 kc = round_up_po2(kc, 8);
4248 const int8_t* a0 = a;
4249 int8_t* c0 = c;
4250 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
4251 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4252 if XNN_UNPREDICTABLE(mr < 2) {
4253 a1 = a0;
4254 c1 = c0;
4255 }
4256 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
4257 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4258 if XNN_UNPREDICTABLE(mr <= 2) {
4259 a2 = a1;
4260 c2 = c1;
4261 }
4262
4263 do {
4264 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4265 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4266 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4267 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4268 __m128i vacc1x0 = vacc0x0;
4269 __m128i vacc1x1 = vacc0x1;
4270 __m128i vacc1x2 = vacc0x2;
4271 __m128i vacc1x3 = vacc0x3;
4272 __m128i vacc2x0 = vacc0x0;
4273 __m128i vacc2x1 = vacc0x1;
4274 __m128i vacc2x2 = vacc0x2;
4275 __m128i vacc2x3 = vacc0x3;
4276 w = (const int32_t*) w + 4;
4277
4278 size_t k = 0;
4279 while (k < kc) {
4280 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4281 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
4282 a0 += 8;
4283 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
4284 const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
4285 a1 += 8;
4286 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
4287 const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
4288 a2 += 8;
4289
4290 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4291 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
4292
4293 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4294 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
4295 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
4296 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4297 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
4298
4299 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4300 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
4301 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
4302 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4303 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
4304
4305 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4306 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
4307 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
4308 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4309 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
4310
4311 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4312 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
4313 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
4314
4315 w = (const void*) ((const int8_t*) w + 32);
4316 k += 8 * sizeof(int8_t);
4317 }
4318
4319 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
4320 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
4321 const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
4322 const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
4323 const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
4324 const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
4325
4326 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
4327 __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
4328 __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
4329
4330 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4331 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
4332 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
4333
4334 const __m128 vscale0123 = _mm_load_ps((const float*) w);
4335 w = (const void*) ((const float*) w + 4);
4336 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
4337 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
4338 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
4339
4340 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
4341 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4342 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
4343 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
4344
4345 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4346 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
4347 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
4348
4349 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
4350 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
4351 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
4352
4353 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
4354 vacc01x0123 = _mm_max_epi16(vacc01x0123, voutput_min);
4355 vacc22x0123 = _mm_max_epi16(vacc22x0123, voutput_min);
4356
4357 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
4358
4359
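    // vacc01x0123 packs rows 0 and 1, and vacc22x0123 duplicates row 2, so a single
    // pack yields row 0 in bytes 0-3, row 1 in bytes 4-7, and row 2 in bytes 8-11 of vout.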
4360 if (nc >= 4) {
4361 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4362 vout = _mm_srli_si128(vout, 4);
4363 unaligned_store_u32(c1, (uint32_t) _mm_cvtsi128_si32(vout));
4364 vout = _mm_srli_si128(vout, 4);
4365 unaligned_store_u32(c2, (uint32_t) _mm_cvtsi128_si32(vout));
4366
4367 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4368 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4369 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4370
4371 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
4372 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
4373 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
4374
4375 nc -= 4;
4376 } else {
4377 if (nc & 2) {
4378 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4379 c0 += 2;
4380 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
4381 c1 += 2;
4382 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
4383 c2 += 2;
4384 vout = _mm_srli_epi32(vout, 16);
4385 }
4386 if (nc & 1) {
4387 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
4388 *c1 = (int8_t) _mm_extract_epi16(vout, 2);
4389 *c2 = (int8_t) _mm_extract_epi16(vout, 4);
4390 }
4391
4392 nc = 0;
4393 }
4394 } while (nc != 0);
4395 }
4396
4397 void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
4398 size_t mr,
4399 size_t nc,
4400 size_t kc,
4401 size_t ks,
4402 const int8_t** restrict a,
4403 const void* restrict w,
4404 int8_t* restrict c,
4405 size_t cm_stride,
4406 size_t cn_stride,
4407 size_t a_offset,
4408 const int8_t* zero,
4409 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4410 {
4411 assert(mr != 0);
4412 assert(mr <= 1);
4413 assert(nc != 0);
4414 assert(kc != 0);
4415 assert(ks != 0);
4416 assert(ks % (1 * sizeof(void*)) == 0);
4417 assert(a_offset % sizeof(int8_t) == 0);
4418 assert(a != NULL);
4419 assert(w != NULL);
4420 assert(c != NULL);
4421
4422 kc = round_up_po2(kc, 8);
4423 int8_t* c0 = c;
4424
4425 do {
4426 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4427 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4428 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4429 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4430 w = (const int32_t*) w + 4;
4431
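    // p counts down ks bytes of indirection pointers, one pointer per iteration;
    // entries equal to `zero` select the zero-padding buffer and skip a_offset.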
4432 size_t p = ks;
4433 do {
4434 const int8_t* restrict a0 = a[0];
4435 if XNN_UNPREDICTABLE(a0 != zero) {
4436 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4437 }
4438 a += 1;
4439
4440 size_t k = 0;
4441 while (k < kc) {
4442 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4443 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
4444 a0 += 8;
4445
4446 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4447 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
4448
4449 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4450 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4451 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
4452
4453 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4454 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4455 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
4456
4457 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4458 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4459 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
4460
4461 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4462
4463 w = (const void*) ((const int8_t*) w + 32);
4464 k += 8 * sizeof(int8_t);
4465 }
4466 p -= 1 * sizeof(void*);
4467 } while (p != 0);
4468
4469 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
4470 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
4471
4472 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
4473
4474 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4475
4476 const __m128 vscale0123 = _mm_load_ps((const float*) w);
4477 w = (const void*) ((const float*) w + 4);
4478 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
4479
4480 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
4481 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4482
4483 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4484
4485 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
4486 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
4487
4488 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
4489 vacc00x0123 = _mm_max_epi16(vacc00x0123, voutput_min);
4490
4491 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
4492
4493
4494 if (nc >= 4) {
4495 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4496 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4497
4498 a = (const int8_t**restrict) ((uintptr_t) a - ks);
4499
4500 nc -= 4;
4501 } else {
4502 if (nc & 2) {
4503 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4504 c0 += 2;
4505 vout = _mm_srli_epi32(vout, 16);
4506 }
4507 if (nc & 1) {
4508 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
4509 }
4510
4511 nc = 0;
4512 }
4513 } while (nc != 0);
4514 }
4515
4516 void xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
4517 size_t mr,
4518 size_t nc,
4519 size_t kc,
4520 size_t ks,
4521 const int8_t** restrict a,
4522 const void* restrict w,
4523 int8_t* restrict c,
4524 size_t cm_stride,
4525 size_t cn_stride,
4526 size_t a_offset,
4527 const int8_t* zero,
4528 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4529 {
4530 assert(mr != 0);
4531 assert(mr <= 3);
4532 assert(nc != 0);
4533 assert(kc != 0);
4534 assert(ks != 0);
4535 assert(ks % (3 * sizeof(void*)) == 0);
4536 assert(a_offset % sizeof(int8_t) == 0);
4537 assert(a != NULL);
4538 assert(w != NULL);
4539 assert(c != NULL);
4540
4541 kc = round_up_po2(kc, 8);
4542 int8_t* c0 = c;
4543 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
4544 if XNN_UNPREDICTABLE(mr < 2) {
4545 c1 = c0;
4546 }
4547 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
4548 if XNN_UNPREDICTABLE(mr <= 2) {
4549 c2 = c1;
4550 }
4551
4552 do {
4553 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
4554 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
4555 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
4556 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
4557 __m128i vacc1x0 = vacc0x0;
4558 __m128i vacc1x1 = vacc0x1;
4559 __m128i vacc1x2 = vacc0x2;
4560 __m128i vacc1x3 = vacc0x3;
4561 __m128i vacc2x0 = vacc0x0;
4562 __m128i vacc2x1 = vacc0x1;
4563 __m128i vacc2x2 = vacc0x2;
4564 __m128i vacc2x3 = vacc0x3;
4565 w = (const int32_t*) w + 4;
4566
4567 size_t p = ks;
4568 do {
4569 const int8_t* restrict a0 = a[0];
4570 if XNN_UNPREDICTABLE(a0 != zero) {
4571 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
4572 }
4573 const int8_t* restrict a1 = a[1];
4574 if XNN_UNPREDICTABLE(a1 != zero) {
4575 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
4576 }
4577 const int8_t* restrict a2 = a[2];
4578 if XNN_UNPREDICTABLE(a2 != zero) {
4579 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
4580 }
4581 a += 3;
4582
4583 size_t k = 0;
4584 while (k < kc) {
4585 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
4586 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
4587 a0 += 8;
4588 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
4589 const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
4590 a1 += 8;
4591 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
4592 const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
4593 a2 += 8;
4594
4595 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
4596 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
4597
4598 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
4599 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
4600 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
4601 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
4602 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
4603
4604 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
4605 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
4606 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
4607 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
4608 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
4609
4610 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
4611 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
4612 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
4613 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
4614 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
4615
4616 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
4617 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
4618 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
4619
4620 w = (const void*) ((const int8_t*) w + 32);
4621 k += 8 * sizeof(int8_t);
4622 }
4623 p -= 3 * sizeof(void*);
4624 } while (p != 0);
4625
4626 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
4627 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
4628 const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
4629 const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
4630 const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
4631 const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
4632
4633 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
4634 __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
4635 __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
4636
4637 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
4638 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
4639 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
4640
4641 const __m128 vscale0123 = _mm_load_ps((const float*) w);
4642 w = (const void*) ((const float*) w + 4);
4643 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
4644 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
4645 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale0123);
4646
4647 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
4648 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
4649 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
4650 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
4651
4652 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
4653 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
4654 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
4655
4656 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
4657 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
4658 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
4659
4660 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
4661 vacc01x0123 = _mm_max_epi16(vacc01x0123, voutput_min);
4662 vacc22x0123 = _mm_max_epi16(vacc22x0123, voutput_min);
4663
4664 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
4665
4666
4667 if (nc >= 4) {
4668 unaligned_store_u32(c2, (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2))));
4669 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
4670 unaligned_store_u32(c1, (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1))));
4671 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
4672 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
4673 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
4674
4675 a = (const int8_t**restrict) ((uintptr_t) a - ks);
4676
4677 nc -= 4;
4678 } else {
4679 if (nc & 2) {
4680 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
4681 c2 += 2;
4682 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
4683 c1 += 2;
4684 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
4685 c0 += 2;
4686 vout = _mm_srli_epi32(vout, 16);
4687 }
4688 if (nc & 1) {
4689 *c2 = (int8_t) _mm_extract_epi16(vout, 4);
4690 *c1 = (int8_t) _mm_extract_epi16(vout, 2);
4691 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
4692 }
4693
4694 nc = 0;
4695 }
4696 } while (nc != 0);
4697 }
4698
4699 void xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16(
4700 size_t channels,
4701 size_t output_width,
4702 const int8_t** input,
4703 const void* weights,
4704 int8_t* output,
4705 size_t input_stride,
4706 size_t output_increment,
4707 size_t input_offset,
4708 const int8_t* zero,
4709 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4710 {
4711 assert(channels != 0);
4712 assert(output_width != 0);
4713
4714 do {
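    // Set up the 25 per-tap input row pointers for this output pixel. Rows that alias the zero
    // buffer keep pointing at it; all others are shifted by input_offset.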
4715 const int8_t* i0 = input[0];
4716 assert(i0 != NULL);
4717 if XNN_UNPREDICTABLE(i0 != zero) {
4718 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
4719 }
4720 const int8_t* i1 = input[1];
4721 assert(i1 != NULL);
4722 if XNN_UNPREDICTABLE(i1 != zero) {
4723 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
4724 }
4725 const int8_t* i2 = input[2];
4726 assert(i2 != NULL);
4727 if XNN_UNPREDICTABLE(i2 != zero) {
4728 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
4729 }
4730 const int8_t* i3 = input[3];
4731 assert(i3 != NULL);
4732 if XNN_UNPREDICTABLE(i3 != zero) {
4733 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
4734 }
4735 const int8_t* i4 = input[4];
4736 assert(i4 != NULL);
4737 if XNN_UNPREDICTABLE(i4 != zero) {
4738 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
4739 }
4740 const int8_t* i5 = input[5];
4741 assert(i5 != NULL);
4742 if XNN_UNPREDICTABLE(i5 != zero) {
4743 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
4744 }
4745 const int8_t* i6 = input[6];
4746 assert(i6 != NULL);
4747 if XNN_UNPREDICTABLE(i6 != zero) {
4748 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
4749 }
4750 const int8_t* i7 = input[7];
4751 assert(i7 != NULL);
4752 if XNN_UNPREDICTABLE(i7 != zero) {
4753 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
4754 }
4755 const int8_t* i8 = input[8];
4756 assert(i8 != NULL);
4757 if XNN_UNPREDICTABLE(i8 != zero) {
4758 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
4759 }
4760 const int8_t* i9 = input[9];
4761 assert(i9 != NULL);
4762 if XNN_UNPREDICTABLE(i9 != zero) {
4763 i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
4764 }
4765 const int8_t* i10 = input[10];
4766 assert(i10 != NULL);
4767 if XNN_UNPREDICTABLE(i10 != zero) {
4768 i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
4769 }
4770 const int8_t* i11 = input[11];
4771 assert(i11 != NULL);
4772 if XNN_UNPREDICTABLE(i11 != zero) {
4773 i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
4774 }
4775 const int8_t* i12 = input[12];
4776 assert(i12 != NULL);
4777 if XNN_UNPREDICTABLE(i12 != zero) {
4778 i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
4779 }
4780 const int8_t* i13 = input[13];
4781 assert(i13 != NULL);
4782 if XNN_UNPREDICTABLE(i13 != zero) {
4783 i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
4784 }
4785 const int8_t* i14 = input[14];
4786 assert(i14 != NULL);
4787 if XNN_UNPREDICTABLE(i14 != zero) {
4788 i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
4789 }
4790 const int8_t* i15 = input[15];
4791 assert(i15 != NULL);
4792 if XNN_UNPREDICTABLE(i15 != zero) {
4793 i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
4794 }
4795 const int8_t* i16 = input[16];
4796 assert(i16 != NULL);
4797 if XNN_UNPREDICTABLE(i16 != zero) {
4798 i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
4799 }
4800 const int8_t* i17 = input[17];
4801 assert(i17 != NULL);
4802 if XNN_UNPREDICTABLE(i17 != zero) {
4803 i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
4804 }
4805 const int8_t* i18 = input[18];
4806 assert(i18 != NULL);
4807 if XNN_UNPREDICTABLE(i18 != zero) {
4808 i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
4809 }
4810 const int8_t* i19 = input[19];
4811 assert(i19 != NULL);
4812 if XNN_UNPREDICTABLE(i19 != zero) {
4813 i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
4814 }
4815 const int8_t* i20 = input[20];
4816 assert(i20 != NULL);
4817 if XNN_UNPREDICTABLE(i20 != zero) {
4818 i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
4819 }
4820 const int8_t* i21 = input[21];
4821 assert(i21 != NULL);
4822 if XNN_UNPREDICTABLE(i21 != zero) {
4823 i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
4824 }
4825 const int8_t* i22 = input[22];
4826 assert(i22 != NULL);
4827 if XNN_UNPREDICTABLE(i22 != zero) {
4828 i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
4829 }
4830 const int8_t* i23 = input[23];
4831 assert(i23 != NULL);
4832 if XNN_UNPREDICTABLE(i23 != zero) {
4833 i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
4834 }
4835 const int8_t* i24 = input[24];
4836 assert(i24 != NULL);
4837 if XNN_UNPREDICTABLE(i24 != zero) {
4838 i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
4839 }
4840 input = (const int8_t**) ((uintptr_t) input + input_stride);
4841
4842 size_t c = channels;
4843 const void* w = weights;
4844 for (; c >= 8; c -= 8) {
4845 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
4846 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
4847
4848
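      // mul16+add16 accumulation: the products of two consecutive taps are computed in 16-bit lanes
      // and added together before being sign-extended (via a compare-generated sign vector) into the
      // 32-bit accumulators.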
4849 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
4850 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
4851 i0 += 8;
4852
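      // Sign-extend int8 to int16 without SSE4.1: unpack each byte against itself so it lands in the
      // high byte of a 16-bit lane, then arithmetic-shift right by 8.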
4853 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
4854 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
4855
4856 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
4857
4858
4859 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
4860 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
4861 i1 += 8;
4862
4863 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
4864 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
4865
4866 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
4867
4868 const __m128i vsignprod1x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
4869 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod1x01234567));
4870 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod1x01234567));
4871
4872 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
4873 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
4874 i2 += 8;
4875
4876 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
4877 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
4878
4879 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
4880
4881
4882 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
4883 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
4884 i3 += 8;
4885
4886 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
4887 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
4888
4889 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
4890
4891 const __m128i vsignprod3x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
4892 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod3x01234567));
4893 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod3x01234567));
4894
4895 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
4896 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
4897 i4 += 8;
4898
4899 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
4900 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
4901
4902 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
4903
4904
4905 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
4906 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
4907 i5 += 8;
4908
4909 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
4910 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
4911
4912 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
4913
4914 const __m128i vsignprod5x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
4915 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod5x01234567));
4916 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod5x01234567));
4917
4918 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
4919 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
4920 i6 += 8;
4921
4922 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
4923 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
4924
4925 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
4926
4927
4928 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
4929 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
4930 i7 += 8;
4931
4932 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
4933 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
4934
4935 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
4936
4937 const __m128i vsignprod7x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
4938 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod7x01234567));
4939 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod7x01234567));
4940
4941 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
4942 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
4943 i8 += 8;
4944
4945 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
4946 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
4947
4948 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
4949
4950
4951 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
4952 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
4953 i9 += 8;
4954
4955 const __m128i vxi9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi9x01234567, vi9x01234567), 8);
4956 const __m128i vxk9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk9x01234567, vk9x01234567), 8);
4957
4958 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
4959
4960 const __m128i vsignprod9x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
4961 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod9x01234567));
4962 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod9x01234567));
4963
4964 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
4965 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
4966 i10 += 8;
4967
4968 const __m128i vxi10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi10x01234567, vi10x01234567), 8);
4969 const __m128i vxk10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk10x01234567, vk10x01234567), 8);
4970
4971 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
4972
4973
4974 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
4975 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
4976 i11 += 8;
4977
4978 const __m128i vxi11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi11x01234567, vi11x01234567), 8);
4979 const __m128i vxk11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk11x01234567, vk11x01234567), 8);
4980
4981 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
4982
4983 const __m128i vsignprod11x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
4984 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod11x01234567));
4985 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod11x01234567));
4986
4987 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
4988 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
4989 i12 += 8;
4990
4991 const __m128i vxi12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi12x01234567, vi12x01234567), 8);
4992 const __m128i vxk12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk12x01234567, vk12x01234567), 8);
4993
4994 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
4995
4996
4997 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
4998 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
4999 i13 += 8;
5000
5001 const __m128i vxi13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi13x01234567, vi13x01234567), 8);
5002 const __m128i vxk13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk13x01234567, vk13x01234567), 8);
5003
5004 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
5005
5006 const __m128i vsignprod13x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5007 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod13x01234567));
5008 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod13x01234567));
5009
5010 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
5011 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
5012 i14 += 8;
5013
5014 const __m128i vxi14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi14x01234567, vi14x01234567), 8);
5015 const __m128i vxk14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk14x01234567, vk14x01234567), 8);
5016
5017 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
5018
5019
5020 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
5021 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
5022 i15 += 8;
5023
5024 const __m128i vxi15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi15x01234567, vi15x01234567), 8);
5025 const __m128i vxk15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk15x01234567, vk15x01234567), 8);
5026
5027 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
5028
5029 const __m128i vsignprod15x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5030 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod15x01234567));
5031 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod15x01234567));
5032
5033 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
5034 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
5035 i16 += 8;
5036
5037 const __m128i vxi16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi16x01234567, vi16x01234567), 8);
5038 const __m128i vxk16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk16x01234567, vk16x01234567), 8);
5039
5040 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
5041
5042
5043 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
5044 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
5045 i17 += 8;
5046
5047 const __m128i vxi17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi17x01234567, vi17x01234567), 8);
5048 const __m128i vxk17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk17x01234567, vk17x01234567), 8);
5049
5050 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
5051
5052 const __m128i vsignprod17x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5053 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod17x01234567));
5054 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod17x01234567));
5055
5056 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
5057 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
5058 i18 += 8;
5059
5060 const __m128i vxi18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi18x01234567, vi18x01234567), 8);
5061 const __m128i vxk18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk18x01234567, vk18x01234567), 8);
5062
5063 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
5064
5065
5066 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
5067 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
5068 i19 += 8;
5069
5070 const __m128i vxi19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi19x01234567, vi19x01234567), 8);
5071 const __m128i vxk19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk19x01234567, vk19x01234567), 8);
5072
5073 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
5074
5075 const __m128i vsignprod19x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5076 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod19x01234567));
5077 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod19x01234567));
5078
5079 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
5080 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
5081 i20 += 8;
5082
5083 const __m128i vxi20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi20x01234567, vi20x01234567), 8);
5084 const __m128i vxk20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk20x01234567, vk20x01234567), 8);
5085
5086 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
5087
5088
5089 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
5090 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
5091 i21 += 8;
5092
5093 const __m128i vxi21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi21x01234567, vi21x01234567), 8);
5094 const __m128i vxk21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk21x01234567, vk21x01234567), 8);
5095
5096 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
5097
5098 const __m128i vsignprod21x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5099 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod21x01234567));
5100 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod21x01234567));
5101
5102 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
5103 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
5104 i22 += 8;
5105
5106 const __m128i vxi22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi22x01234567, vi22x01234567), 8);
5107 const __m128i vxk22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk22x01234567, vk22x01234567), 8);
5108
5109 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
5110
5111
5112 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
5113 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
5114 i23 += 8;
5115
5116 const __m128i vxi23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi23x01234567, vi23x01234567), 8);
5117 const __m128i vxk23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk23x01234567, vk23x01234567), 8);
5118
5119 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
5120
5121 const __m128i vsignprod23x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5122 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod23x01234567));
5123 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod23x01234567));
5124
5125 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
5126 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
5127 i24 += 8;
5128
5129 const __m128i vxi24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi24x01234567, vi24x01234567), 8);
5130 const __m128i vxk24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk24x01234567, vk24x01234567), 8);
5131
5132 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
5133
5134 const __m128i vsignprod24x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5135 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod24x01234567));
5136 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod24x01234567));
5137
5138 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(int8_t));
5139
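      // Requantize the 8 accumulators: scale in fp32, clamp to the max, round back to int32,
      // add the zero point with saturation, clamp to the min, and pack to int8.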
5140 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5141 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5142
5143 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
5144 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
5145 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
5146
5147 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
5148 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5149 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5150
5151 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5152 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5153
5154 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
5155 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5156
5157 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
5158 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
5159
5160 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5161
5162
5163 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5164 output += 8;
5165 }
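    // Channel remainder (1-7): one more pass over all taps; the 8-lane loads may read past the valid
    // channels (permitted by XNN_OOB_READS) and only the remaining bytes are stored.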
5166 if XNN_UNLIKELY(c != 0) {
5167 {
5168 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5169 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5170
5171
5172 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5173 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
5174
5175 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
5176 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
5177
5178 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5179
5180
5181 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5182 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
5183
5184 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
5185 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
5186
5187 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5188
5189 const __m128i vsignprod1x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5190 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod1x01234567));
5191 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod1x01234567));
5192
5193 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5194 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
5195
5196 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
5197 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
5198
5199 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5200
5201
5202 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5203 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
5204
5205 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
5206 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
5207
5208 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
5209
5210 const __m128i vsignprod3x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5211 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod3x01234567));
5212 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod3x01234567));
5213
5214 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5215 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
5216
5217 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
5218 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
5219
5220 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
5221
5222
5223 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
5224 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
5225
5226 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
5227 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
5228
5229 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
5230
5231 const __m128i vsignprod5x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5232 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod5x01234567));
5233 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod5x01234567));
5234
5235 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5236 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
5237
5238 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
5239 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
5240
5241 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5242
5243
5244 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5245 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
5246
5247 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
5248 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
5249
5250 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
5251
5252 const __m128i vsignprod7x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5253 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod7x01234567));
5254 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod7x01234567));
5255
5256 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5257 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
5258
5259 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
5260 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
5261
5262 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5263
5264
5265 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
5266 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t)));
5267
5268 const __m128i vxi9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi9x01234567, vi9x01234567), 8);
5269 const __m128i vxk9x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk9x01234567, vk9x01234567), 8);
5270
5271 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
5272
5273 const __m128i vsignprod9x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5274 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod9x01234567));
5275 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod9x01234567));
5276
5277 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
5278 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(int8_t)));
5279
5280 const __m128i vxi10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi10x01234567, vi10x01234567), 8);
5281 const __m128i vxk10x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk10x01234567, vk10x01234567), 8);
5282
5283 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
5284
5285
5286 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
5287 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(int8_t)));
5288
5289 const __m128i vxi11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi11x01234567, vi11x01234567), 8);
5290 const __m128i vxk11x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk11x01234567, vk11x01234567), 8);
5291
5292 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
5293
5294 const __m128i vsignprod11x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5295 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod11x01234567));
5296 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod11x01234567));
5297
5298 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
5299 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(int8_t)));
5300
5301 const __m128i vxi12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi12x01234567, vi12x01234567), 8);
5302 const __m128i vxk12x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk12x01234567, vk12x01234567), 8);
5303
5304 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
5305
5306
5307 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
5308 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(int8_t)));
5309
5310 const __m128i vxi13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi13x01234567, vi13x01234567), 8);
5311 const __m128i vxk13x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk13x01234567, vk13x01234567), 8);
5312
5313 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
5314
5315 const __m128i vsignprod13x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5316 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod13x01234567));
5317 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod13x01234567));
5318
5319 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
5320 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(int8_t)));
5321
5322 const __m128i vxi14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi14x01234567, vi14x01234567), 8);
5323 const __m128i vxk14x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk14x01234567, vk14x01234567), 8);
5324
5325 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
5326
5327
5328 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
5329 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(int8_t)));
5330
5331 const __m128i vxi15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi15x01234567, vi15x01234567), 8);
5332 const __m128i vxk15x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk15x01234567, vk15x01234567), 8);
5333
5334 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
5335
5336 const __m128i vsignprod15x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5337 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod15x01234567));
5338 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod15x01234567));
5339
5340 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
5341 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(int8_t)));
5342
5343 const __m128i vxi16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi16x01234567, vi16x01234567), 8);
5344 const __m128i vxk16x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk16x01234567, vk16x01234567), 8);
5345
5346 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
5347
5348
5349 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
5350 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(int8_t)));
5351
5352 const __m128i vxi17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi17x01234567, vi17x01234567), 8);
5353 const __m128i vxk17x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk17x01234567, vk17x01234567), 8);
5354
5355 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
5356
5357 const __m128i vsignprod17x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5358 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod17x01234567));
5359 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod17x01234567));
5360
5361 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
5362 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(int8_t)));
5363
5364 const __m128i vxi18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi18x01234567, vi18x01234567), 8);
5365 const __m128i vxk18x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk18x01234567, vk18x01234567), 8);
5366
5367 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
5368
5369
5370 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
5371 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(int8_t)));
5372
5373 const __m128i vxi19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi19x01234567, vi19x01234567), 8);
5374 const __m128i vxk19x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk19x01234567, vk19x01234567), 8);
5375
5376 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
5377
5378 const __m128i vsignprod19x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5379 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod19x01234567));
5380 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod19x01234567));
5381
5382 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
5383 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(int8_t)));
5384
5385 const __m128i vxi20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi20x01234567, vi20x01234567), 8);
5386 const __m128i vxk20x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk20x01234567, vk20x01234567), 8);
5387
5388 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
5389
5390
5391 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
5392 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(int8_t)));
5393
5394 const __m128i vxi21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi21x01234567, vi21x01234567), 8);
5395 const __m128i vxk21x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk21x01234567, vk21x01234567), 8);
5396
5397 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
5398
5399 const __m128i vsignprod21x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5400 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod21x01234567));
5401 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod21x01234567));
5402
5403 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
5404 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(int8_t)));
5405
5406 const __m128i vxi22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi22x01234567, vi22x01234567), 8);
5407 const __m128i vxk22x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk22x01234567, vk22x01234567), 8);
5408
5409 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
5410
5411
5412 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
5413 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(int8_t)));
5414
5415 const __m128i vxi23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi23x01234567, vi23x01234567), 8);
5416 const __m128i vxk23x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk23x01234567, vk23x01234567), 8);
5417
5418 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
5419
5420 const __m128i vsignprod23x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5421 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod23x01234567));
5422 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod23x01234567));
5423
5424 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
5425 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(int8_t)));
5426
5427 const __m128i vxi24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi24x01234567, vi24x01234567), 8);
5428 const __m128i vxk24x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk24x01234567, vk24x01234567), 8);
5429
5430 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
5431
5432 const __m128i vsignprod24x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5433 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod24x01234567));
5434 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod24x01234567));
5435
5436
5437 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5438 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5439
5440 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
5441 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
5442 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
5443
5444 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
5445 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5446 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5447
5448 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5449 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5450
5451
5452 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
5453 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5454
5455 vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
5456
5457 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5458
5459
5460 if (c & 4) {
5461 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5462 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5463 output += 4;
5464 }
5465 if (c & 2) {
5466 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5467 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5468 output += 2;
5469 }
5470 if (c & 1) {
5471 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
5472 output += 1;
5473 }
5474 }
5475 }
5476
5477 output = (int8_t*) ((uintptr_t) output + output_increment);
5478 } while (--output_width != 0);
5479 }
5480
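// Same kernel structure as the up8x25 variant above, but with 9 taps per output (e.g. a 3x3 window).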
5481 void xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16(
5482 size_t channels,
5483 size_t output_width,
5484 const int8_t** input,
5485 const void* weights,
5486 int8_t* output,
5487 size_t input_stride,
5488 size_t output_increment,
5489 size_t input_offset,
5490 const int8_t* zero,
5491 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5492 {
5493 assert(channels != 0);
5494 assert(output_width != 0);
5495
5496 do {
5497 const int8_t* i0 = input[0];
5498 assert(i0 != NULL);
5499 if XNN_UNPREDICTABLE(i0 != zero) {
5500 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
5501 }
5502 const int8_t* i1 = input[1];
5503 assert(i1 != NULL);
5504 if XNN_UNPREDICTABLE(i1 != zero) {
5505 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
5506 }
5507 const int8_t* i2 = input[2];
5508 assert(i2 != NULL);
5509 if XNN_UNPREDICTABLE(i2 != zero) {
5510 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
5511 }
5512 const int8_t* i3 = input[3];
5513 assert(i3 != NULL);
5514 if XNN_UNPREDICTABLE(i3 != zero) {
5515 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
5516 }
5517 const int8_t* i4 = input[4];
5518 assert(i4 != NULL);
5519 if XNN_UNPREDICTABLE(i4 != zero) {
5520 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
5521 }
5522 const int8_t* i5 = input[5];
5523 assert(i5 != NULL);
5524 if XNN_UNPREDICTABLE(i5 != zero) {
5525 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
5526 }
5527 const int8_t* i6 = input[6];
5528 assert(i6 != NULL);
5529 if XNN_UNPREDICTABLE(i6 != zero) {
5530 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
5531 }
5532 const int8_t* i7 = input[7];
5533 assert(i7 != NULL);
5534 if XNN_UNPREDICTABLE(i7 != zero) {
5535 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
5536 }
5537 const int8_t* i8 = input[8];
5538 assert(i8 != NULL);
5539 if XNN_UNPREDICTABLE(i8 != zero) {
5540 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
5541 }
5542 input = (const int8_t**) ((uintptr_t) input + input_stride);
5543
5544 size_t c = channels;
5545 const void* w = weights;
5546 for (; c >= 8; c -= 8) {
5547 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5548 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5549
5550
5551 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5552 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
5553 i0 += 8;
5554
5555 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
5556 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
5557
5558 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5559
5560
5561 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5562 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
5563 i1 += 8;
5564
5565 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
5566 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
5567
5568 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5569
5570 const __m128i vsignprod1x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5571 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod1x01234567));
5572 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod1x01234567));
5573
5574 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5575 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
5576 i2 += 8;
5577
5578 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
5579 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
5580
5581 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5582
5583
5584 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5585 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
5586 i3 += 8;
5587
5588 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
5589 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
5590
5591 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
5592
5593 const __m128i vsignprod3x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5594 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod3x01234567));
5595 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod3x01234567));
5596
5597 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5598 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
5599 i4 += 8;
5600
5601 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
5602 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
5603
5604 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
5605
5606
5607 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
5608 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
5609 i5 += 8;
5610
5611 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
5612 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
5613
5614 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
5615
5616 const __m128i vsignprod5x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5617 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod5x01234567));
5618 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod5x01234567));
5619
5620 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5621 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
5622 i6 += 8;
5623
5624 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
5625 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
5626
5627 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5628
5629
5630 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5631 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
5632 i7 += 8;
5633
5634 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
5635 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
5636
5637 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
5638
5639 const __m128i vsignprod7x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5640 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod7x01234567));
5641 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod7x01234567));
5642
5643 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5644 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
5645 i8 += 8;
5646
5647 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
5648 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
5649
5650 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5651
5652 const __m128i vsignprod8x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5653 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod8x01234567));
5654 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod8x01234567));
5655
5656 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
5657
5658 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5659 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5660
5661 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
5662 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
5663 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
5664
5665 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
5666 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5667 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5668
5669 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5670 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5671
5672 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
5673 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5674
5675 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
5676 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
5677
5678 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5679
5680
5681 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5682 output += 8;
5683 }
5684 if XNN_UNLIKELY(c != 0) {
5685 {
5686 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5687 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5688
5689
5690 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5691 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
5692
5693 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
5694 const __m128i vxk0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk0x01234567, vk0x01234567), 8);
5695
5696 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5697
5698
5699 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5700 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
5701
5702 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
5703 const __m128i vxk1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk1x01234567, vk1x01234567), 8);
5704
5705 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5706
5707 const __m128i vsignprod1x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5708 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod1x01234567));
5709 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod1x01234567));
5710
5711 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5712 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
5713
5714 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
5715 const __m128i vxk2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk2x01234567, vk2x01234567), 8);
5716
5717 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5718
5719
5720 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5721 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
5722
5723 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
5724 const __m128i vxk3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk3x01234567, vk3x01234567), 8);
5725
5726 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
5727
5728 const __m128i vsignprod3x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5729 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod3x01234567));
5730 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod3x01234567));
5731
5732 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5733 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
5734
5735 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
5736 const __m128i vxk4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk4x01234567, vk4x01234567), 8);
5737
5738 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
5739
5740
5741 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
5742 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
5743
5744 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
5745 const __m128i vxk5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk5x01234567, vk5x01234567), 8);
5746
5747 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
5748
5749 const __m128i vsignprod5x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5750 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod5x01234567));
5751 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod5x01234567));
5752
5753 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5754 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
5755
5756 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
5757 const __m128i vxk6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk6x01234567, vk6x01234567), 8);
5758
5759 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5760
5761
5762 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5763 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
5764
5765 const __m128i vxi7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi7x01234567, vi7x01234567), 8);
5766 const __m128i vxk7x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk7x01234567, vk7x01234567), 8);
5767
5768 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
5769
5770 const __m128i vsignprod7x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5771 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod7x01234567));
5772 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod7x01234567));
5773
5774 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5775 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
5776
5777 const __m128i vxi8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi8x01234567, vi8x01234567), 8);
5778 const __m128i vxk8x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vk8x01234567, vk8x01234567), 8);
5779
5780 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5781
5782 const __m128i vsignprod8x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vprod01234567);
5783 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod01234567, vsignprod8x01234567));
5784 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod01234567, vsignprod8x01234567));
5785
5786
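      // Requantize the int32 accumulators: convert to float, multiply by the scale,
      // clamp to (output_max - zero_point), convert back to int32 with rounding, pack
      // to 16 bits and add the output zero point with a saturating add, clamp to
      // output_min, and finally pack down to int8.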
5787 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5788 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5789
5790 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
5791 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
5792 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
5793
5794 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
5795 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5796 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5797
5798 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5799 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5800
5801
5802 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
5803 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5804
5805 vout01234567 = _mm_max_epi16(vout01234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
5806
5807 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5808
5809
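      // Store the remaining 1-7 channels: write 4, then 2, then 1 byte as needed,
      // shifting the packed result right after each partial store.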
5810 if (c & 4) {
5811 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5812 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5813 output += 4;
5814 }
5815 if (c & 2) {
5816 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5817 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5818 output += 2;
5819 }
5820 if (c & 1) {
5821 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
5822 output += 1;
5823 }
5824 }
5825 }
5826
5827 output = (int8_t*) ((uintptr_t) output + output_increment);
5828 } while (--output_width != 0);
5829 }
5830
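// Converts QS8 (signed 8-bit quantized) inputs to F32, processing 32 elements per
// main-loop iteration with SSE2 integer unpacking and a magic-constant trick.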
5831 void xnn_qs8_f32_vcvt_ukernel__sse2_x32(
5832 size_t n,
5833 const int8_t* x,
5834 float* y,
5835 const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5836 {
5837 assert(n != 0);
5838 assert(n % sizeof(int8_t) == 0);
5839 assert(x != NULL);
5840 assert(y != NULL);
5841
5842 const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse2.sign_mask);
5843 const __m128i vmagic_exp = _mm_load_si128((const __m128i*) params->sse2.magic_exp);
5844 const __m128 vmagic_bias = _mm_load_ps(params->sse2.magic_bias);
5845 const __m128 vscale = _mm_load_ps(params->sse2.scale);
5846 const __m128i vzero = _mm_setzero_si128();
5847 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
5848 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) x);
5849 __m128i vx89ABCDEF = _mm_loadl_epi64((const __m128i*) (x + 8));
5850 __m128i vxGHIJKLMN = _mm_loadl_epi64((const __m128i*) (x + 16));
5851 __m128i vxOPQRSTUV = _mm_loadl_epi64((const __m128i*) (x + 24));
5852 x += 32;
5853
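    // Flip the sign bit so signed bytes become biased (unsigned) values, then
    // zero-extend them to 16 bits.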
5854 vx01234567 = _mm_xor_si128(vx01234567, vsign_mask);
5855 vx89ABCDEF = _mm_xor_si128(vx89ABCDEF, vsign_mask);
5856 vxGHIJKLMN = _mm_xor_si128(vxGHIJKLMN, vsign_mask);
5857 vxOPQRSTUV = _mm_xor_si128(vxOPQRSTUV, vsign_mask);
5858
5859 vx01234567 = _mm_unpacklo_epi8(vx01234567, vzero);
5860 vx89ABCDEF = _mm_unpacklo_epi8(vx89ABCDEF, vzero);
5861 vxGHIJKLMN = _mm_unpacklo_epi8(vxGHIJKLMN, vzero);
5862 vxOPQRSTUV = _mm_unpacklo_epi8(vxOPQRSTUV, vzero);
5863
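    // Interleave each 16-bit value with a magic exponent so the integer lands in the
    // mantissa of a float; subtracting the matching magic bias removes the exponent
    // constant together with the sign-flip offset and the input zero point (both
    // folded into magic_bias by the params setup), leaving the value ready to scale.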
5864 __m128 vy0123 = _mm_castsi128_ps(_mm_unpacklo_epi16(vx01234567, vmagic_exp));
5865 __m128 vy4567 = _mm_castsi128_ps(_mm_unpackhi_epi16(vx01234567, vmagic_exp));
5866 __m128 vy89AB = _mm_castsi128_ps(_mm_unpacklo_epi16(vx89ABCDEF, vmagic_exp));
5867 __m128 vyCDEF = _mm_castsi128_ps(_mm_unpackhi_epi16(vx89ABCDEF, vmagic_exp));
5868 __m128 vyGHIJ = _mm_castsi128_ps(_mm_unpacklo_epi16(vxGHIJKLMN, vmagic_exp));
5869 __m128 vyKLMN = _mm_castsi128_ps(_mm_unpackhi_epi16(vxGHIJKLMN, vmagic_exp));
5870 __m128 vyOPQR = _mm_castsi128_ps(_mm_unpacklo_epi16(vxOPQRSTUV, vmagic_exp));
5871 __m128 vySTUV = _mm_castsi128_ps(_mm_unpackhi_epi16(vxOPQRSTUV, vmagic_exp));
5872
5873 vy0123 = _mm_sub_ps(vy0123, vmagic_bias);
5874 vy4567 = _mm_sub_ps(vy4567, vmagic_bias);
5875 vy89AB = _mm_sub_ps(vy89AB, vmagic_bias);
5876 vyCDEF = _mm_sub_ps(vyCDEF, vmagic_bias);
5877 vyGHIJ = _mm_sub_ps(vyGHIJ, vmagic_bias);
5878 vyKLMN = _mm_sub_ps(vyKLMN, vmagic_bias);
5879 vyOPQR = _mm_sub_ps(vyOPQR, vmagic_bias);
5880 vySTUV = _mm_sub_ps(vySTUV, vmagic_bias);
5881
5882 vy0123 = _mm_mul_ps(vy0123, vscale);
5883 vy4567 = _mm_mul_ps(vy4567, vscale);
5884 vy89AB = _mm_mul_ps(vy89AB, vscale);
5885 vyCDEF = _mm_mul_ps(vyCDEF, vscale);
5886 vyGHIJ = _mm_mul_ps(vyGHIJ, vscale);
5887 vyKLMN = _mm_mul_ps(vyKLMN, vscale);
5888 vyOPQR = _mm_mul_ps(vyOPQR, vscale);
5889 vySTUV = _mm_mul_ps(vySTUV, vscale);
5890
5891 _mm_storeu_ps(y, vy0123);
5892 _mm_storeu_ps(y + 4, vy4567);
5893 _mm_storeu_ps(y + 8, vy89AB);
5894 _mm_storeu_ps(y + 12, vyCDEF);
5895 _mm_storeu_ps(y + 16, vyGHIJ);
5896 _mm_storeu_ps(y + 20, vyKLMN);
5897 _mm_storeu_ps(y + 24, vyOPQR);
5898 _mm_storeu_ps(y + 28, vySTUV);
5899 y += 32;
5900 }
5901 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
5902 __m128i vx = _mm_loadl_epi64((const __m128i*) x);
5903 vx = _mm_xor_si128(vx, vsign_mask);
5904 vx = _mm_unpacklo_epi8(vx, vzero);
5905 x += 8;
5906
5907 __m128 vy_lo = _mm_castsi128_ps(_mm_unpacklo_epi16(vx, vmagic_exp));
5908 __m128 vy_hi = _mm_castsi128_ps(_mm_unpackhi_epi16(vx, vmagic_exp));
5909
5910 vy_lo = _mm_sub_ps(vy_lo, vmagic_bias);
5911 vy_hi = _mm_sub_ps(vy_hi, vmagic_bias);
5912
5913 vy_lo = _mm_mul_ps(vy_lo, vscale);
5914 vy_hi = _mm_mul_ps(vy_hi, vscale);
5915
5916 _mm_storeu_ps(y, vy_lo);
5917 _mm_storeu_ps(y + 4, vy_hi);
5918 y += 8;
5919 }
5920 if XNN_UNLIKELY(n != 0) {
5921 assert(n >= 1 * sizeof(int8_t));
5922 assert(n <= 7 * sizeof(int8_t));
5923
5924 __m128i vx = _mm_loadl_epi64((const __m128i*) x);
5925 vx = _mm_xor_si128(vx, vsign_mask);
5926 vx = _mm_unpacklo_epi8(vx, vzero);
5927
5928 __m128 vy = _mm_castsi128_ps(_mm_unpacklo_epi16(vx, vmagic_exp));
5929 vy = _mm_sub_ps(vy, vmagic_bias);
5930 vy = _mm_mul_ps(vy, vscale);
5931
5932 if (n & (4 * sizeof(int8_t))) {
5933 _mm_storeu_ps(y, vy);
5934 vy = _mm_castsi128_ps(_mm_unpackhi_epi16(vx, vmagic_exp));
5935 vy = _mm_sub_ps(vy, vmagic_bias);
5936 vy = _mm_mul_ps(vy, vscale);
5937 y += 4;
5938 }
5939 if (n & (2 * sizeof(int8_t))) {
5940 _mm_storel_pi((__m64*) y, vy);
5941 vy = _mm_movehl_ps(vy, vy);
5942 y += 2;
5943 }
5944 if (n & (1 * sizeof(int8_t))) {
5945 _mm_store_ss(y, vy);
5946 }
5947 }
5948 }
5949
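// Global average pooling for QS8 with fp32 requantization, multi-pass variant for
// inputs taller than 7 rows: the first pass sums 7 rows per channel into the int32
// buffer (seeded with init_bias), middle passes add 7 more rows each, and the final
// pass of at most 7 rows finishes the sums, scales them (the averaging factor is
// expected to be folded into params->fp32_sse2.scale), and stores the int8 output.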
5950 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8(
5951 size_t rows,
5952 size_t channels,
5953 const int8_t* input,
5954 size_t input_stride,
5955 const int8_t* zero,
5956 int32_t* buffer,
5957 int8_t* output,
5958 const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5959 {
5960 assert(rows > 7);
5961 assert(channels != 0);
5962
5963 const int8_t* i0 = input;
5964 const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
5965 const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
5966 const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
5967 const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
5968 const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
5969 const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
5970 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
5971
5972 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
5973 int32_t* b = buffer;
5974 size_t c = channels;
5975 for (; c != 0; c = doz(c, 8)) {
5976
5977 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5978 i0 += 8;
5979
5980 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
5981 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5982 i1 += 8;
5983
5984 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
5985 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5986 i2 += 8;
5987
5988 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
5989 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
5990 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5991 i3 += 8;
5992
5993 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
5994 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
5995 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5996 i4 += 8;
5997
5998 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
5999 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
6000 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6001 i5 += 8;
6002
6003 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
6004 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
6005 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6006 i6 += 8;
6007
6008 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
6009 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
6010
6011 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
6012
6013 const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
6014 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
6015 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
6016
6017 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
6018 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
6019
6020 _mm_store_si128((__m128i*) b, vacc0123);
6021 _mm_store_si128((__m128i*) (b + 4), vacc4567);
6022 b += 8;
6023 }
6024
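  // Middle passes: accumulate up to 7 additional rows per pass into the int32 buffer.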
6025 for (rows -= 7; rows > 7; rows -= 7) {
6026 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
6027 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
6028 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
6029 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
6030 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
6031 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
6032 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
6033
6034 int32_t* b = buffer;
6035 size_t c = channels;
6036 for (; c != 0; c = doz(c, 8)) {
6037
6038 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6039 i0 += 8;
6040
6041 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
6042 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6043 i1 += 8;
6044
6045 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
6046 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6047 i2 += 8;
6048
6049 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
6050 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
6051 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6052 i3 += 8;
6053
6054 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
6055 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
6056 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6057 i4 += 8;
6058
6059 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
6060 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
6061 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6062 i5 += 8;
6063
6064 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
6065 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
6066 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6067 i6 += 8;
6068
6069 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
6070 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
6071
6072 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
6073
6074 const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
6075 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
6076 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
6077
6078 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
6079 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
6080
6081 _mm_store_si128((__m128i*) b, vacc0123);
6082 _mm_store_si128((__m128i*) (b + 4), vacc4567);
6083 b += 8;
6084 }
6085 }
6086
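  // Final pass: at most 7 rows remain; pointers for rows that do not exist are
  // redirected to the zero buffer so they contribute nothing to the sums.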
6087 i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
6088 i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
6089 if XNN_UNPREDICTABLE(rows < 2) {
6090 i1 = zero;
6091 }
6092 i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
6093 if XNN_UNPREDICTABLE(rows <= 2) {
6094 i2 = zero;
6095 }
6096 i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
6097 if XNN_UNPREDICTABLE(rows < 4) {
6098 i3 = zero;
6099 }
6100 i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
6101 if XNN_UNPREDICTABLE(rows <= 4) {
6102 i4 = zero;
6103 }
6104 i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
6105 if XNN_UNPREDICTABLE(rows < 6) {
6106 i5 = zero;
6107 }
6108 i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
6109 if XNN_UNPREDICTABLE(rows <= 6) {
6110 i6 = zero;
6111 }
6112
6113 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6114 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6115 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6116 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6117 for (; channels >= 8; channels -= 8) {
6118
6119 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6120 i0 += 8;
6121
6122 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
6123 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6124 i1 += 8;
6125
6126 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
6127 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6128 i2 += 8;
6129
6130 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
6131 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
6132 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6133 i3 += 8;
6134
6135 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
6136 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
6137 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6138 i4 += 8;
6139
6140 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
6141 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
6142 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6143 i5 += 8;
6144
6145 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
6146 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
6147 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6148 i6 += 8;
6149
6150 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
6151 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
6152
6153 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
6154
6155 const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
6156 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
6157 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
6158
6159 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
6160 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
6161 buffer += 8;
6162
6163 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
6164 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
6165
6166 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
6167 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
6168
6169 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
6170 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
6171
6172 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
6173 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
6174
6175 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6176
6177 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
6178
6179 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
6180
6181
6182 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6183 output += 8;
6184 }
6185 if XNN_UNLIKELY(channels != 0) {
6186 {
6187
6188 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6189 i0 += 8;
6190
6191 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6192 i1 += 8;
6193
6194 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
6195 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6196 i2 += 8;
6197
6198 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
6199 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6200 i3 += 8;
6201
6202 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
6203 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
6204 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6205 i4 += 8;
6206
6207 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
6208 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
6209 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6210 i5 += 8;
6211
6212 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
6213 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
6214 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6215 i6 += 8;
6216
6217 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
6218 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
6219
6220 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
6221 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
6222
6223 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
6224
6225 const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
6226 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
6227 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
6228
6229 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
6230 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
6231 buffer += 8;
6232
6233 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
6234 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
6235
6236 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
6237 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
6238
6239 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
6240 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
6241
6242 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
6243 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
6244
6245 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6246 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
6247
6248 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
6249
6250 if (channels & 4) {
6251 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6252 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6253 output += 4;
6254 }
6255 uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
6256 if (channels & 2) {
6257 unaligned_store_u16(output, (uint16_t) vout0123);
6258 vout0123 >>= 16;
6259 output += 2;
6260 }
6261 if (channels & 1) {
6262 *output = (int8_t) vout0123;
6263 }
6264 }
6265 }
6266 }
6267
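// Single-pass variant of the global average pooling kernel above, for inputs with at
// most 7 rows; missing rows are again read from the zero buffer.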
6268 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8(
6269 size_t rows,
6270 size_t channels,
6271 const int8_t* input,
6272 size_t input_stride,
6273 const int8_t* zero,
6274 int8_t* output,
6275 const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6276 {
6277 assert(rows != 0);
6278 assert(rows <= 7);
6279 assert(channels != 0);
6280
6281 const int8_t* i0 = input;
6282 const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
6283 if XNN_UNPREDICTABLE(rows < 2) {
6284 i1 = zero;
6285 }
6286 const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
6287 if XNN_UNPREDICTABLE(rows <= 2) {
6288 i2 = zero;
6289 }
6290 const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
6291 if XNN_UNPREDICTABLE(rows < 4) {
6292 i3 = zero;
6293 }
6294 const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
6295 if XNN_UNPREDICTABLE(rows <= 4) {
6296 i4 = zero;
6297 }
6298 const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
6299 if XNN_UNPREDICTABLE(rows < 6) {
6300 i5 = zero;
6301 }
6302 const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
6303 if XNN_UNPREDICTABLE(rows <= 6) {
6304 i6 = zero;
6305 }
6306
6307 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
6308 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6309 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6310 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6311 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6312 for (; channels >= 8; channels -= 8) {
6313
6314 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6315 i0 += 8;
6316
6317 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
6318 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6319 i1 += 8;
6320
6321 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
6322 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6323 i2 += 8;
6324
6325 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
6326 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
6327 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6328 i3 += 8;
6329
6330 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
6331 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
6332 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6333 i4 += 8;
6334
6335 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
6336 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
6337 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6338 i5 += 8;
6339
6340 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
6341 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
6342 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6343 i6 += 8;
6344
6345 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
6346 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
6347
6348 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
6349
6350 const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
6351 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
6352 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
6353
6354 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
6355 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
6356
6357 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
6358 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
6359
6360 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
6361 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
6362
6363 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
6364 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
6365
6366 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
6367 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
6368
6369 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6370
6371 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
6372
6373 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
6374
6375
6376 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6377 output += 8;
6378 }
6379 if XNN_UNLIKELY(channels != 0) {
6380 {
6381
6382 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6383 i0 += 8;
6384
6385 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6386 i1 += 8;
6387
6388 const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
6389 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6390 i2 += 8;
6391
6392 const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
6393 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6394 i3 += 8;
6395
6396 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
6397 const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
6398 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6399 i4 += 8;
6400
6401 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
6402 const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
6403 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6404 i5 += 8;
6405
6406 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
6407 const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
6408 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6409 i6 += 8;
6410
6411 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
6412 const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
6413
6414 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
6415 const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
6416
6417 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
6418
6419 const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
6420 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
6421 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
6422
6423 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
6424 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
6425
6426 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
6427 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
6428
6429 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
6430 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
6431
6432 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
6433 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
6434
6435 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
6436 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
6437
6438 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6439 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
6440
6441 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
6442
6443 if (channels & 4) {
6444 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6445 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6446 output += 4;
6447 }
6448 uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
6449 if (channels & 2) {
6450 unaligned_store_u16(output, (uint16_t) vout0123);
6451 vout0123 >>= 16;
6452 output += 2;
6453 }
6454 if (channels & 1) {
6455 *output = (int8_t) vout0123;
6456 }
6457 }
6458 }
6459 }
6460
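// QS8 GEMM microkernel: 1 row of A by 4 columns of packed B, consuming 8 elements of
// K per step (the "1x4c8" / "ld64" layout), with fp32 requantization of the results.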
6461 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
6462 size_t mr,
6463 size_t nc,
6464 size_t kc,
6465 const int8_t* restrict a,
6466 size_t a_stride,
6467 const void* restrict w,
6468 int8_t* restrict c,
6469 size_t cm_stride,
6470 size_t cn_stride,
6471 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6472 {
6473 assert(mr != 0);
6474 assert(mr <= 1);
6475 assert(nc != 0);
6476 assert(kc != 0);
6477 assert(kc % sizeof(int8_t) == 0);
6478 assert(a != NULL);
6479 assert(w != NULL);
6480 assert(c != NULL);
6481
6482 kc = round_up_po2(kc, 8);
6483 const int8_t* a0 = a;
6484 int8_t* c0 = c;
6485
6486 do {
6487 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6488 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6489 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6490 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6491 w = (const int32_t*) w + 4;
6492
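    // Each step consumes 8 int8 values per operand (kc is rounded up to 8): bytes are
    // sign-extended to 16 bits via unpack + arithmetic shift, and _mm_madd_epi16
    // accumulates 8-element dot products into four 32-bit partial sums per column.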
6493 size_t k = 0;
6494 while (k < kc) {
6495 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6496 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
6497 a0 += 8;
6498
6499 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
6500 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
6501
6502 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6503 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
6504 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
6505
6506 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6507 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
6508 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
6509
6510 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6511 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
6512 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
6513
6514 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6515
6516 w = (const void*) ((const int8_t*) w + 32);
6517 k += 8 * sizeof(int8_t);
6518 }
6519
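    // Pairwise horizontal additions reduce the four per-column accumulators to a
    // single vector holding one 32-bit sum per output column.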
6520 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
6521 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
6522
6523 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
6524
6525 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6526
6527 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6528 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
6529
6530 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6531 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6532
6533 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6534
6535 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6536 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
6537
6538 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6539 vacc00x0123 = _mm_max_epi16(vacc00x0123, voutput_min);
6540
6541 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
6542
6543
6544 if (nc >= 4) {
6545 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6546
6547 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6548
6549 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
6550
6551 nc -= 4;
6552 } else {
6553 if (nc & 2) {
6554 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6555 c0 += 2;
6556 vout = _mm_srli_epi32(vout, 16);
6557 }
6558 if (nc & 1) {
6559 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
6560 }
6561
6562 nc = 0;
6563 }
6564 } while (nc != 0);
6565 }
6566
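// 3-row variant of the QS8 GEMM microkernel above: the same c8 dot-product and fp32
// requantization scheme applied to 3 rows of A per pass.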
6567 void xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
6568 size_t mr,
6569 size_t nc,
6570 size_t kc,
6571 const int8_t* restrict a,
6572 size_t a_stride,
6573 const void* restrict w,
6574 int8_t* restrict c,
6575 size_t cm_stride,
6576 size_t cn_stride,
6577 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6578 {
6579 assert(mr != 0);
6580 assert(mr <= 3);
6581 assert(nc != 0);
6582 assert(kc != 0);
6583 assert(kc % sizeof(int8_t) == 0);
6584 assert(a != NULL);
6585 assert(w != NULL);
6586 assert(c != NULL);
6587
6588 kc = round_up_po2(kc, 8);
6589 const int8_t* a0 = a;
6590 int8_t* c0 = c;
6591 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
6592 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
6593 if XNN_UNPREDICTABLE(mr < 2) {
6594 a1 = a0;
6595 c1 = c0;
6596 }
6597 const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
6598 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
6599 if XNN_UNPREDICTABLE(mr <= 2) {
6600 a2 = a1;
6601 c2 = c1;
6602 }
6603
6604 do {
6605 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6606 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6607 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6608 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6609 __m128i vacc1x0 = vacc0x0;
6610 __m128i vacc1x1 = vacc0x1;
6611 __m128i vacc1x2 = vacc0x2;
6612 __m128i vacc1x3 = vacc0x3;
6613 __m128i vacc2x0 = vacc0x0;
6614 __m128i vacc2x1 = vacc0x1;
6615 __m128i vacc2x2 = vacc0x2;
6616 __m128i vacc2x3 = vacc0x3;
6617 w = (const int32_t*) w + 4;
6618
6619 size_t k = 0;
6620 while (k < kc) {
6621 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6622 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
6623 a0 += 8;
6624 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
6625 const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
6626 a1 += 8;
6627 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
6628 const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
6629 a2 += 8;
6630
6631 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
6632 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
6633
6634 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6635 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
6636 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
6637 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
6638 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
6639
6640 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6641 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
6642 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
6643 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
6644 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
6645
6646 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6647 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
6648 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
6649 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
6650 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
6651
6652 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6653 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
6654 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
6655
6656 w = (const void*) ((const int8_t*) w + 32);
6657 k += 8 * sizeof(int8_t);
6658 }
6659
6660 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
6661 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
6662 const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
6663 const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
6664 const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
6665 const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
6666
6667 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
6668 __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
6669 __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
6670
6671 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6672 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
6673 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
6674
6675 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6676 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
6677 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
6678 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
6679
6680 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6681 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6682 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
6683 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
6684
6685 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6686 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
6687 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
6688
6689 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6690 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
6691 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
6692
6693 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6694 vacc01x0123 = _mm_max_epi16(vacc01x0123, voutput_min);
6695 vacc22x0123 = _mm_max_epi16(vacc22x0123, voutput_min);
6696
6697 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
6698
6699
6700 if (nc >= 4) {
6701 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6702 vout = _mm_srli_si128(vout, 4);
6703 unaligned_store_u32(c1, (uint32_t) _mm_cvtsi128_si32(vout));
6704 vout = _mm_srli_si128(vout, 4);
6705 unaligned_store_u32(c2, (uint32_t) _mm_cvtsi128_si32(vout));
6706
6707 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6708 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
6709 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
6710
6711 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
6712 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
6713 a2 = (const int8_t*) ((uintptr_t) a2 - kc);
6714
6715 nc -= 4;
6716 } else {
6717 if (nc & 2) {
6718 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6719 c0 += 2;
6720 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
6721 c1 += 2;
6722 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
6723 c2 += 2;
6724 vout = _mm_srli_epi32(vout, 16);
6725 }
6726 if (nc & 1) {
6727 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
6728 *c1 = (int8_t) _mm_extract_epi16(vout, 2);
6729 *c2 = (int8_t) _mm_extract_epi16(vout, 4);
6730 }
6731
6732 nc = 0;
6733 }
6734 } while (nc != 0);
6735 }
6736
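// QS8 IGEMM (indirect GEMM) microkernel, 1x4c8: rows of A are fetched through an
// indirection buffer of ks pointers, with entries equal to `zero` left unadjusted by
// a_offset; the inner computation matches the direct GEMM kernel above.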
6737 void xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
6738 size_t mr,
6739 size_t nc,
6740 size_t kc,
6741 size_t ks,
6742 const int8_t** restrict a,
6743 const void* restrict w,
6744 int8_t* restrict c,
6745 size_t cm_stride,
6746 size_t cn_stride,
6747 size_t a_offset,
6748 const int8_t* zero,
6749 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6750 {
6751 assert(mr != 0);
6752 assert(mr <= 1);
6753 assert(nc != 0);
6754 assert(kc != 0);
6755 assert(ks != 0);
6756 assert(ks % (1 * sizeof(void*)) == 0);
6757 assert(a_offset % sizeof(int8_t) == 0);
6758 assert(a != NULL);
6759 assert(w != NULL);
6760 assert(c != NULL);
6761
6762 kc = round_up_po2(kc, 8);
6763 int8_t* c0 = c;
6764
6765 do {
6766 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6767 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6768 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6769 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6770 w = (const int32_t*) w + 4;
6771
6772 size_t p = ks;
6773 do {
6774 const int8_t* restrict a0 = a[0];
6775 if XNN_UNPREDICTABLE(a0 != zero) {
6776 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
6777 }
6778 a += 1;
6779
6780 size_t k = 0;
6781 while (k < kc) {
6782 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6783 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
6784 a0 += 8;
6785
6786 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
6787 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
6788
6789 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6790 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
6791 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
6792
6793 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6794 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
6795 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
6796
6797 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6798 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
6799 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
6800
6801 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6802
6803 w = (const void*) ((const int8_t*) w + 32);
6804 k += 8 * sizeof(int8_t);
6805 }
6806 p -= 1 * sizeof(void*);
6807 } while (p != 0);
6808
6809 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
6810 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
6811
6812 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
6813
6814 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6815
6816 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6817 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
6818
6819 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6820 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6821
6822 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6823
6824 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6825 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
6826
6827 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6828 vacc00x0123 = _mm_max_epi16(vacc00x0123, voutput_min);
6829
6830 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
6831
6832
6833 if (nc >= 4) {
6834 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6835 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6836
6837 a = (const int8_t**restrict) ((uintptr_t) a - ks);
6838
6839 nc -= 4;
6840 } else {
6841 if (nc & 2) {
6842 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6843 c0 += 2;
6844 vout = _mm_srli_epi32(vout, 16);
6845 }
6846 if (nc & 1) {
6847 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
6848 }
6849
6850 nc = 0;
6851 }
6852 } while (nc != 0);
6853 }
6854
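// 3-row variant of the QS8 IGEMM microkernel above.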
6855 void xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
6856 size_t mr,
6857 size_t nc,
6858 size_t kc,
6859 size_t ks,
6860 const int8_t** restrict a,
6861 const void* restrict w,
6862 int8_t* restrict c,
6863 size_t cm_stride,
6864 size_t cn_stride,
6865 size_t a_offset,
6866 const int8_t* zero,
6867 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6868 {
6869 assert(mr != 0);
6870 assert(mr <= 3);
6871 assert(nc != 0);
6872 assert(kc != 0);
6873 assert(ks != 0);
6874 assert(ks % (3 * sizeof(void*)) == 0);
6875 assert(a_offset % sizeof(int8_t) == 0);
6876 assert(a != NULL);
6877 assert(w != NULL);
6878 assert(c != NULL);
6879
6880 kc = round_up_po2(kc, 8);
6881 int8_t* c0 = c;
6882 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
6883 if XNN_UNPREDICTABLE(mr < 2) {
6884 c1 = c0;
6885 }
6886 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
6887 if XNN_UNPREDICTABLE(mr <= 2) {
6888 c2 = c1;
6889 }
6890
6891 do {
6892 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6893 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6894 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6895 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6896 __m128i vacc1x0 = vacc0x0;
6897 __m128i vacc1x1 = vacc0x1;
6898 __m128i vacc1x2 = vacc0x2;
6899 __m128i vacc1x3 = vacc0x3;
6900 __m128i vacc2x0 = vacc0x0;
6901 __m128i vacc2x1 = vacc0x1;
6902 __m128i vacc2x2 = vacc0x2;
6903 __m128i vacc2x3 = vacc0x3;
6904 w = (const int32_t*) w + 4;
6905
6906 size_t p = ks;
6907 do {
6908 const int8_t* restrict a0 = a[0];
6909 if XNN_UNPREDICTABLE(a0 != zero) {
6910 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
6911 }
6912 const int8_t* restrict a1 = a[1];
6913 if XNN_UNPREDICTABLE(a1 != zero) {
6914 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
6915 }
6916 const int8_t* restrict a2 = a[2];
6917 if XNN_UNPREDICTABLE(a2 != zero) {
6918 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
6919 }
6920 a += 3;
6921
6922 size_t k = 0;
6923 while (k < kc) {
6924 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6925 const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
6926 a0 += 8;
6927 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
6928 const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
6929 a1 += 8;
6930 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
6931 const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
6932 a2 += 8;
6933
6934 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
6935 const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
6936
6937 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6938 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
6939 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
6940 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 8));
6941 const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
6942
6943 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6944 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
6945 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
6946 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 16));
6947 const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
6948
6949 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6950 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
6951 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
6952 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + 24));
6953 const __m128i vxb3 = _mm_srai_epi16(_mm_unpacklo_epi8(vb3, vb3), 8);
6954
6955 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6956 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
6957 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
6958
6959 w = (const void*) ((const int8_t*) w + 32);
6960 k += 8 * sizeof(int8_t);
6961 }
6962 p -= 3 * sizeof(void*);
6963 } while (p != 0);
6964
6965 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
6966 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
6967 const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
6968 const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
6969 const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
6970 const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
6971
6972 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
6973 __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
6974 __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
6975
6976 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6977 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
6978 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
6979
6980 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
6981 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
6982 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
6983 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
6984
6985 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
6986 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6987 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
6988 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
6989
6990 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6991 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
6992 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
6993
6994 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
6995 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
6996 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
6997
6998 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
6999 vacc01x0123 = _mm_max_epi16(vacc01x0123, voutput_min);
7000 vacc22x0123 = _mm_max_epi16(vacc22x0123, voutput_min);
7001
7002 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
7003
7004
7005 if (nc >= 4) {
7006 unaligned_store_u32(c2, (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2))));
7007 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
7008 unaligned_store_u32(c1, (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1))));
7009 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
7010 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
7011 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
7012
7013 a = (const int8_t**restrict) ((uintptr_t) a - ks);
7014
7015 nc -= 4;
7016 } else {
7017 if (nc & 2) {
7018 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
7019 c2 += 2;
7020 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
7021 c1 += 2;
7022 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
7023 c0 += 2;
7024 vout = _mm_srli_epi32(vout, 16);
7025 }
7026 if (nc & 1) {
7027 *c2 = (int8_t) _mm_extract_epi16(vout, 4);
7028 *c1 = (int8_t) _mm_extract_epi16(vout, 2);
7029 *c0 = (int8_t) _mm_cvtsi128_si32(vout);
7030 }
7031
7032 nc = 0;
7033 }
7034 } while (nc != 0);
7035 }
7036
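// QS8 element-wise addition: each operand is scaled by a fixed-point multiplier that
// is split into 16-bit halves (emulating a wider multiply with SSE2), the products
// are summed with a bias, shifted right, and requantized to int8 with min/max clamps.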
7037 void xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8(
7038 size_t n,
7039 const int8_t* input_a,
7040 const int8_t* input_b,
7041 int8_t* output,
7042 const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7043 {
7044 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
7045 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
7046 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
7047 const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
7048 const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
7049 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
7050 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
7051 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
7052 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
7053
7054 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
7055 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7056 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
7057 input_a += 8;
7058 input_b += 8;
7059
7060 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7061 vb01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vb01234567, vb01234567), 8);
7062
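    // Emulate the widening (value x multiplier) product with 16-bit multiplies:
    // mulhi_epu16/mullo_epi16 give the unsigned product halves, the extra mullo with
    // the multiplier's high half adds its contribution to the upper bits, and the
    // final subtraction corrects the high half for negative inputs.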
7063 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
7064 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
7065 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
7066 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
7067
7068 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
7069 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
7070
7071 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
7072 vbprod01234567hi = _mm_sub_epi16(vbprod01234567hi, _mm_and_si128(_mm_srai_epi16(vb01234567, 15), vb_multiplier_lo));
7073
7074 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
7075 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
7076
7077 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
7078 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
7079
7080 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
7081 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
7082
7083 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7084
7085 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7086
7087 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7088
7089 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7090
7091
7092 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7093 output += 8;
7094 }
7095 if XNN_UNLIKELY(n != 0) {
7096 {
7097 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7098 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
7099
7100 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7101 vb01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vb01234567, vb01234567), 8);
7102
7103 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
7104 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
7105 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
7106 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
7107
7108 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
7109 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
7110
7111 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
7112 vbprod01234567hi = _mm_sub_epi16(vbprod01234567hi, _mm_and_si128(_mm_srai_epi16(vb01234567, 15), vb_multiplier_lo));
7113
7114 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
7115 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
7116
7117 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
7118 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
7119
7120 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
7121 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
7122
7123 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7124 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7125 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7126
7127 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7128
7129 if (n & (4 * sizeof(int8_t))) {
7130 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7131 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7132 output += 4;
7133 }
7134 if (n & (2 * sizeof(int8_t))) {
7135 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
7136 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7137 output += 2;
7138 }
7139 if (n & (1 * sizeof(int8_t))) {
7140 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
7141 }
7142 }
7143 }
7144 }
7145
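// Variant of the addition kernel above for a broadcast second operand: the single
// value *input_b is multiplied by b_multiplier on the scalar side and folded into
// the bias, so the inner loop only has to rescale and accumulate input_a.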
7146 void xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8(
7147 size_t n,
7148 const int8_t* input_a,
7149 const int8_t* input_b,
7150 int8_t* output,
7151 const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7152 {
7153 const __m128i vbias = _mm_add_epi32(
7154 _mm_shuffle_epi32(_mm_cvtsi32_si128(params->sse2.b_multiplier * (int32_t) *input_b), _MM_SHUFFLE(0, 0, 0, 0)),
7155 _mm_load_si128((const __m128i*) params->sse2.bias));
7156 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
7157 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
7158 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
7159 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
7160 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
7161 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
7162
7163 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
7164 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7165 input_a += 8;
7166
7167 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7168
7169 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
7170 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
7171
7172 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
7173
7174 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
7175
7176 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
7177 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
7178
7179 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
7180 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
7181
7182 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7183
7184 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7185
7186 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7187
7188 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7189
7190
7191 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7192 output += 8;
7193 }
7194 if XNN_UNLIKELY(n != 0) {
7195 {
7196 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7197
7198 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7199
7200 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
7201 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
7202
7203 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
7204
7205 vaprod01234567hi = _mm_sub_epi16(vaprod01234567hi, _mm_and_si128(_mm_srai_epi16(va01234567, 15), va_multiplier_lo));
7206
7207 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
7208 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
7209
7210 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
7211 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
7212
7213 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7214 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7215 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7216
7217 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7218
7219 if (n & (4 * sizeof(int8_t))) {
7220 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7221 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7222 output += 4;
7223 }
7224 if (n & (2 * sizeof(int8_t))) {
7225 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
7226 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7227 output += 2;
7228 }
7229 if (n & (1 * sizeof(int8_t))) {
7230 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
7231 }
7232 }
7233 }
7234 }
7235
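// QS8 -> QS8 requantization (scale/zero-point conversion), 32 elements per
// iteration. Each sign-extended input is multiplied by a 16-bit fixed-point
// multiplier, subtracted from a 32-bit bias, arithmetic-shifted right by 8, and
// packed back to int8 with saturation; bias and multiplier are presumably
// precomputed from the input and output quantization parameters.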
7236 void xnn_qs8_vcvt_ukernel__sse2_x32(
7237 size_t n,
7238 const int8_t* x,
7239 int8_t* y,
7240 const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7241 {
7242 assert(n != 0);
7243 assert(n % sizeof(int8_t) == 0);
7244 assert(x != NULL);
7245 assert(y != NULL);
7246
7247 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
7248 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
7249 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
7250 const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
7251 const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
7252 x += 32;
7253
7254 const __m128i vm0 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx0);
7255 const __m128i vextx0 = _mm_unpacklo_epi8(vx0, vm0);
7256 const __m128i vextx1 = _mm_unpackhi_epi8(vx0, vm0);
7257 const __m128i vm1 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx1);
7258 const __m128i vextx2 = _mm_unpacklo_epi8(vx1, vm1);
7259 const __m128i vextx3 = _mm_unpackhi_epi8(vx1, vm1);
7260
7261 const __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier);
7262 const __m128i vprodhi0 = _mm_mulhi_epi16(vextx0, vmultiplier);
7263 const __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier);
7264 const __m128i vprodhi1 = _mm_mulhi_epi16(vextx1, vmultiplier);
7265 const __m128i vprodlo2 = _mm_mullo_epi16(vextx2, vmultiplier);
7266 const __m128i vprodhi2 = _mm_mulhi_epi16(vextx2, vmultiplier);
7267 const __m128i vprodlo3 = _mm_mullo_epi16(vextx3, vmultiplier);
7268 const __m128i vprodhi3 = _mm_mulhi_epi16(vextx3, vmultiplier);
7269
7270 __m128i vacc0 = _mm_unpacklo_epi16(vprodlo0, vprodhi0);
7271 __m128i vacc1 = _mm_unpackhi_epi16(vprodlo0, vprodhi0);
7272 __m128i vacc2 = _mm_unpacklo_epi16(vprodlo1, vprodhi1);
7273 __m128i vacc3 = _mm_unpackhi_epi16(vprodlo1, vprodhi1);
7274 __m128i vacc4 = _mm_unpacklo_epi16(vprodlo2, vprodhi2);
7275 __m128i vacc5 = _mm_unpackhi_epi16(vprodlo2, vprodhi2);
7276 __m128i vacc6 = _mm_unpacklo_epi16(vprodlo3, vprodhi3);
7277 __m128i vacc7 = _mm_unpackhi_epi16(vprodlo3, vprodhi3);
7278
7279 vacc0 = _mm_sub_epi32(vbias, vacc0);
7280 vacc1 = _mm_sub_epi32(vbias, vacc1);
7281 vacc2 = _mm_sub_epi32(vbias, vacc2);
7282 vacc3 = _mm_sub_epi32(vbias, vacc3);
7283 vacc4 = _mm_sub_epi32(vbias, vacc4);
7284 vacc5 = _mm_sub_epi32(vbias, vacc5);
7285 vacc6 = _mm_sub_epi32(vbias, vacc6);
7286 vacc7 = _mm_sub_epi32(vbias, vacc7);
7287
7288 vacc0 = _mm_srai_epi32(vacc0, 8);
7289 vacc1 = _mm_srai_epi32(vacc1, 8);
7290 vacc2 = _mm_srai_epi32(vacc2, 8);
7291 vacc3 = _mm_srai_epi32(vacc3, 8);
7292 vacc4 = _mm_srai_epi32(vacc4, 8);
7293 vacc5 = _mm_srai_epi32(vacc5, 8);
7294 vacc6 = _mm_srai_epi32(vacc6, 8);
7295 vacc7 = _mm_srai_epi32(vacc7, 8);
7296
7297 vacc0 = _mm_packs_epi32(vacc0, vacc1);
7298 vacc1 = _mm_packs_epi32(vacc2, vacc3);
7299 vacc2 = _mm_packs_epi32(vacc4, vacc5);
7300 vacc3 = _mm_packs_epi32(vacc6, vacc7);
7301
7302 const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
7303 const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
7304
7305 _mm_storeu_si128((__m128i*) y, vy0);
7306 _mm_storeu_si128((__m128i*) (y + 16), vy1);
7307 y += 32;
7308 }
7309 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
7310 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
7311 x += 16;
7312
7313 const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
7314 const __m128i vextx_lo = _mm_unpacklo_epi8(vx, vm);
7315 const __m128i vextx_hi = _mm_unpackhi_epi8(vx, vm);
7316
7317 const __m128i vprodlo_lo = _mm_mullo_epi16(vextx_lo, vmultiplier);
7318 const __m128i vprodlo_hi = _mm_mullo_epi16(vextx_hi, vmultiplier);
7319 const __m128i vprodhi_lo = _mm_mulhi_epi16(vextx_lo, vmultiplier);
7320 const __m128i vprodhi_hi = _mm_mulhi_epi16(vextx_hi, vmultiplier);
7321
7322 __m128i vacc_ll = _mm_unpacklo_epi16(vprodlo_lo, vprodhi_lo);
7323 __m128i vacc_lh = _mm_unpackhi_epi16(vprodlo_lo, vprodhi_lo);
7324 __m128i vacc_hl = _mm_unpacklo_epi16(vprodlo_hi, vprodhi_hi);
7325 __m128i vacc_hh = _mm_unpackhi_epi16(vprodlo_hi, vprodhi_hi);
7326
7327 vacc_ll = _mm_sub_epi32(vbias, vacc_ll);
7328 vacc_lh = _mm_sub_epi32(vbias, vacc_lh);
7329 vacc_hl = _mm_sub_epi32(vbias, vacc_hl);
7330 vacc_hh = _mm_sub_epi32(vbias, vacc_hh);
7331
7332 vacc_ll = _mm_srai_epi32(vacc_ll, 8);
7333 vacc_lh = _mm_srai_epi32(vacc_lh, 8);
7334 vacc_hl = _mm_srai_epi32(vacc_hl, 8);
7335 vacc_hh = _mm_srai_epi32(vacc_hh, 8);
7336
7337 const __m128i vacc_lo = _mm_packs_epi32(vacc_ll, vacc_lh);
7338 const __m128i vacc_hi = _mm_packs_epi32(vacc_hl, vacc_hh);
7339
7340 const __m128i vy = _mm_packs_epi16(vacc_lo, vacc_hi);
7341 _mm_storeu_si128((__m128i*) y, vy);
7342 y += 16;
7343 }
7344 if XNN_UNLIKELY(n != 0) {
7345 assert(n >= 1 * sizeof(int8_t));
7346 assert(n <= 15 * sizeof(int8_t));
7347
7348 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
7349
7350 const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
7351 const __m128i vextx_lo = _mm_unpacklo_epi8(vx, vm);
7352 const __m128i vextx_hi = _mm_unpackhi_epi8(vx, vm);
7353
7354 const __m128i vprodlo_lo = _mm_mullo_epi16(vextx_lo, vmultiplier);
7355 const __m128i vprodlo_hi = _mm_mullo_epi16(vextx_hi, vmultiplier);
7356 const __m128i vprodhi_lo = _mm_mulhi_epi16(vextx_lo, vmultiplier);
7357 const __m128i vprodhi_hi = _mm_mulhi_epi16(vextx_hi, vmultiplier);
7358
7359 __m128i vacc_ll = _mm_unpacklo_epi16(vprodlo_lo, vprodhi_lo);
7360 __m128i vacc_lh = _mm_unpackhi_epi16(vprodlo_lo, vprodhi_lo);
7361 __m128i vacc_hl = _mm_unpacklo_epi16(vprodlo_hi, vprodhi_hi);
7362 __m128i vacc_hh = _mm_unpackhi_epi16(vprodlo_hi, vprodhi_hi);
7363
7364 vacc_ll = _mm_sub_epi32(vbias, vacc_ll);
7365 vacc_lh = _mm_sub_epi32(vbias, vacc_lh);
7366 vacc_hl = _mm_sub_epi32(vbias, vacc_hl);
7367 vacc_hh = _mm_sub_epi32(vbias, vacc_hh);
7368
7369 vacc_ll = _mm_srai_epi32(vacc_ll, 8);
7370 vacc_lh = _mm_srai_epi32(vacc_lh, 8);
7371 vacc_hl = _mm_srai_epi32(vacc_hl, 8);
7372 vacc_hh = _mm_srai_epi32(vacc_hh, 8);
7373
7374 const __m128i vacc_lo = _mm_packs_epi32(vacc_ll, vacc_lh);
7375 const __m128i vacc_hi = _mm_packs_epi32(vacc_hl, vacc_hh);
7376
7377 __m128i vy = _mm_packs_epi16(vacc_lo, vacc_hi);
7378 if (n & (8 * sizeof(int8_t))) {
7379 _mm_storel_epi64((__m128i*) y, vy);
7380 vy = _mm_unpackhi_epi64(vy, vy);
7381 y += 8;
7382 }
7383 if (n & (4 * sizeof(int8_t))) {
7384 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
7385 vy = _mm_srli_epi64(vy, 32);
7386 y += 4;
7387 }
7388 uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
7389 if (n & (2 * sizeof(int8_t))) {
7390 unaligned_store_u16(y, (uint16_t) vy_lo);
7391 vy_lo >>= 16;
7392 y += 2;
7393 }
7394 if (n & (1 * sizeof(int8_t))) {
7395 *y = (int8_t) vy_lo;
7396 }
7397 }
7398 }
7399
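// QS8 leaky ReLU, 32 elements per iteration. For each lane the kernel picks one of
// two fixed-point multipliers (multiplier_base, optionally XORed with
// multiplier_diff) depending on whether the lane is above the input zero point,
// multiplies (input_zero_point - x) by it, rescales by 2^-8 with rounding, and adds
// the output zero point with saturation.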
7400 void xnn_qs8_vlrelu_ukernel__sse2_x32(
7401 size_t n,
7402 const int8_t* x,
7403 int8_t* y,
7404 const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7405 {
7406 assert(n != 0);
7407 assert(n % sizeof(int8_t) == 0);
7408 assert(x != NULL);
7409 assert(y != NULL);
7410
7411 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
7412 const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
7413 const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
7414 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
7415 const __m128i vzero = _mm_setzero_si128();
7416 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
7417 const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
7418 const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
7419 x += 32;
7420
7421 const __m128i vm0 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx0);
7422 __m128i vextx0 = _mm_unpacklo_epi8(vx0, vm0);
7423 __m128i vextx1 = _mm_unpackhi_epi8(vx0, vm0);
7424 const __m128i vm1 = _mm_cmpgt_epi8(_mm_setzero_si128(), vx1);
7425 __m128i vextx2 = _mm_unpacklo_epi8(vx1, vm1);
7426 __m128i vextx3 = _mm_unpackhi_epi8(vx1, vm1);
7427
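    // Compute (input_zero_point - x) per lane and build the per-lane multiplier:
    // lanes above the input zero point take multiplier_base ^ multiplier_diff, the
    // rest keep multiplier_base; the compare mask acts as the blend.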
7428 __m128i vmultiplier0 = _mm_cmpgt_epi16(vextx0, vinput_zero_point);
7429 vextx0 = _mm_sub_epi16(vinput_zero_point, vextx0);
7430 __m128i vmultiplier1 = _mm_cmpgt_epi16(vextx1, vinput_zero_point);
7431 vextx1 = _mm_sub_epi16(vinput_zero_point, vextx1);
7432 __m128i vmultiplier2 = _mm_cmpgt_epi16(vextx2, vinput_zero_point);
7433 vextx2 = _mm_sub_epi16(vinput_zero_point, vextx2);
7434 __m128i vmultiplier3 = _mm_cmpgt_epi16(vextx3, vinput_zero_point);
7435 vextx3 = _mm_sub_epi16(vinput_zero_point, vextx3);
7436
7437 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
7438 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
7439 vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
7440 vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
7441
7442 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
7443 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
7444 vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
7445 vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
7446
7447 __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier0);
7448 __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier1);
7449 __m128i vprodlo2 = _mm_mullo_epi16(vextx2, vmultiplier2);
7450 __m128i vprodlo3 = _mm_mullo_epi16(vextx3, vmultiplier3);
7451
7452 vprodlo0 = _mm_srli_epi16(vprodlo0, 7);
7453 __m128i vprodhi0 = _mm_mulhi_epi16(vextx0, vmultiplier0);
7454 vprodlo1 = _mm_srli_epi16(vprodlo1, 7);
7455 __m128i vprodhi1 = _mm_mulhi_epi16(vextx1, vmultiplier1);
7456 vprodlo2 = _mm_srli_epi16(vprodlo2, 7);
7457 __m128i vprodhi2 = _mm_mulhi_epi16(vextx2, vmultiplier2);
7458 vprodlo3 = _mm_srli_epi16(vprodlo3, 7);
7459 __m128i vprodhi3 = _mm_mulhi_epi16(vextx3, vmultiplier3);
7460
7461 vprodhi0 = _mm_slli_epi16(vprodhi0, 8);
7462 vprodlo0 = _mm_avg_epu16(vprodlo0, vzero);
7463 vprodhi1 = _mm_slli_epi16(vprodhi1, 8);
7464 vprodlo1 = _mm_avg_epu16(vprodlo1, vzero);
7465 vprodhi2 = _mm_slli_epi16(vprodhi2, 8);
7466 vprodlo2 = _mm_avg_epu16(vprodlo2, vzero);
7467 vprodhi3 = _mm_slli_epi16(vprodhi3, 8);
7468 vprodlo3 = _mm_avg_epu16(vprodlo3, vzero);
7469
7470 __m128i vacc0 = _mm_add_epi16(vprodlo0, vprodhi0);
7471 __m128i vacc1 = _mm_add_epi16(vprodlo1, vprodhi1);
7472 __m128i vacc2 = _mm_add_epi16(vprodlo2, vprodhi2);
7473 __m128i vacc3 = _mm_add_epi16(vprodlo3, vprodhi3);
7474
7475 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
7476 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
7477 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
7478 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
7479
7480 const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
7481 const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
7482
7483 _mm_storeu_si128((__m128i*) y, vy0);
7484 _mm_storeu_si128((__m128i*) (y + 16), vy1);
7485 y += 32;
7486 }
7487 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
7488 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
7489 x += 16;
7490
7491 const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
7492 __m128i vextx0 = _mm_unpacklo_epi8(vx, vm);
7493 __m128i vextx1 = _mm_unpackhi_epi8(vx, vm);
7494
7495 __m128i vmultiplier0 = _mm_cmpgt_epi16(vextx0, vinput_zero_point);
7496 __m128i vmultiplier1 = _mm_cmpgt_epi16(vextx1, vinput_zero_point);
7497 vextx0 = _mm_sub_epi16(vinput_zero_point, vextx0);
7498 vextx1 = _mm_sub_epi16(vinput_zero_point, vextx1);
7499
7500 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
7501 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
7502
7503 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
7504 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
7505
7506 __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier0);
7507 __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier1);
7508
7509 vprodlo0 = _mm_srli_epi16(vprodlo0, 7);
7510 vprodlo1 = _mm_srli_epi16(vprodlo1, 7);
7511 __m128i vprodhi0 = _mm_mulhi_epi16(vextx0, vmultiplier0);
7512 __m128i vprodhi1 = _mm_mulhi_epi16(vextx1, vmultiplier1);
7513
7514 vprodhi0 = _mm_slli_epi16(vprodhi0, 8);
7515 vprodhi1 = _mm_slli_epi16(vprodhi1, 8);
7516 vprodlo0 = _mm_avg_epu16(vprodlo0, vzero);
7517 vprodlo1 = _mm_avg_epu16(vprodlo1, vzero);
7518
7519 __m128i vacc0 = _mm_add_epi16(vprodlo0, vprodhi0);
7520 __m128i vacc1 = _mm_add_epi16(vprodlo1, vprodhi1);
7521
7522 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
7523 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
7524
7525 const __m128i vy = _mm_packs_epi16(vacc0, vacc1);
7526 _mm_storeu_si128((__m128i*) y, vy);
7527 y += 16;
7528 }
7529 if XNN_UNLIKELY(n != 0) {
7530 assert(n >= 1 * sizeof(int8_t));
7531 assert(n <= 15 * sizeof(int8_t));
7532
7533 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
7534
7535 const __m128i vm = _mm_cmpgt_epi8(_mm_setzero_si128(), vx);
7536 __m128i vextx0 = _mm_unpacklo_epi8(vx, vm);
7537 __m128i vextx1 = _mm_unpackhi_epi8(vx, vm);
7538
7539 __m128i vmultiplier0 = _mm_cmpgt_epi16(vextx0, vinput_zero_point);
7540 __m128i vmultiplier1 = _mm_cmpgt_epi16(vextx1, vinput_zero_point);
7541 vextx0 = _mm_sub_epi16(vinput_zero_point, vextx0);
7542 vextx1 = _mm_sub_epi16(vinput_zero_point, vextx1);
7543
7544 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
7545 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
7546
7547 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
7548 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
7549
7550 __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier0);
7551 __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier1);
7552
7553 vprodlo0 = _mm_srli_epi16(vprodlo0, 7);
7554 vprodlo1 = _mm_srli_epi16(vprodlo1, 7);
7555 __m128i vprodhi0 = _mm_mulhi_epi16(vextx0, vmultiplier0);
7556 __m128i vprodhi1 = _mm_mulhi_epi16(vextx1, vmultiplier1);
7557
7558 vprodhi0 = _mm_slli_epi16(vprodhi0, 8);
7559 vprodhi1 = _mm_slli_epi16(vprodhi1, 8);
7560 vprodlo0 = _mm_avg_epu16(vprodlo0, vzero);
7561 vprodlo1 = _mm_avg_epu16(vprodlo1, vzero);
7562
7563 __m128i vacc0 = _mm_add_epi16(vprodlo0, vprodhi0);
7564 __m128i vacc1 = _mm_add_epi16(vprodlo1, vprodhi1);
7565
7566 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
7567 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
7568
7569 __m128i vy = _mm_packs_epi16(vacc0, vacc1);
7570 if (n & (8 * sizeof(int8_t))) {
7571 _mm_storel_epi64((__m128i*) y, vy);
7572 vy = _mm_unpackhi_epi64(vy, vy);
7573 y += 8;
7574 }
7575 if (n & (4 * sizeof(int8_t))) {
7576 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
7577 vy = _mm_srli_epi64(vy, 32);
7578 y += 4;
7579 }
7580 uint32_t vy0 = (uint32_t) _mm_cvtsi128_si32(vy);
7581 if (n & (2 * sizeof(int8_t))) {
7582 unaligned_store_u16(y, (uint16_t) vy0);
7583 vy0 >>= 16;
7584 y += 2;
7585 }
7586 if (n & (1 * sizeof(int8_t))) {
7587 *y = (int8_t) vy0;
7588 }
7589 }
7590 }
7591
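// QS8 element-wise multiplication with float requantization. The zero-point-adjusted
// products are converted to float, scaled, rounded back to int32 with
// _mm_cvtps_epi32 (round-to-nearest-even under the default MXCSR rounding mode),
// then offset by the output zero point and clamped.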
7592 void xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8(
7593 size_t n,
7594 const int8_t* input_a,
7595 const int8_t* input_b,
7596 int8_t* output,
7597 const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7598
7599 {
7600 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
7601 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point);
7602 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7603 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7604 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
7605 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
7606
7607 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
7608 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7609 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
7610 input_a += 8;
7611 input_b += 8;
7612
7613 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7614 vb01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vb01234567, vb01234567), 8);
7615
7616 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
7617 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
7618
7619 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
7620 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
7621
7622 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
7623 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
7624
7625 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
7626 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
7627
7628 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7629 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7630
7631 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7632 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7633
7634 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7635
7636 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7637
7638 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7639
7640 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7641
7642
7643 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7644 output += 8;
7645 }
7646 if XNN_UNLIKELY(n != 0) {
7647 {
7648 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7649 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
7650
7651 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7652 vb01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vb01234567, vb01234567), 8);
7653
7654 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
7655 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
7656
7657 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
7658 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
7659
7660 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
7661 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
7662
7663 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
7664 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
7665
7666 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7667 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7668
7669 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7670 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7671
7672 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7673 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7674 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7675
7676 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7677
7678 if (n & (4 * sizeof(int8_t))) {
7679 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7680 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7681 output += 4;
7682 }
7683 if (n & (2 * sizeof(int8_t))) {
7684 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
7685 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7686 output += 2;
7687 }
7688 if (n & (1 * sizeof(int8_t))) {
7689 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
7690 }
7691 }
7692 }
7693 }
7694
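// Broadcast-operand variant of the multiplication kernel above: the single value
// *input_b is splatted across all lanes and has its zero point subtracted once,
// outside the loop.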
7695 void xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8(
7696 size_t n,
7697 const int8_t* input_a,
7698 const int8_t* input_b,
7699 int8_t* output,
7700 const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7701
7702 {
7703 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
7704 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
7705 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
7706 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
7707 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
7708
7709 __m128i vxb = _mm_sub_epi16(
7710 _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
7711 _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point));
7712 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
7713 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7714 input_a += 8;
7715
7716 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7717
7718 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
7719
7720 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
7721 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
7722
7723 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
7724 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
7725
7726 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
7727 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
7728
7729 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7730 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7731
7732 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7733 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7734
7735 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7736
7737 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7738
7739 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7740
7741 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7742
7743
7744 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7745 output += 8;
7746 }
7747 if XNN_UNLIKELY(n != 0) {
7748 {
7749 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
7750
7751 va01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(va01234567, va01234567), 8);
7752
7753 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
7754
7755 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
7756 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
7757
7758 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
7759 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
7760
7761 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
7762 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
7763
7764 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
7765 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
7766
7767 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
7768 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
7769
7770 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7771 vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
7772 vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
7773
7774 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7775
7776 if (n & (4 * sizeof(int8_t))) {
7777 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7778 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7779 output += 4;
7780 }
7781 if (n & (2 * sizeof(int8_t))) {
7782 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
7783 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7784 output += 2;
7785 }
7786 if (n & (1 * sizeof(int8_t))) {
7787 *output = (int8_t) _mm_cvtsi128_si32(vout0123456701234567);
7788 }
7789 }
7790 }
7791 }
7792
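// QU8 average pooling, multi-pass variant for pooling windows larger than 9
// elements, processing 8 channels at a time. Row sums are accumulated into a
// caller-provided int32 buffer across passes, then requantized on the last pass with
// a fixed-point multiply, rounding right shift, and clamp.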
7793 void xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8(
7794 size_t output_pixels,
7795 size_t kernel_elements,
7796 size_t channels,
7797 const uint8_t** input,
7798 size_t input_offset,
7799 const uint8_t* zero,
7800 int32_t* buffer,
7801 uint8_t* output,
7802 size_t input_increment,
7803 size_t output_increment,
7804 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7805 {
7806 assert(output_pixels != 0);
7807 assert(kernel_elements > 9);
7808 assert(channels != 0);
7809
7810 const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
7811 const __m128i vzero = _mm_setzero_si128();
7812 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
7813 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
7814 const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);
7815
7816 do {
7817 {
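    // First pass: sum the first 9 rows (plus the bias) into the int32 buffer.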
7818 const uint8_t* i0 = *input++;
7819 assert(i0 != NULL);
7820 if XNN_UNPREDICTABLE(i0 != zero) {
7821 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
7822 }
7823 const uint8_t* i1 = *input++;
7824 assert(i1 != NULL);
7825 if XNN_UNPREDICTABLE(i1 != zero) {
7826 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
7827 }
7828 const uint8_t* i2 = *input++;
7829 assert(i2 != NULL);
7830 if XNN_UNPREDICTABLE(i2 != zero) {
7831 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
7832 }
7833 const uint8_t* i3 = *input++;
7834 assert(i3 != NULL);
7835 if XNN_UNPREDICTABLE(i3 != zero) {
7836 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
7837 }
7838 const uint8_t* i4 = *input++;
7839 assert(i4 != NULL);
7840 if XNN_UNPREDICTABLE(i4 != zero) {
7841 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
7842 }
7843 const uint8_t* i5 = *input++;
7844 assert(i5 != NULL);
7845 if XNN_UNPREDICTABLE(i5 != zero) {
7846 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
7847 }
7848 const uint8_t* i6 = *input++;
7849 assert(i6 != NULL);
7850 if XNN_UNPREDICTABLE(i6 != zero) {
7851 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
7852 }
7853 const uint8_t* i7 = *input++;
7854 assert(i7 != NULL);
7855 if XNN_UNPREDICTABLE(i7 != zero) {
7856 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
7857 }
7858 const uint8_t* i8 = *input++;
7859 assert(i8 != NULL);
7860 if XNN_UNPREDICTABLE(i8 != zero) {
7861 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
7862 }
7863
7864 int32_t* b = buffer;
7865 for (size_t c = 0; c < channels; c += 8) {
7866 const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
7867 const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
7868 const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
7869 const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
7870 const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
7871 const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
7872 const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
7873 const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
7874 const __m128i vi8 = _mm_loadl_epi64((const __m128i*) i8); i8 += 8;
7875
7876 const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
7877 const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
7878 const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
7879 const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
7880 const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
7881 const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
7882 const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
7883 const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
7884 const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);
7885
7886 const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
7887 const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
7888 const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
7889 const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);
7890
7891 const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
7892 const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
7893 const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);
7894
7895 const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
7896 const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
7897
7898 _mm_store_si128((__m128i*) b, vacc_lo);
7899 _mm_store_si128((__m128i*) b + 1, vacc_hi);
7900 b += 8;
7901 }
7902 }
7903
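    // Intermediate passes: accumulate 8 more rows into the buffer per pass until at
    // most 8 rows remain.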
7904 size_t k = kernel_elements;
7905 for (k -= 9; k > 8; k -= 8) {
7906 const uint8_t* i0 = *input++;
7907 assert(i0 != NULL);
7908 if XNN_UNPREDICTABLE(i0 != zero) {
7909 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
7910 }
7911 const uint8_t* i1 = *input++;
7912 assert(i1 != NULL);
7913 if XNN_UNPREDICTABLE(i1 != zero) {
7914 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
7915 }
7916 const uint8_t* i2 = *input++;
7917 assert(i2 != NULL);
7918 if XNN_UNPREDICTABLE(i2 != zero) {
7919 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
7920 }
7921 const uint8_t* i3 = *input++;
7922 assert(i3 != NULL);
7923 if XNN_UNPREDICTABLE(i3 != zero) {
7924 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
7925 }
7926 const uint8_t* i4 = *input++;
7927 assert(i4 != NULL);
7928 if XNN_UNPREDICTABLE(i4 != zero) {
7929 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
7930 }
7931 const uint8_t* i5 = *input++;
7932 assert(i5 != NULL);
7933 if XNN_UNPREDICTABLE(i5 != zero) {
7934 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
7935 }
7936 const uint8_t* i6 = *input++;
7937 assert(i6 != NULL);
7938 if XNN_UNPREDICTABLE(i6 != zero) {
7939 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
7940 }
7941 const uint8_t* i7 = *input++;
7942 assert(i7 != NULL);
7943 if XNN_UNPREDICTABLE(i7 != zero) {
7944 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
7945 }
7946
7947 int32_t* b = buffer;
7948 for (size_t c = 0; c < channels; c += 8) {
7949 const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
7950 const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
7951 const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
7952 const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
7953 const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
7954 const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
7955 const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
7956 const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
7957 __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
7958 __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);
7959
7960 const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
7961 const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
7962 const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
7963 const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
7964 const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
7965 const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
7966 const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
7967 const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
7968
7969 const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
7970 const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
7971 const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
7972 const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);
7973
7974 const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
7975 const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
7976 const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);
7977
7978 vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
7979 vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
7980
7981 _mm_store_si128((__m128i*) b, vacc_lo);
7982 _mm_store_si128((__m128i*) b + 1, vacc_hi);
7983 b += 8;
7984 }
7985 }
7986
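    // Final pass: accumulate the last (up to 8) rows, then requantize and store the
    // uint8 outputs.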
7987 {
7988 const uint8_t* i0 = input[0];
7989 assert(i0 != NULL);
7990 const uint8_t* i1 = input[1];
7991 const uint8_t* i2 = input[2];
7992 const uint8_t* i3 = input[3];
7993 const uint8_t* i4 = input[4];
7994 const uint8_t* i5 = input[5];
7995 const uint8_t* i6 = input[6];
7996 const uint8_t* i7 = input[7];
7997 input = (const uint8_t**) ((uintptr_t) input + input_increment);
7998 if (k < 2) {
7999 i1 = zero;
8000 }
8001 assert(i1 != NULL);
8002 if (k <= 2) {
8003 i2 = zero;
8004 }
8005 assert(i2 != NULL);
8006 if (k < 4) {
8007 i3 = zero;
8008 }
8009 assert(i3 != NULL);
8010 if (k <= 4) {
8011 i4 = zero;
8012 }
8013 assert(i4 != NULL);
8014 if (k < 6) {
8015 i5 = zero;
8016 }
8017 assert(i5 != NULL);
8018 if (k <= 6) {
8019 i6 = zero;
8020 }
8021 assert(i6 != NULL);
8022 if (k < 8) {
8023 i7 = zero;
8024 }
8025 assert(i7 != NULL);
8026 if XNN_UNPREDICTABLE(i0 != zero) {
8027 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
8028 }
8029 if XNN_UNPREDICTABLE(i1 != zero) {
8030 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
8031 }
8032 if XNN_UNPREDICTABLE(i2 != zero) {
8033 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
8034 }
8035 if XNN_UNPREDICTABLE(i3 != zero) {
8036 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
8037 }
8038 if XNN_UNPREDICTABLE(i4 != zero) {
8039 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
8040 }
8041 if XNN_UNPREDICTABLE(i5 != zero) {
8042 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
8043 }
8044 if XNN_UNPREDICTABLE(i6 != zero) {
8045 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
8046 }
8047 if XNN_UNPREDICTABLE(i7 != zero) {
8048 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
8049 }
8050
8051 size_t c = channels;
8052 int32_t* b = buffer;
8053 while (c >= 8) {
8054 const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
8055 const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
8056 const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
8057 const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
8058 const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
8059 const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
8060 const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
8061 const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
8062 __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
8063 __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);
8064 b += 8;
8065
8066 const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
8067 const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
8068 const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
8069 const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
8070 const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
8071 const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
8072 const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
8073 const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
8074
8075 const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
8076 const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
8077 const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
8078 const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);
8079
8080 const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
8081 const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
8082 const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);
8083
8084 vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
8085 vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
8086
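        // Requantize: _mm_mul_epu32 only multiplies unsigned values, so take the
        // absolute value of each accumulator, scale it by the unsigned multiplier
        // with rounding and a right shift, then restore the sign with the
        // xor/subtract trick.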
8087 const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
8088 const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
8089
8090 const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
8091 const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
8092
8093 const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
8094 const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
8095
8096 const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
8097 const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
8098
8099 const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
8100 const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
8101
8102 const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
8103 const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
8104 const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
8105 const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
8106
8107 const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
8108 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
8109 const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
8110 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
8111
8112 const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
8113 const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
8114
8115 const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
8116 const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
8117
8118 __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
8119 vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
8120 vout = _mm_packus_epi16(vout, vout);
8121 vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
8122 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));
8123
8124 _mm_storel_epi64((__m128i*) output, vout);
8125 output += 8;
8126
8127 c -= 8;
8128 }
8129 if (c != 0) {
8130 const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
8131 const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
8132 const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
8133 const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
8134 const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
8135 const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
8136 const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
8137 const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7);
8138 __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
8139 __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);
8140
8141 const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
8142 const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
8143 const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
8144 const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
8145 const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
8146 const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
8147 const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
8148 const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
8149
8150 const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
8151 const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
8152 const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
8153 const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);
8154
8155 const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
8156 const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
8157 const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);
8158
8159 vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
8160 vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
8161
8162 const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
8163 const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
8164
8165 const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
8166 const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
8167
8168 const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
8169 const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
8170
8171 const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
8172 const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
8173
8174 const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
8175 const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
8176
8177 const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
8178 const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
8179 const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
8180 const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
8181
8182 const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
8183 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
8184 const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
8185 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
8186
8187 const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
8188 const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
8189
8190 const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
8191 const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
8192
8193 __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
8194 vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
8195 vout = _mm_packus_epi16(vout, vout);
8196 vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
8197 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));
8198
8199 if (c & 4) {
8200 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout));
8201 output += 4;
8202 vout = _mm_srli_epi64(vout, 32);
8203 }
8204 if (c & 2) {
8205 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout, 0));
8206 output += 2;
8207 vout = _mm_srli_epi32(vout, 16);
8208 }
8209 if (c & 1) {
8210 *output = (uint8_t) _mm_cvtsi128_si32(vout);
8211 output += 1;
8212 }
8213 }
8214 }
8215 output = (uint8_t*) ((uintptr_t) output + output_increment);
8216 } while (--output_pixels != 0);
8217 }
8218
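// QU8 average pooling, single-pass variant for pooling windows of at most 9
// elements; same requantization scheme as the multi-pass kernel above, without the
// intermediate int32 buffer.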
8219 void xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8(
8220 size_t output_pixels,
8221 size_t kernel_elements,
8222 size_t channels,
8223 const uint8_t** input,
8224 size_t input_offset,
8225 const uint8_t* zero,
8226 uint8_t* output,
8227 size_t input_increment,
8228 size_t output_increment,
8229 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8230 {
8231 assert(output_pixels != 0);
8232 assert(kernel_elements != 0);
8233 assert(kernel_elements <= 9);
8234 assert(channels != 0);
8235
8236 const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
8237 const __m128i vzero = _mm_setzero_si128();
8238 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
8239 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
8240 const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);
8241
8242 do {
8243 const uint8_t* i0 = input[0];
8244 assert(i0 != NULL);
8245 const uint8_t* i1 = input[1];
8246 const uint8_t* i2 = input[2];
8247 const uint8_t* i3 = input[3];
8248 const uint8_t* i4 = input[4];
8249 const uint8_t* i5 = input[5];
8250 const uint8_t* i6 = input[6];
8251 const uint8_t* i7 = input[7];
8252 const uint8_t* i8 = input[8];
8253 input = (const uint8_t**) ((uintptr_t) input + input_increment);
8254 if (kernel_elements < 2) {
8255 i1 = zero;
8256 }
8257 assert(i1 != NULL);
8258 if (kernel_elements <= 2) {
8259 i2 = zero;
8260 }
8261 assert(i2 != NULL);
8262 if (kernel_elements < 4) {
8263 i3 = zero;
8264 }
8265 assert(i3 != NULL);
8266 if (kernel_elements <= 4) {
8267 i4 = zero;
8268 }
8269 assert(i4 != NULL);
8270 if (kernel_elements < 6) {
8271 i5 = zero;
8272 }
8273 assert(i5 != NULL);
8274 if (kernel_elements <= 6) {
8275 i6 = zero;
8276 }
8277 assert(i6 != NULL);
8278 if (kernel_elements < 8) {
8279 i7 = zero;
8280 }
8281 assert(i7 != NULL);
8282 if (kernel_elements <= 8) {
8283 i8 = zero;
8284 }
8285 assert(i8 != NULL);
8286 if XNN_UNPREDICTABLE(i0 != zero) {
8287 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
8288 }
8289 if XNN_UNPREDICTABLE(i1 != zero) {
8290 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
8291 }
8292 if XNN_UNPREDICTABLE(i2 != zero) {
8293 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
8294 }
8295 if XNN_UNPREDICTABLE(i3 != zero) {
8296 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
8297 }
8298 if XNN_UNPREDICTABLE(i4 != zero) {
8299 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
8300 }
8301 if XNN_UNPREDICTABLE(i5 != zero) {
8302 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
8303 }
8304 if XNN_UNPREDICTABLE(i6 != zero) {
8305 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
8306 }
8307 if XNN_UNPREDICTABLE(i7 != zero) {
8308 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
8309 }
8310 if XNN_UNPREDICTABLE(i8 != zero) {
8311 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
8312 }
8313
8314 size_t c = channels;
8315 while (c >= 8) {
8316 const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
8317 const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
8318 const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
8319 const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
8320 const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
8321 const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
8322 const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
8323 const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
8324 const __m128i vi8 = _mm_loadl_epi64((const __m128i*) i8); i8 += 8;
8325
8326 const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
8327 const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
8328 const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
8329 const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
8330 const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
8331 const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
8332 const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
8333 const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
8334 const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);
8335
8336 const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
8337 const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
8338 const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
8339 const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);
8340
8341 const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
8342 const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
8343 const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);
8344
8345 const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
8346 const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
8347
8348 const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
8349 const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
8350
8351 const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
8352 const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
8353
8354 const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
8355 const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
8356
8357 const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
8358 const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
8359
8360 const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
8361 const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
8362
8363 const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
8364 const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
8365 const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
8366 const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
8367
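// Re-interleave the even/odd 64-bit results back into 32-bit vectors in the original lane order.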
8368 const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
8369 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
8370 const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
8371 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
8372
8373 const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
8374 const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
8375
8376 const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
8377 const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
8378
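// Pack to 16 bits with signed saturation, add the output zero point, pack to unsigned 8 bits,
// and clamp to [output_min, output_max] before storing 8 output bytes.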
8379 __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
8380 vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
8381 vout = _mm_packus_epi16(vout, vout);
8382 vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
8383 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));
8384
8385 _mm_storel_epi64((__m128i*) output, vout);
8386 output += 8;
8387
8388 c -= 8;
8389 }
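// Remainder: up to 7 trailing channels. The row loads below still read a full 8 bytes;
// only the valid low bytes of the result are stored at the end.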
8390 if (c != 0) {
8391 const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
8392 const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
8393 const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
8394 const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
8395 const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
8396 const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
8397 const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
8398 const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7);
8399 const __m128i vi8 = _mm_loadl_epi64((const __m128i*) i8);
8400
8401 const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
8402 const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
8403 const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
8404 const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
8405 const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
8406 const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
8407 const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
8408 const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
8409 const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);
8410
8411 const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
8412 const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
8413 const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
8414 const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);
8415
8416 const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
8417 const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
8418 const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);
8419
8420 const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
8421 const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
8422
8423 const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
8424 const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);
8425
8426 const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
8427 const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);
8428
8429 const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
8430 const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));
8431
8432 const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
8433 const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);
8434
8435 const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
8436 const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);
8437
8438 const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
8439 const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
8440 const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
8441 const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);
8442
8443 const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
8444 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
8445 const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
8446 _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));
8447
8448 const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
8449 const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));
8450
8451 const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
8452 const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);
8453
8454 __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
8455 vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
8456 vout = _mm_packus_epi16(vout, vout);
8457 vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
8458 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));
8459
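// Store the remaining channels 4, 2, then 1 byte at a time, shifting the packed result down
// after each partial store.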
8460 if (c & 4) {
8461 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout));
8462 output += 4;
8463 vout = _mm_srli_epi64(vout, 32);
8464 }
8465 if (c & 2) {
8466 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout, 0));
8467 output += 2;
8468 vout = _mm_srli_epi32(vout, 16);
8469 }
8470 if (c & 1) {
8471 *output = (uint8_t) _mm_cvtsi128_si32(vout);
8472 output += 1;
8473 }
8474 }
8475 output = (uint8_t*) ((uintptr_t) output + output_increment);
8476 } while (--output_pixels != 0);
8477 }
8478
8479 void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16(
8480 size_t channels,
8481 size_t output_width,
8482 const uint8_t** input,
8483 const void* weights,
8484 uint8_t* output,
8485 size_t input_stride,
8486 size_t output_increment,
8487 size_t input_offset,
8488 const uint8_t* zero,
8489 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8490 {
8491 assert(channels != 0);
8492 assert(output_width != 0);
8493
8494 do {
8495 const uint8_t* i0 = input[0];
8496 assert(i0 != NULL);
8497 if XNN_UNPREDICTABLE(i0 != zero) {
8498 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
8499 }
8500 const uint8_t* i1 = input[1];
8501 assert(i1 != NULL);
8502 if XNN_UNPREDICTABLE(i1 != zero) {
8503 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
8504 }
8505 const uint8_t* i2 = input[2];
8506 assert(i2 != NULL);
8507 if XNN_UNPREDICTABLE(i2 != zero) {
8508 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
8509 }
8510 const uint8_t* i3 = input[3];
8511 assert(i3 != NULL);
8512 if XNN_UNPREDICTABLE(i3 != zero) {
8513 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
8514 }
8515 const uint8_t* i4 = input[4];
8516 assert(i4 != NULL);
8517 if XNN_UNPREDICTABLE(i4 != zero) {
8518 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
8519 }
8520 const uint8_t* i5 = input[5];
8521 assert(i5 != NULL);
8522 if XNN_UNPREDICTABLE(i5 != zero) {
8523 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
8524 }
8525 const uint8_t* i6 = input[6];
8526 assert(i6 != NULL);
8527 if XNN_UNPREDICTABLE(i6 != zero) {
8528 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
8529 }
8530 const uint8_t* i7 = input[7];
8531 assert(i7 != NULL);
8532 if XNN_UNPREDICTABLE(i7 != zero) {
8533 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
8534 }
8535 const uint8_t* i8 = input[8];
8536 assert(i8 != NULL);
8537 if XNN_UNPREDICTABLE(i8 != zero) {
8538 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
8539 }
8540 const uint8_t* i9 = input[9];
8541 assert(i9 != NULL);
8542 if XNN_UNPREDICTABLE(i9 != zero) {
8543 i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
8544 }
8545 const uint8_t* i10 = input[10];
8546 assert(i10 != NULL);
8547 if XNN_UNPREDICTABLE(i10 != zero) {
8548 i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
8549 }
8550 const uint8_t* i11 = input[11];
8551 assert(i11 != NULL);
8552 if XNN_UNPREDICTABLE(i11 != zero) {
8553 i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
8554 }
8555 const uint8_t* i12 = input[12];
8556 assert(i12 != NULL);
8557 if XNN_UNPREDICTABLE(i12 != zero) {
8558 i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
8559 }
8560 const uint8_t* i13 = input[13];
8561 assert(i13 != NULL);
8562 if XNN_UNPREDICTABLE(i13 != zero) {
8563 i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
8564 }
8565 const uint8_t* i14 = input[14];
8566 assert(i14 != NULL);
8567 if XNN_UNPREDICTABLE(i14 != zero) {
8568 i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
8569 }
8570 const uint8_t* i15 = input[15];
8571 assert(i15 != NULL);
8572 if XNN_UNPREDICTABLE(i15 != zero) {
8573 i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
8574 }
8575 const uint8_t* i16 = input[16];
8576 assert(i16 != NULL);
8577 if XNN_UNPREDICTABLE(i16 != zero) {
8578 i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
8579 }
8580 const uint8_t* i17 = input[17];
8581 assert(i17 != NULL);
8582 if XNN_UNPREDICTABLE(i17 != zero) {
8583 i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
8584 }
8585 const uint8_t* i18 = input[18];
8586 assert(i18 != NULL);
8587 if XNN_UNPREDICTABLE(i18 != zero) {
8588 i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
8589 }
8590 const uint8_t* i19 = input[19];
8591 assert(i19 != NULL);
8592 if XNN_UNPREDICTABLE(i19 != zero) {
8593 i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
8594 }
8595 const uint8_t* i20 = input[20];
8596 assert(i20 != NULL);
8597 if XNN_UNPREDICTABLE(i20 != zero) {
8598 i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
8599 }
8600 const uint8_t* i21 = input[21];
8601 assert(i21 != NULL);
8602 if XNN_UNPREDICTABLE(i21 != zero) {
8603 i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
8604 }
8605 const uint8_t* i22 = input[22];
8606 assert(i22 != NULL);
8607 if XNN_UNPREDICTABLE(i22 != zero) {
8608 i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
8609 }
8610 const uint8_t* i23 = input[23];
8611 assert(i23 != NULL);
8612 if XNN_UNPREDICTABLE(i23 != zero) {
8613 i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
8614 }
8615 const uint8_t* i24 = input[24];
8616 assert(i24 != NULL);
8617 if XNN_UNPREDICTABLE(i24 != zero) {
8618 i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
8619 }
8620 input = (const uint8_t**) ((uintptr_t) input + input_stride);
8621
8622 size_t c = channels;
8623 const void* w = weights;
8624 const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
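// Main loop: 8 channels per iteration across all 25 taps. Inputs are zero-extended u8, kernel
// bytes have the kernel zero point subtracted, and each 16-bit mullo/mulhi pair is interleaved
// to reconstruct the full 32-bit products accumulated into vacc0123/vacc4567.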
8625 for (; c >= 8; c -= 8) {
8626 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
8627 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
8628
8629
8630 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
8631 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
8632 i0 += 8;
8633
8634 const __m128i vzero = _mm_setzero_si128();
8635 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
8636 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk0x01234567, vzero), vk_zero_point);
8637
8638 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
8639 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
8640
8641 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
8642 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
8643
8644 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
8645 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
8646 i1 += 8;
8647
8648 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
8649 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk1x01234567, vzero), vk_zero_point);
8650
8651 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
8652 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
8653
8654 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
8655 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
8656
8657 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
8658 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
8659 i2 += 8;
8660
8661 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
8662 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk2x01234567, vzero), vk_zero_point);
8663
8664 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
8665 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
8666
8667 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
8668 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
8669
8670 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
8671 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
8672 i3 += 8;
8673
8674 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
8675 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk3x01234567, vzero), vk_zero_point);
8676
8677 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
8678 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
8679
8680 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
8681 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
8682
8683 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
8684 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
8685 i4 += 8;
8686
8687 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
8688 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk4x01234567, vzero), vk_zero_point);
8689
8690 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
8691 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
8692
8693 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
8694 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
8695
8696 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
8697 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
8698 i5 += 8;
8699
8700 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
8701 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk5x01234567, vzero), vk_zero_point);
8702
8703 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
8704 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
8705
8706 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
8707 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
8708
8709 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
8710 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
8711 i6 += 8;
8712
8713 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
8714 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk6x01234567, vzero), vk_zero_point);
8715
8716 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
8717 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
8718
8719 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
8720 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
8721
8722 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
8723 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
8724 i7 += 8;
8725
8726 const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, vzero);
8727 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk7x01234567, vzero), vk_zero_point);
8728
8729 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
8730 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
8731
8732 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
8733 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
8734
8735 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
8736 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
8737 i8 += 8;
8738
8739 const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, vzero);
8740 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk8x01234567, vzero), vk_zero_point);
8741
8742 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
8743 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
8744
8745 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
8746 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
8747
8748 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
8749 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
8750 i9 += 8;
8751
8752 const __m128i vxi9x01234567 = _mm_unpacklo_epi8(vi9x01234567, vzero);
8753 const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk9x01234567, vzero), vk_zero_point);
8754
8755 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
8756 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
8757
8758 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
8759 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
8760
8761 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
8762 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
8763 i10 += 8;
8764
8765 const __m128i vxi10x01234567 = _mm_unpacklo_epi8(vi10x01234567, vzero);
8766 const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk10x01234567, vzero), vk_zero_point);
8767
8768 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
8769 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
8770
8771 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
8772 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
8773
8774 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
8775 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
8776 i11 += 8;
8777
8778 const __m128i vxi11x01234567 = _mm_unpacklo_epi8(vi11x01234567, vzero);
8779 const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk11x01234567, vzero), vk_zero_point);
8780
8781 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
8782 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
8783
8784 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
8785 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
8786
8787 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
8788 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
8789 i12 += 8;
8790
8791 const __m128i vxi12x01234567 = _mm_unpacklo_epi8(vi12x01234567, vzero);
8792 const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk12x01234567, vzero), vk_zero_point);
8793
8794 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
8795 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
8796
8797 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
8798 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
8799
8800 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
8801 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
8802 i13 += 8;
8803
8804 const __m128i vxi13x01234567 = _mm_unpacklo_epi8(vi13x01234567, vzero);
8805 const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk13x01234567, vzero), vk_zero_point);
8806
8807 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
8808 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
8809
8810 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
8811 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
8812
8813 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
8814 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
8815 i14 += 8;
8816
8817 const __m128i vxi14x01234567 = _mm_unpacklo_epi8(vi14x01234567, vzero);
8818 const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk14x01234567, vzero), vk_zero_point);
8819
8820 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
8821 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
8822
8823 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
8824 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
8825
8826 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
8827 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
8828 i15 += 8;
8829
8830 const __m128i vxi15x01234567 = _mm_unpacklo_epi8(vi15x01234567, vzero);
8831 const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk15x01234567, vzero), vk_zero_point);
8832
8833 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
8834 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
8835
8836 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
8837 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
8838
8839 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
8840 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
8841 i16 += 8;
8842
8843 const __m128i vxi16x01234567 = _mm_unpacklo_epi8(vi16x01234567, vzero);
8844 const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk16x01234567, vzero), vk_zero_point);
8845
8846 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
8847 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
8848
8849 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
8850 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
8851
8852 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
8853 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
8854 i17 += 8;
8855
8856 const __m128i vxi17x01234567 = _mm_unpacklo_epi8(vi17x01234567, vzero);
8857 const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk17x01234567, vzero), vk_zero_point);
8858
8859 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
8860 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
8861
8862 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
8863 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
8864
8865 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
8866 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
8867 i18 += 8;
8868
8869 const __m128i vxi18x01234567 = _mm_unpacklo_epi8(vi18x01234567, vzero);
8870 const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk18x01234567, vzero), vk_zero_point);
8871
8872 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
8873 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
8874
8875 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
8876 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
8877
8878 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
8879 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
8880 i19 += 8;
8881
8882 const __m128i vxi19x01234567 = _mm_unpacklo_epi8(vi19x01234567, vzero);
8883 const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk19x01234567, vzero), vk_zero_point);
8884
8885 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
8886 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
8887
8888 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
8889 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
8890
8891 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
8892 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
8893 i20 += 8;
8894
8895 const __m128i vxi20x01234567 = _mm_unpacklo_epi8(vi20x01234567, vzero);
8896 const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk20x01234567, vzero), vk_zero_point);
8897
8898 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
8899 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
8900
8901 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
8902 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
8903
8904 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
8905 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
8906 i21 += 8;
8907
8908 const __m128i vxi21x01234567 = _mm_unpacklo_epi8(vi21x01234567, vzero);
8909 const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk21x01234567, vzero), vk_zero_point);
8910
8911 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
8912 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
8913
8914 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
8915 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
8916
8917 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
8918 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
8919 i22 += 8;
8920
8921 const __m128i vxi22x01234567 = _mm_unpacklo_epi8(vi22x01234567, vzero);
8922 const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk22x01234567, vzero), vk_zero_point);
8923
8924 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
8925 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
8926
8927 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
8928 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
8929
8930 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
8931 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
8932 i23 += 8;
8933
8934 const __m128i vxi23x01234567 = _mm_unpacklo_epi8(vi23x01234567, vzero);
8935 const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk23x01234567, vzero), vk_zero_point);
8936
8937 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
8938 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
8939
8940 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
8941 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
8942
8943 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
8944 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
8945 i24 += 8;
8946
8947 const __m128i vxi24x01234567 = _mm_unpacklo_epi8(vi24x01234567, vzero);
8948 const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk24x01234567, vzero), vk_zero_point);
8949
8950 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
8951 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
8952
8953 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
8954 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
8955
8956 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 200 * sizeof(uint8_t));
8957
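// fp32 requantization: convert the accumulators to float, multiply by the scale loaded from
// params, clamp to output_max (stored as max - zero_point), and convert back to int32 with
// round-to-nearest under the default MXCSR rounding mode.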
8958 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
8959 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
8960
8961 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
8962 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
8963 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
8964
8965 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
8966 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
8967 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
8968
8969 vacc0123 = _mm_cvtps_epi32(vscaled0123);
8970 vacc4567 = _mm_cvtps_epi32(vscaled4567);
8971
8972 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
8973 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8974
8975 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
8976
8977 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
8978 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
8979
8980 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8981 output += 8;
8982 }
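// Remainder: 1-7 trailing channels. Same 25-tap accumulation as above, but without advancing
// the input pointers; partial 4/2/1-byte stores follow the requantization.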
8983 if XNN_UNLIKELY(c != 0) {
8984 {
8985 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
8986 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
8987
8988
8989 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
8990 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
8991
8992 const __m128i vzero = _mm_setzero_si128();
8993 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
8994 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk0x01234567, vzero), vk_zero_point);
8995
8996 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
8997 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
8998
8999 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
9000 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
9001
9002 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9003 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
9004
9005 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
9006 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk1x01234567, vzero), vk_zero_point);
9007
9008 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
9009 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
9010
9011 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
9012 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
9013
9014 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9015 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
9016
9017 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
9018 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk2x01234567, vzero), vk_zero_point);
9019
9020 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
9021 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
9022
9023 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
9024 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
9025
9026 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
9027 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
9028
9029 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
9030 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk3x01234567, vzero), vk_zero_point);
9031
9032 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
9033 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
9034
9035 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
9036 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
9037
9038 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
9039 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
9040
9041 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
9042 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk4x01234567, vzero), vk_zero_point);
9043
9044 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
9045 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
9046
9047 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
9048 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
9049
9050 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
9051 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
9052
9053 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
9054 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk5x01234567, vzero), vk_zero_point);
9055
9056 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
9057 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
9058
9059 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
9060 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
9061
9062 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
9063 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
9064
9065 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
9066 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk6x01234567, vzero), vk_zero_point);
9067
9068 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
9069 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
9070
9071 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
9072 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
9073
9074 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
9075 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
9076
9077 const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, vzero);
9078 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk7x01234567, vzero), vk_zero_point);
9079
9080 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
9081 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
9082
9083 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
9084 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
9085
9086 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
9087 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
9088
9089 const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, vzero);
9090 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk8x01234567, vzero), vk_zero_point);
9091
9092 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
9093 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
9094
9095 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
9096 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
9097
9098 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
9099 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
9100
9101 const __m128i vxi9x01234567 = _mm_unpacklo_epi8(vi9x01234567, vzero);
9102 const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk9x01234567, vzero), vk_zero_point);
9103
9104 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
9105 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
9106
9107 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
9108 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
9109
9110 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
9111 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
9112
9113 const __m128i vxi10x01234567 = _mm_unpacklo_epi8(vi10x01234567, vzero);
9114 const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk10x01234567, vzero), vk_zero_point);
9115
9116 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
9117 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
9118
9119 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
9120 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
9121
9122 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
9123 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
9124
9125 const __m128i vxi11x01234567 = _mm_unpacklo_epi8(vi11x01234567, vzero);
9126 const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk11x01234567, vzero), vk_zero_point);
9127
9128 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
9129 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
9130
9131 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
9132 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
9133
9134 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
9135 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
9136
9137 const __m128i vxi12x01234567 = _mm_unpacklo_epi8(vi12x01234567, vzero);
9138 const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk12x01234567, vzero), vk_zero_point);
9139
9140 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
9141 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
9142
9143 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
9144 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
9145
9146 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
9147 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
9148
9149 const __m128i vxi13x01234567 = _mm_unpacklo_epi8(vi13x01234567, vzero);
9150 const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk13x01234567, vzero), vk_zero_point);
9151
9152 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
9153 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
9154
9155 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
9156 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
9157
9158 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
9159 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
9160
9161 const __m128i vxi14x01234567 = _mm_unpacklo_epi8(vi14x01234567, vzero);
9162 const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk14x01234567, vzero), vk_zero_point);
9163
9164 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
9165 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
9166
9167 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
9168 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
9169
9170 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
9171 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
9172
9173 const __m128i vxi15x01234567 = _mm_unpacklo_epi8(vi15x01234567, vzero);
9174 const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk15x01234567, vzero), vk_zero_point);
9175
9176 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
9177 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
9178
9179 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
9180 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
9181
9182 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
9183 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
9184
9185 const __m128i vxi16x01234567 = _mm_unpacklo_epi8(vi16x01234567, vzero);
9186 const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk16x01234567, vzero), vk_zero_point);
9187
9188 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
9189 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
9190
9191 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
9192 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
9193
9194 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
9195 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
9196
9197 const __m128i vxi17x01234567 = _mm_unpacklo_epi8(vi17x01234567, vzero);
9198 const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk17x01234567, vzero), vk_zero_point);
9199
9200 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
9201 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
9202
9203 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
9204 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
9205
9206 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
9207 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
9208
9209 const __m128i vxi18x01234567 = _mm_unpacklo_epi8(vi18x01234567, vzero);
9210 const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk18x01234567, vzero), vk_zero_point);
9211
9212 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
9213 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
9214
9215 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
9216 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
9217
9218 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
9219 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
9220
9221 const __m128i vxi19x01234567 = _mm_unpacklo_epi8(vi19x01234567, vzero);
9222 const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk19x01234567, vzero), vk_zero_point);
9223
9224 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
9225 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
9226
9227 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
9228 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
9229
9230 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
9231 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
9232
9233 const __m128i vxi20x01234567 = _mm_unpacklo_epi8(vi20x01234567, vzero);
9234 const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk20x01234567, vzero), vk_zero_point);
9235
9236 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
9237 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
9238
9239 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
9240 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
9241
9242 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
9243 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
9244
9245 const __m128i vxi21x01234567 = _mm_unpacklo_epi8(vi21x01234567, vzero);
9246 const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk21x01234567, vzero), vk_zero_point);
9247
9248 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
9249 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
9250
9251 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
9252 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
9253
9254 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
9255 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
9256
9257 const __m128i vxi22x01234567 = _mm_unpacklo_epi8(vi22x01234567, vzero);
9258 const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk22x01234567, vzero), vk_zero_point);
9259
9260 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
9261 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
9262
9263 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
9264 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
9265
9266 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
9267 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
9268
9269 const __m128i vxi23x01234567 = _mm_unpacklo_epi8(vi23x01234567, vzero);
9270 const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk23x01234567, vzero), vk_zero_point);
9271
9272 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
9273 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
9274
9275 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
9276 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
9277
9278 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
9279 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
9280
9281 const __m128i vxi24x01234567 = _mm_unpacklo_epi8(vi24x01234567, vzero);
9282 const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk24x01234567, vzero), vk_zero_point);
9283
9284 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
9285 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
9286
9287 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
9288 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
9289
9290
9291 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
9292 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
9293
9294 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
9295 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
9296 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
9297
9298 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
9299 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
9300 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
9301
9302 vacc0123 = _mm_cvtps_epi32(vscaled0123);
9303 vacc4567 = _mm_cvtps_epi32(vscaled4567);
9304
9305
9306 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
9307 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9308
9309 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
9310
9311 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
9312
9313 if (c & 4) {
9314 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
9315 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
9316 output += 4;
9317 }
9318 if (c & 2) {
9319 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
9320 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
9321 output += 2;
9322 }
9323 if (c & 1) {
9324 *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
9325 output += 1;
9326 }
9327 }
9328 }
9329
9330 output = (uint8_t*) ((uintptr_t) output + output_increment);
9331 } while (--output_width != 0);
9332 }
9333
9334 void xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16(
9335 size_t channels,
9336 size_t output_width,
9337 const uint8_t** input,
9338 const void* weights,
9339 uint8_t* output,
9340 size_t input_stride,
9341 size_t output_increment,
9342 size_t input_offset,
9343 const uint8_t* zero,
9344 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9345 {
9346 assert(channels != 0);
9347 assert(output_width != 0);
9348
9349 do {
9350 const uint8_t* i0 = input[0];
9351 assert(i0 != NULL);
9352 if XNN_UNPREDICTABLE(i0 != zero) {
9353 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
9354 }
9355 const uint8_t* i1 = input[1];
9356 assert(i1 != NULL);
9357 if XNN_UNPREDICTABLE(i1 != zero) {
9358 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
9359 }
9360 const uint8_t* i2 = input[2];
9361 assert(i2 != NULL);
9362 if XNN_UNPREDICTABLE(i2 != zero) {
9363 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
9364 }
9365 const uint8_t* i3 = input[3];
9366 assert(i3 != NULL);
9367 if XNN_UNPREDICTABLE(i3 != zero) {
9368 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
9369 }
9370 const uint8_t* i4 = input[4];
9371 assert(i4 != NULL);
9372 if XNN_UNPREDICTABLE(i4 != zero) {
9373 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
9374 }
9375 const uint8_t* i5 = input[5];
9376 assert(i5 != NULL);
9377 if XNN_UNPREDICTABLE(i5 != zero) {
9378 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
9379 }
9380 const uint8_t* i6 = input[6];
9381 assert(i6 != NULL);
9382 if XNN_UNPREDICTABLE(i6 != zero) {
9383 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
9384 }
9385 const uint8_t* i7 = input[7];
9386 assert(i7 != NULL);
9387 if XNN_UNPREDICTABLE(i7 != zero) {
9388 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
9389 }
9390 const uint8_t* i8 = input[8];
9391 assert(i8 != NULL);
9392 if XNN_UNPREDICTABLE(i8 != zero) {
9393 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
9394 }
9395 input = (const uint8_t**) ((uintptr_t) input + input_stride);
9396
9397 size_t c = channels;
9398 const void* w = weights;
9399 const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
9400 for (; c >= 8; c -= 8) {
9401 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
9402 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
9403
9404
9405 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
9406 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
9407 i0 += 8;
9408
9409 const __m128i vzero = _mm_setzero_si128();
9410 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
9411 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk0x01234567, vzero), vk_zero_point);
9412
9413 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
9414 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
9415
9416 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
9417 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
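      // SSE2 has no widening 16x16->32 multiply other than _mm_madd_epi16, so
      // each tap computes the low and high halves of the products separately
      // (_mm_mullo_epi16/_mm_mulhi_epi16) and re-interleaves them into 32-bit
      // lanes with unpacklo/unpackhi before accumulating. The same pattern
      // repeats for all nine taps below.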
9418
9419 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9420 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
9421 i1 += 8;
9422
9423 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
9424 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk1x01234567, vzero), vk_zero_point);
9425
9426 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
9427 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
9428
9429 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
9430 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
9431
9432 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9433 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
9434 i2 += 8;
9435
9436 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
9437 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk2x01234567, vzero), vk_zero_point);
9438
9439 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
9440 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
9441
9442 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
9443 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
9444
9445 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
9446 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
9447 i3 += 8;
9448
9449 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
9450 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk3x01234567, vzero), vk_zero_point);
9451
9452 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
9453 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
9454
9455 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
9456 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
9457
9458 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
9459 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
9460 i4 += 8;
9461
9462 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
9463 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk4x01234567, vzero), vk_zero_point);
9464
9465 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
9466 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
9467
9468 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
9469 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
9470
9471 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
9472 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
9473 i5 += 8;
9474
9475 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
9476 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk5x01234567, vzero), vk_zero_point);
9477
9478 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
9479 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
9480
9481 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
9482 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
9483
9484 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
9485 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
9486 i6 += 8;
9487
9488 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
9489 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk6x01234567, vzero), vk_zero_point);
9490
9491 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
9492 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
9493
9494 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
9495 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
9496
9497 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
9498 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
9499 i7 += 8;
9500
9501 const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, vzero);
9502 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk7x01234567, vzero), vk_zero_point);
9503
9504 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
9505 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
9506
9507 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
9508 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
9509
9510 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
9511 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
9512 i8 += 8;
9513
9514 const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, vzero);
9515 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk8x01234567, vzero), vk_zero_point);
9516
9517 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
9518 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
9519
9520 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
9521 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
9522
9523 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(uint8_t));
9524
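      // Requantization, out = clamp(round(acc * scale) + output_zero_point, output_min, output_max):
      // the int32 accumulators are converted to float, multiplied by the
      // requantization scale, clamped against (output_max - output_zero_point)
      // while still in float, rounded back to int32, offset by the output zero
      // point with saturating 16-bit adds, and packed to uint8 with a final
      // clamp against output_min.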
9525 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
9526 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
9527
9528 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
9529 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
9530 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
9531
9532 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
9533 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
9534 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
9535
9536 vacc0123 = _mm_cvtps_epi32(vscaled0123);
9537 vacc4567 = _mm_cvtps_epi32(vscaled4567);
9538
9539 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
9540 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9541
9542 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
9543
9544 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
9545 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
9546
9547 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
9548 output += 8;
9549 }
9550 if XNN_UNLIKELY(c != 0) {
9551 {
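      // Remainder path for the final 1-7 channels: the 8-wide arithmetic is
      // reused as-is (reads past the end are tolerated, see XNN_OOB_READS) and
      // only the valid low bytes of the packed result are stored.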
9552 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
9553 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
9554
9555
9556 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
9557 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
9558
9559 const __m128i vzero = _mm_setzero_si128();
9560 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
9561 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk0x01234567, vzero), vk_zero_point);
9562
9563 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
9564 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
9565
9566 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
9567 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
9568
9569 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9570 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
9571
9572 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
9573 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk1x01234567, vzero), vk_zero_point);
9574
9575 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
9576 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
9577
9578 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
9579 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
9580
9581 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9582 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
9583
9584 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
9585 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk2x01234567, vzero), vk_zero_point);
9586
9587 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
9588 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
9589
9590 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
9591 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
9592
9593 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
9594 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
9595
9596 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
9597 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk3x01234567, vzero), vk_zero_point);
9598
9599 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
9600 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
9601
9602 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
9603 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
9604
9605 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
9606 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
9607
9608 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
9609 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk4x01234567, vzero), vk_zero_point);
9610
9611 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
9612 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
9613
9614 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
9615 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
9616
9617 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
9618 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
9619
9620 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
9621 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk5x01234567, vzero), vk_zero_point);
9622
9623 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
9624 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
9625
9626 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
9627 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
9628
9629 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
9630 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
9631
9632 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
9633 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk6x01234567, vzero), vk_zero_point);
9634
9635 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
9636 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
9637
9638 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
9639 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
9640
9641 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
9642 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
9643
9644 const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, vzero);
9645 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk7x01234567, vzero), vk_zero_point);
9646
9647 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
9648 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
9649
9650 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
9651 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
9652
9653 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
9654 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
9655
9656 const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, vzero);
9657 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_unpacklo_epi8(vk8x01234567, vzero), vk_zero_point);
9658
9659 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
9660 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
9661
9662 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
9663 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
9664
9665
9666 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
9667 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
9668
9669 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
9670 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
9671 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
9672
9673 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
9674 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
9675 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
9676
9677 vacc0123 = _mm_cvtps_epi32(vscaled0123);
9678 vacc4567 = _mm_cvtps_epi32(vscaled4567);
9679
9680
9681 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
9682 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9683
9684 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
9685
9686 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
9687
9688 if (c & 4) {
9689 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
9690 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
9691 output += 4;
9692 }
9693 if (c & 2) {
9694 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
9695 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
9696 output += 2;
9697 }
9698 if (c & 1) {
9699 *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
9700 output += 1;
9701 }
9702 }
9703 }
9704
9705 output = (uint8_t*) ((uintptr_t) output + output_increment);
9706 } while (--output_width != 0);
9707 }
9708
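// Converts uint8 quantized values to float without int-to-float conversion
// instructions: each zero-extended 16-bit value is interleaved with a "magic"
// exponent word so that the 32-bit lane, reinterpreted as float, equals a
// large power-of-two constant plus x. Subtracting magic_bias (that constant
// with the input zero point folded in) leaves x - zero_point, which is then
// multiplied by the scale.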
9709 void xnn_qu8_f32_vcvt_ukernel__sse2_x32(
9710 size_t n,
9711 const uint8_t* x,
9712 float* y,
9713 const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9714 {
9715 assert(n != 0);
9716 assert(n % sizeof(uint8_t) == 0);
9717 assert(x != NULL);
9718 assert(y != NULL);
9719
9720 const __m128i vmagic_exp = _mm_load_si128((const __m128i*) params->sse2.magic_exp);
9721 const __m128 vmagic_bias = _mm_load_ps(params->sse2.magic_bias);
9722 const __m128 vscale = _mm_load_ps(params->sse2.scale);
9723 const __m128i vzero = _mm_setzero_si128();
9724 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
9725 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) x);
9726 __m128i vx89ABCDEF = _mm_loadl_epi64((const __m128i*) (x + 8));
9727 __m128i vxGHIJKLMN = _mm_loadl_epi64((const __m128i*) (x + 16));
9728 __m128i vxOPQRSTUV = _mm_loadl_epi64((const __m128i*) (x + 24));
9729 x += 32;
9730
9731
9732 vx01234567 = _mm_unpacklo_epi8(vx01234567, vzero);
9733 vx89ABCDEF = _mm_unpacklo_epi8(vx89ABCDEF, vzero);
9734 vxGHIJKLMN = _mm_unpacklo_epi8(vxGHIJKLMN, vzero);
9735 vxOPQRSTUV = _mm_unpacklo_epi8(vxOPQRSTUV, vzero);
9736
9737 __m128 vy0123 = _mm_castsi128_ps(_mm_unpacklo_epi16(vx01234567, vmagic_exp));
9738 __m128 vy4567 = _mm_castsi128_ps(_mm_unpackhi_epi16(vx01234567, vmagic_exp));
9739 __m128 vy89AB = _mm_castsi128_ps(_mm_unpacklo_epi16(vx89ABCDEF, vmagic_exp));
9740 __m128 vyCDEF = _mm_castsi128_ps(_mm_unpackhi_epi16(vx89ABCDEF, vmagic_exp));
9741 __m128 vyGHIJ = _mm_castsi128_ps(_mm_unpacklo_epi16(vxGHIJKLMN, vmagic_exp));
9742 __m128 vyKLMN = _mm_castsi128_ps(_mm_unpackhi_epi16(vxGHIJKLMN, vmagic_exp));
9743 __m128 vyOPQR = _mm_castsi128_ps(_mm_unpacklo_epi16(vxOPQRSTUV, vmagic_exp));
9744 __m128 vySTUV = _mm_castsi128_ps(_mm_unpackhi_epi16(vxOPQRSTUV, vmagic_exp));
9745
9746 vy0123 = _mm_sub_ps(vy0123, vmagic_bias);
9747 vy4567 = _mm_sub_ps(vy4567, vmagic_bias);
9748 vy89AB = _mm_sub_ps(vy89AB, vmagic_bias);
9749 vyCDEF = _mm_sub_ps(vyCDEF, vmagic_bias);
9750 vyGHIJ = _mm_sub_ps(vyGHIJ, vmagic_bias);
9751 vyKLMN = _mm_sub_ps(vyKLMN, vmagic_bias);
9752 vyOPQR = _mm_sub_ps(vyOPQR, vmagic_bias);
9753 vySTUV = _mm_sub_ps(vySTUV, vmagic_bias);
9754
9755 vy0123 = _mm_mul_ps(vy0123, vscale);
9756 vy4567 = _mm_mul_ps(vy4567, vscale);
9757 vy89AB = _mm_mul_ps(vy89AB, vscale);
9758 vyCDEF = _mm_mul_ps(vyCDEF, vscale);
9759 vyGHIJ = _mm_mul_ps(vyGHIJ, vscale);
9760 vyKLMN = _mm_mul_ps(vyKLMN, vscale);
9761 vyOPQR = _mm_mul_ps(vyOPQR, vscale);
9762 vySTUV = _mm_mul_ps(vySTUV, vscale);
9763
9764 _mm_storeu_ps(y, vy0123);
9765 _mm_storeu_ps(y + 4, vy4567);
9766 _mm_storeu_ps(y + 8, vy89AB);
9767 _mm_storeu_ps(y + 12, vyCDEF);
9768 _mm_storeu_ps(y + 16, vyGHIJ);
9769 _mm_storeu_ps(y + 20, vyKLMN);
9770 _mm_storeu_ps(y + 24, vyOPQR);
9771 _mm_storeu_ps(y + 28, vySTUV);
9772 y += 32;
9773 }
9774 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
9775 __m128i vx = _mm_loadl_epi64((const __m128i*) x);
9776 vx = _mm_unpacklo_epi8(vx, vzero);
9777 x += 8;
9778
9779 __m128 vy_lo = _mm_castsi128_ps(_mm_unpacklo_epi16(vx, vmagic_exp));
9780 __m128 vy_hi = _mm_castsi128_ps(_mm_unpackhi_epi16(vx, vmagic_exp));
9781
9782 vy_lo = _mm_sub_ps(vy_lo, vmagic_bias);
9783 vy_hi = _mm_sub_ps(vy_hi, vmagic_bias);
9784
9785 vy_lo = _mm_mul_ps(vy_lo, vscale);
9786 vy_hi = _mm_mul_ps(vy_hi, vscale);
9787
9788 _mm_storeu_ps(y, vy_lo);
9789 _mm_storeu_ps(y + 4, vy_hi);
9790 y += 8;
9791 }
9792 if XNN_UNLIKELY(n != 0) {
9793 assert(n >= 1 * sizeof(uint8_t));
9794 assert(n <= 7 * sizeof(uint8_t));
9795
9796 __m128i vx = _mm_loadl_epi64((const __m128i*) x);
9797 vx = _mm_unpacklo_epi8(vx, vzero);
9798
9799 __m128 vy = _mm_castsi128_ps(_mm_unpacklo_epi16(vx, vmagic_exp));
9800 vy = _mm_sub_ps(vy, vmagic_bias);
9801 vy = _mm_mul_ps(vy, vscale);
9802
9803 if (n & (4 * sizeof(uint8_t))) {
9804 _mm_storeu_ps(y, vy);
9805 vy = _mm_castsi128_ps(_mm_unpackhi_epi16(vx, vmagic_exp));
9806 vy = _mm_sub_ps(vy, vmagic_bias);
9807 vy = _mm_mul_ps(vy, vscale);
9808 y += 4;
9809 }
9810 if (n & (2 * sizeof(uint8_t))) {
9811 _mm_storel_pi((__m64*) y, vy);
9812 vy = _mm_movehl_ps(vy, vy);
9813 y += 2;
9814 }
9815 if (n & (1 * sizeof(uint8_t))) {
9816 _mm_store_ss(y, vy);
9817 }
9818 }
9819 }
9820
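// Multi-pass global average pooling for more than 7 rows: the first pass sums
// 7 rows of up to 8 channels into the int32 scratch buffer (seeded with the
// init bias from the params), each middle pass adds 7 more rows into the
// buffer, and the final pass adds the remaining 1-7 rows (redirecting
// exhausted row pointers to the zero buffer) and requantizes to uint8.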
9821 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8(
9822 size_t rows,
9823 size_t channels,
9824 const uint8_t* input,
9825 size_t input_stride,
9826 const uint8_t* zero,
9827 int32_t* buffer,
9828 uint8_t* output,
9829 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9830 {
9831 assert(rows > 7);
9832 assert(channels != 0);
9833
9834 const uint8_t* i0 = input;
9835 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
9836 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
9837 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
9838 const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
9839 const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
9840 const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
9841 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
9842
9843 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
9844 const __m128i vzero = _mm_setzero_si128();
9845 int32_t* b = buffer;
9846 size_t c = channels;
9847 for (; c != 0; c = doz(c, 8)) {
9848
9849 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
9850 i0 += 8;
9851
9852 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
9853 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9854 i1 += 8;
9855
9856 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
9857 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9858 i2 += 8;
9859
9860 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
9861 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
9862 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
9863 i3 += 8;
9864
9865 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
9866 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
9867 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
9868 i4 += 8;
9869
9870 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
9871 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
9872 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
9873 i5 += 8;
9874
9875 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
9876 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
9877 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
9878 i6 += 8;
9879
9880 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
9881 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
9882
9883 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
9884
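    // Seven rows of uint8 data sum to at most 7 * 255 = 1785, so the 16-bit
    // accumulator cannot overflow; it is only widened to 32 bits here, before
    // the per-channel init bias is added.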
9885 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
9886 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
9887
9888 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
9889 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
9890
9891 _mm_store_si128((__m128i*) b, vacc0123);
9892 _mm_store_si128((__m128i*) (b + 4), vacc4567);
9893 b += 8;
9894 }
9895
9896 for (rows -= 7; rows > 7; rows -= 7) {
9897 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
9898 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
9899 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
9900 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
9901 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
9902 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
9903 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
9904
9905 int32_t* b = buffer;
9906 size_t c = channels;
9907 for (; c != 0; c = doz(c, 8)) {
9908
9909 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
9910 i0 += 8;
9911
9912 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
9913 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9914 i1 += 8;
9915
9916 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
9917 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9918 i2 += 8;
9919
9920 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
9921 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
9922 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
9923 i3 += 8;
9924
9925 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
9926 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
9927 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
9928 i4 += 8;
9929
9930 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
9931 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
9932 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
9933 i5 += 8;
9934
9935 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
9936 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
9937 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
9938 i6 += 8;
9939
9940 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
9941 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
9942
9943 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
9944
9945 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
9946 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
9947
9948 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b));
9949 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4)));
9950
9951 _mm_store_si128((__m128i*) b, vacc0123);
9952 _mm_store_si128((__m128i*) (b + 4), vacc4567);
9953 b += 8;
9954 }
9955 }
9956
9957 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
9958 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
9959 if XNN_UNPREDICTABLE(rows < 2) {
9960 i1 = zero;
9961 }
9962 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
9963 if XNN_UNPREDICTABLE(rows <= 2) {
9964 i2 = zero;
9965 }
9966 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
9967 if XNN_UNPREDICTABLE(rows < 4) {
9968 i3 = zero;
9969 }
9970 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
9971 if XNN_UNPREDICTABLE(rows <= 4) {
9972 i4 = zero;
9973 }
9974 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
9975 if XNN_UNPREDICTABLE(rows < 6) {
9976 i5 = zero;
9977 }
9978 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
9979 if XNN_UNPREDICTABLE(rows <= 6) {
9980 i6 = zero;
9981 }
9982
9983 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
9984 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
9985 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
9986 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
9987 for (; channels >= 8; channels -= 8) {
9988
9989 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
9990 i0 += 8;
9991
9992 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
9993 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9994 i1 += 8;
9995
9996 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
9997 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9998 i2 += 8;
9999
10000 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
10001 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
10002 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10003 i3 += 8;
10004
10005 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
10006 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
10007 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10008 i4 += 8;
10009
10010 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
10011 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
10012 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10013 i5 += 8;
10014
10015 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
10016 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
10017 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10018 i6 += 8;
10019
10020 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
10021 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
10022
10023 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
10024
10025 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
10026 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
10027
10028 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
10029 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
10030 buffer += 8;
10031
10032 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
10033 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
10034
10035 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
10036 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
10037
10038 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
10039 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
10040
10041 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
10042 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
10043
10044 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10045
10046
10047 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10048
10049 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
10050
10051 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
10052 output += 8;
10053 }
10054 if XNN_UNLIKELY(channels != 0) {
10055 {
10056
10057 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10058 i0 += 8;
10059
10060 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10061 i1 += 8;
10062
10063 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
10064 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10065 i2 += 8;
10066
10067 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
10068 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10069 i3 += 8;
10070
10071 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
10072 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
10073 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10074 i4 += 8;
10075
10076 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
10077 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
10078 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10079 i5 += 8;
10080
10081 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
10082 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
10083 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10084 i6 += 8;
10085
10086 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
10087 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
10088
10089 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
10090 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
10091
10092 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
10093
10094 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
10095 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
10096
10097 vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer));
10098 vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4)));
10099 buffer += 8;
10100
10101 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
10102 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
10103
10104 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
10105 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
10106
10107 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
10108 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
10109
10110 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
10111 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
10112
10113 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10114
10115 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10116 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
10117
10118 if (channels & 4) {
10119 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
10120 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
10121 output += 4;
10122 }
10123 uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
10124 if (channels & 2) {
10125 unaligned_store_u16(output, (uint16_t) vout0123);
10126 vout0123 >>= 16;
10127 output += 2;
10128 }
10129 if (channels & 1) {
10130 *output = (uint8_t) vout0123;
10131 }
10132 }
10133 }
10134 }
10135
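// Single-pass variant of the global average pooling kernel above, used when
// the entire pooling window spans at most 7 rows: pointers for rows beyond
// `rows` are redirected to the zero buffer so they contribute nothing.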
10136 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8(
10137 size_t rows,
10138 size_t channels,
10139 const uint8_t* input,
10140 size_t input_stride,
10141 const uint8_t* zero,
10142 uint8_t* output,
10143 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10144 {
10145 assert(rows != 0);
10146 assert(rows <= 7);
10147 assert(channels != 0);
10148
10149 const uint8_t* i0 = input;
10150 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
10151 if XNN_UNPREDICTABLE(rows < 2) {
10152 i1 = zero;
10153 }
10154 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
10155 if XNN_UNPREDICTABLE(rows <= 2) {
10156 i2 = zero;
10157 }
10158 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
10159 if XNN_UNPREDICTABLE(rows < 4) {
10160 i3 = zero;
10161 }
10162 const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
10163 if XNN_UNPREDICTABLE(rows <= 4) {
10164 i4 = zero;
10165 }
10166 const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
10167 if XNN_UNPREDICTABLE(rows < 6) {
10168 i5 = zero;
10169 }
10170 const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
10171 if XNN_UNPREDICTABLE(rows <= 6) {
10172 i6 = zero;
10173 }
10174
10175 const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
10176 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10177 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10178 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10179 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
10180 const __m128i vzero = _mm_setzero_si128();
10181 for (; channels >= 8; channels -= 8) {
10182
10183 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10184 i0 += 8;
10185
10186 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
10187 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10188 i1 += 8;
10189
10190 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
10191 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10192 i2 += 8;
10193
10194 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
10195 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
10196 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10197 i3 += 8;
10198
10199 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
10200 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
10201 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10202 i4 += 8;
10203
10204 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
10205 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
10206 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10207 i5 += 8;
10208
10209 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
10210 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
10211 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10212 i6 += 8;
10213
10214 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
10215 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
10216
10217 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
10218
10219 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
10220 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
10221
10222 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
10223 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
10224
10225 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
10226 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
10227
10228 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
10229 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
10230
10231 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
10232 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
10233
10234 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
10235 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
10236
10237 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10238
10239
10240 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10241
10242 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
10243
10244 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
10245 output += 8;
10246 }
10247 if XNN_UNLIKELY(channels != 0) {
10248 {
10249
10250 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10251 i0 += 8;
10252
10253 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10254 i1 += 8;
10255
10256 const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero);
10257 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10258 i2 += 8;
10259
10260 const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero);
10261 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10262 i3 += 8;
10263
10264 __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
10265 const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero);
10266 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10267 i4 += 8;
10268
10269 vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
10270 const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero);
10271 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10272 i5 += 8;
10273
10274 vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
10275 const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero);
10276 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10277 i6 += 8;
10278
10279 vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
10280 const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero);
10281
10282 vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
10283 const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero);
10284
10285 vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
10286
10287 __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero);
10288 __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero);
10289
10290 vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
10291 vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
10292
10293 __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
10294 __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
10295
10296 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
10297 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
10298
10299 vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
10300 vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
10301
10302 vacc0123 = _mm_cvtps_epi32(vfpacc0123);
10303 vacc4567 = _mm_cvtps_epi32(vfpacc4567);
10304
10305 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10306
10307 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10308 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
10309
10310 if (channels & 4) {
10311 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
10312 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
10313 output += 4;
10314 }
10315 uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
10316 if (channels & 2) {
10317 unaligned_store_u16(output, (uint16_t) vout0123);
10318 vout0123 >>= 16;
10319 output += 2;
10320 }
10321 if (channels & 1) {
10322 *output = (uint8_t) vout0123;
10323 }
10324 }
10325 }
10326 }
10327
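// QU8 GEMM micro-kernel producing a 1x4 output tile, with K consumed in
// blocks of 8 (kc is rounded up accordingly). Each _mm_madd_epi16 multiplies
// eight zero-extended input values by eight zero-point-adjusted kernel values
// and sums adjacent pairs, i.e. acc[n] += a0[k] * (b[k][n] - kernel_zero_point)
// in int32, leaving four partial sums per output column that are reduced
// after the loop.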
10328 void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
10329 size_t mr,
10330 size_t nc,
10331 size_t kc,
10332 const uint8_t* restrict a,
10333 size_t a_stride,
10334 const void* restrict w,
10335 uint8_t* restrict c,
10336 size_t cm_stride,
10337 size_t cn_stride,
10338 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10339 {
10340 assert(mr != 0);
10341 assert(mr <= 1);
10342 assert(nc != 0);
10343 assert(kc != 0);
10344 assert(kc % sizeof(uint8_t) == 0);
10345 assert(a != NULL);
10346 assert(w != NULL);
10347 assert(c != NULL);
10348
10349 kc = round_up_po2(kc, 8);
10350 const uint8_t* a0 = a;
10351 uint8_t* c0 = c;
10352
10353 do {
10354 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
10355 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
10356 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
10357 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
10358 w = (const int32_t*) w + 4;
10359
10360 size_t k = 0;
10361 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
10362 const __m128i vzero = _mm_setzero_si128();
10363 while (k < kc) {
10364 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
10365 const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
10366 a0 += 8;
10367
10368 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
10369 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
10370
10371 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
10372 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
10373 const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
10374
10375 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
10376 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
10377 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
10378
10379 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
10380 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
10381 const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
10382
10383 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
10384
10385 w = (const void*) ((const uint8_t*) w + 32);
10386 k += 8 * sizeof(uint8_t);
10387 }
10388
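    // Transpose-and-add reduction: interleaving the four per-column
    // accumulators and adding collapses each one to its horizontal sum, so
    // vacc0x0123 ends up holding the totals for columns 0..3 in order.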
10389 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
10390 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
10391
10392 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
10393
10394 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
10395
10396 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10397 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
10398
10399 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10400 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
10401
10402 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
10403
10404 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10405 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
10406
10407 __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
10408
10409 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10410
10411 if (nc >= 4) {
10412 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
10413
10414 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
10415
10416 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
10417
10418 nc -= 4;
10419 } else {
10420 if (nc & 2) {
10421 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
10422 c0 += 2;
10423 vout = _mm_srli_epi32(vout, 16);
10424 }
10425 if (nc & 1) {
10426 *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
10427 }
10428
10429 nc = 0;
10430 }
10431 } while (nc != 0);
10432 }
10433
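// 3-row variant of the 1x4c8 GEMM above: the same 4-column, 8-deep inner loop
// is replicated for three rows of A, with rows beyond `mr` aliased to the
// previous row so that all loads and stores stay within valid memory.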
10434 void xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
10435 size_t mr,
10436 size_t nc,
10437 size_t kc,
10438 const uint8_t* restrict a,
10439 size_t a_stride,
10440 const void* restrict w,
10441 uint8_t* restrict c,
10442 size_t cm_stride,
10443 size_t cn_stride,
10444 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10445 {
10446 assert(mr != 0);
10447 assert(mr <= 3);
10448 assert(nc != 0);
10449 assert(kc != 0);
10450 assert(kc % sizeof(uint8_t) == 0);
10451 assert(a != NULL);
10452 assert(w != NULL);
10453 assert(c != NULL);
10454
10455 kc = round_up_po2(kc, 8);
10456 const uint8_t* a0 = a;
10457 uint8_t* c0 = c;
10458 const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
10459 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
10460 if XNN_UNPREDICTABLE(mr < 2) {
10461 a1 = a0;
10462 c1 = c0;
10463 }
10464 const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
10465 uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
10466 if XNN_UNPREDICTABLE(mr <= 2) {
10467 a2 = a1;
10468 c2 = c1;
10469 }
10470
10471 do {
10472 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
10473 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
10474 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
10475 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
10476 __m128i vacc1x0 = vacc0x0;
10477 __m128i vacc1x1 = vacc0x1;
10478 __m128i vacc1x2 = vacc0x2;
10479 __m128i vacc1x3 = vacc0x3;
10480 __m128i vacc2x0 = vacc0x0;
10481 __m128i vacc2x1 = vacc0x1;
10482 __m128i vacc2x2 = vacc0x2;
10483 __m128i vacc2x3 = vacc0x3;
10484 w = (const int32_t*) w + 4;
10485
10486 size_t k = 0;
10487 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
10488 const __m128i vzero = _mm_setzero_si128();
10489 while (k < kc) {
10490 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
10491 const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
10492 a0 += 8;
10493 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
10494 const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
10495 a1 += 8;
10496 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
10497 const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
10498 a2 += 8;
10499
10500 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
10501 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
10502
10503 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
10504 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
10505 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
10506 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
10507 const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
10508
10509 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
10510 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
10511 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
10512 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
10513 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
10514
10515 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
10516 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
10517 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
10518 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
10519 const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
10520
10521 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
10522 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
10523 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
10524
10525 w = (const void*) ((const uint8_t*) w + 32);
10526 k += 8 * sizeof(uint8_t);
10527 }
10528
10529 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
10530 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
10531 const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
10532 const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
10533 const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
10534 const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
10535
10536 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
10537 __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
10538 __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
10539
10540 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
10541 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
10542 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
10543
10544 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10545 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
10546 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
10547 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
10548
10549 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10550 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
10551 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
10552 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
10553
10554 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
10555 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
10556 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
10557
10558 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10559 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
10560 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
10561
10562 __m128i vout = _mm_packus_epi16(vacc01x0123, vacc22x0123);
10563
10564 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10565
10566 if (nc >= 4) {
10567 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
10568 vout = _mm_srli_si128(vout, 4);
10569 unaligned_store_u32(c1, (uint32_t) _mm_cvtsi128_si32(vout));
10570 vout = _mm_srli_si128(vout, 4);
10571 unaligned_store_u32(c2, (uint32_t) _mm_cvtsi128_si32(vout));
10572
10573 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
10574 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
10575 c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
10576
10577 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
10578 a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
10579 a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
10580
10581 nc -= 4;
10582 } else {
10583 if (nc & 2) {
10584 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
10585 c0 += 2;
10586 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
10587 c1 += 2;
10588 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
10589 c2 += 2;
10590 vout = _mm_srli_epi32(vout, 16);
10591 }
10592 if (nc & 1) {
10593 *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
10594 *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
10595 *c2 = (uint8_t) _mm_extract_epi16(vout, 4);
10596 }
10597
10598 nc = 0;
10599 }
10600 } while (nc != 0);
10601 }
10602
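// Indirect GEMM (IGEMM) counterpart of the 1x4c8 GEMM: instead of striding
// through a dense A matrix, the kernel walks `ks` pointers from the
// indirection buffer `a`, adds `a_offset` to every pointer that is not the
// shared zero buffer, and accumulates across all of them before requantizing.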
10603 void xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64(
10604 size_t mr,
10605 size_t nc,
10606 size_t kc,
10607 size_t ks,
10608 const uint8_t** restrict a,
10609 const void* restrict w,
10610 uint8_t* restrict c,
10611 size_t cm_stride,
10612 size_t cn_stride,
10613 size_t a_offset,
10614 const uint8_t* zero,
10615 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10616 {
10617 assert(mr != 0);
10618 assert(mr <= 1);
10619 assert(nc != 0);
10620 assert(kc != 0);
10621 assert(ks != 0);
10622 assert(ks % (1 * sizeof(void*)) == 0);
10623 assert(a_offset % sizeof(uint8_t) == 0);
10624 assert(a != NULL);
10625 assert(w != NULL);
10626 assert(c != NULL);
10627
10628 kc = round_up_po2(kc, 8);
10629 uint8_t* c0 = c;
10630
10631 do {
10632 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
10633 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
10634 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
10635 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
10636 w = (const int32_t*) w + 4;
10637
10638 size_t p = ks;
10639 do {
10640 const uint8_t* restrict a0 = a[0];
10641 if XNN_UNPREDICTABLE(a0 != zero) {
10642 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
10643 }
10644 a += 1;
10645
10646 size_t k = 0;
10647 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
10648 const __m128i vzero = _mm_setzero_si128();
10649 while (k < kc) {
10650 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
10651 const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
10652 a0 += 8;
10653
10654 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
10655 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
10656
10657 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
10658 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
10659 const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
10660
10661 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
10662 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
10663 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
10664
10665 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
10666 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
10667 const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
10668
10669 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
10670
10671 w = (const void*) ((const uint8_t*) w + 32);
10672 k += 8 * sizeof(uint8_t);
10673 }
10674 p -= 1 * sizeof(void*);
10675 } while (p != 0);
10676
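    // Pairwise-add the four per-output-channel accumulators (each a 4-lane partial sum)
    // down to a single vector holding the 4 int32 dot products for this row.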
10677 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
10678 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
10679
10680 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
10681
10682 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
10683
10684 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10685 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
10686
10687 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10688 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
10689
10690 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
10691
10692 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10693 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
10694
10695 __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
10696
10697 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10698
10699 if (nc >= 4) {
10700 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
10701 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
10702
10703 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
10704
10705 nc -= 4;
10706 } else {
10707 if (nc & 2) {
10708 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
10709 c0 += 2;
10710 vout = _mm_srli_epi32(vout, 16);
10711 }
10712 if (nc & 1) {
10713 *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
10714 }
10715
10716 nc = 0;
10717 }
10718 } while (nc != 0);
10719 }
10720
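// Same QU8 indirect GEMM as above, but processing up to 3 rows (mr <= 3) per call; rows
// beyond `mr` alias the previous row's pointers so a single code path handles all tiles.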
10721 void xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64(
10722 size_t mr,
10723 size_t nc,
10724 size_t kc,
10725 size_t ks,
10726 const uint8_t** restrict a,
10727 const void* restrict w,
10728 uint8_t* restrict c,
10729 size_t cm_stride,
10730 size_t cn_stride,
10731 size_t a_offset,
10732 const uint8_t* zero,
10733 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10734 {
10735 assert(mr != 0);
10736 assert(mr <= 3);
10737 assert(nc != 0);
10738 assert(kc != 0);
10739 assert(ks != 0);
10740 assert(ks % (3 * sizeof(void*)) == 0);
10741 assert(a_offset % sizeof(uint8_t) == 0);
10742 assert(a != NULL);
10743 assert(w != NULL);
10744 assert(c != NULL);
10745
10746 kc = round_up_po2(kc, 8);
10747 uint8_t* c0 = c;
10748 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
10749 if XNN_UNPREDICTABLE(mr < 2) {
10750 c1 = c0;
10751 }
10752 uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
10753 if XNN_UNPREDICTABLE(mr <= 2) {
10754 c2 = c1;
10755 }
10756
10757 do {
10758 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
10759 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
10760 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
10761 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
10762 __m128i vacc1x0 = vacc0x0;
10763 __m128i vacc1x1 = vacc0x1;
10764 __m128i vacc1x2 = vacc0x2;
10765 __m128i vacc1x3 = vacc0x3;
10766 __m128i vacc2x0 = vacc0x0;
10767 __m128i vacc2x1 = vacc0x1;
10768 __m128i vacc2x2 = vacc0x2;
10769 __m128i vacc2x3 = vacc0x3;
10770 w = (const int32_t*) w + 4;
10771
10772 size_t p = ks;
10773 do {
10774 const uint8_t* restrict a0 = a[0];
10775 if XNN_UNPREDICTABLE(a0 != zero) {
10776 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
10777 }
10778 const uint8_t* restrict a1 = a[1];
10779 if XNN_UNPREDICTABLE(a1 != zero) {
10780 a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
10781 }
10782 const uint8_t* restrict a2 = a[2];
10783 if XNN_UNPREDICTABLE(a2 != zero) {
10784 a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
10785 }
10786 a += 3;
10787
10788 size_t k = 0;
10789 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
10790 const __m128i vzero = _mm_setzero_si128();
10791 while (k < kc) {
10792 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
10793 const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
10794 a0 += 8;
10795 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
10796 const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
10797 a1 += 8;
10798 const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
10799 const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
10800 a2 += 8;
10801
10802 const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
10803 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
10804
10805 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
10806 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
10807 vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
10808 const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
10809 const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
10810
10811 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
10812 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
10813 vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
10814 const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
10815 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
10816
10817 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
10818 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
10819 vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
10820 const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
10821 const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
10822
10823 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
10824 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
10825 vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
10826
10827 w = (const void*) ((const uint8_t*) w + 32);
10828 k += 8 * sizeof(uint8_t);
10829 }
10830 p -= 3 * sizeof(void*);
10831 } while (p != 0);
10832
10833 const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
10834 const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
10835 const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
10836 const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
10837 const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
10838 const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
10839
10840 __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
10841 __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
10842 __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
10843
10844 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
10845 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
10846 __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
10847
10848 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10849 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
10850 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
10851 vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
10852
10853 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10854 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
10855 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
10856 vscaled2x0123 = _mm_min_ps(vscaled2x0123, voutput_max_less_zero_point);
10857
10858 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
10859 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
10860 vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
10861
10862 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10863 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
10864 __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
10865
10866 __m128i vout = _mm_packus_epi16(vacc01x0123, vacc22x0123);
10867
10868 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10869
10870 if (nc >= 4) {
10871 unaligned_store_u32(c2, (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2))));
10872 c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
10873 unaligned_store_u32(c1, (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1))));
10874 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
10875 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
10876 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
10877
10878 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
10879
10880 nc -= 4;
10881 } else {
10882 if (nc & 2) {
10883 unaligned_store_u16(c2, (uint16_t) _mm_extract_epi16(vout, 4));
10884 c2 += 2;
10885 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
10886 c1 += 2;
10887 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
10888 c0 += 2;
10889 vout = _mm_srli_epi32(vout, 16);
10890 }
10891 if (nc & 1) {
10892 *c2 = (uint8_t) _mm_extract_epi16(vout, 4);
10893 *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
10894 *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
10895 }
10896
10897 nc = 0;
10898 }
10899 } while (nc != 0);
10900 }
10901
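// QU8 element-wise addition with fixed-point requantization: each operand is widened to
// 16 bits and multiplied by a 32-bit multiplier split into lo/hi 16-bit halves (mullo/mulhi
// plus a correction term), summed with the precomputed bias, arithmetically shifted,
// offset by the output zero point, and saturated/clamped to uint8.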
10902 void xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8(
10903 size_t n,
10904 const uint8_t* input_a,
10905 const uint8_t* input_b,
10906 uint8_t* output,
10907 const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10908 {
10909 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
10910 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
10911 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
10912 const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
10913 const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
10914 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
10915 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
10916 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
10917 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
10918
10919 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
10920 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
10921 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
10922 input_a += 8;
10923 input_b += 8;
10924
10925 const __m128i vzero = _mm_setzero_si128();
10926 va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
10927 vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
10928
10929 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
10930 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
10931 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
10932 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
10933
10934 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
10935 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
10936
10937
10938 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
10939 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
10940
10941 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
10942 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
10943
10944 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
10945 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
10946
10947 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10948
10949
10950 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10951
10952 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
10953
10954 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
10955
10956 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
10957 output += 8;
10958 }
10959 if XNN_UNLIKELY(n != 0) {
10960 {
10961 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
10962 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
10963
10964 const __m128i vzero = _mm_setzero_si128();
10965 va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
10966 vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
10967
10968 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
10969 __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
10970 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
10971 const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
10972
10973 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
10974 vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
10975
10976
10977 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
10978 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
10979
10980 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
10981 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
10982
10983 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
10984 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
10985
10986 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10987
10988 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10989 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
10990 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
10991
10992 if (n & (4 * sizeof(uint8_t))) {
10993 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
10994 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
10995 output += 4;
10996 }
10997 if (n & (2 * sizeof(uint8_t))) {
10998 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
10999 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11000 output += 2;
11001 }
11002 if (n & (1 * sizeof(uint8_t))) {
11003 *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
11004 }
11005 }
11006 }
11007 }
11008
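// Variant of the QU8 addition kernel for a constant second operand: *input_b is multiplied
// by b_multiplier once and folded into the bias, so the inner loop only scales input_a.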
11009 void xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8(
11010 size_t n,
11011 const uint8_t* input_a,
11012 const uint8_t* input_b,
11013 uint8_t* output,
11014 const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11015 {
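  // A minimal scalar sketch of the per-element math that the vector code below implements:
  //   acc = asr(bias + (int32_t) a * a_multiplier, shift);
  //   y   = clamp(acc + output_zero_point, output_min, output_max);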
11016 const __m128i vbias = _mm_add_epi32(
11017 _mm_shuffle_epi32(_mm_cvtsi32_si128(params->sse2.b_multiplier * (int32_t) *input_b), _MM_SHUFFLE(0, 0, 0, 0)),
11018 _mm_load_si128((const __m128i*) params->sse2.bias));
11019 const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
11020 const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
11021 const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
11022 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
11023 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
11024 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
11025
11026 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11027 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
11028 input_a += 8;
11029
11030 const __m128i vzero = _mm_setzero_si128();
11031 va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
11032
11033 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
11034 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
11035
11036 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
11037
11038
11039 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
11040 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
11041
11042 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11043 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11044
11045 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11046
11047
11048 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11049
11050 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11051
11052 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11053
11054 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
11055 output += 8;
11056 }
11057 if XNN_UNLIKELY(n != 0) {
11058 {
11059 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
11060
11061 va01234567 = _mm_unpacklo_epi8(va01234567, _mm_setzero_si128());
11062
11063 __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
11064 const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
11065
11066 vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
11067
11068
11069 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
11070 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
11071
11072 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11073 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11074
11075 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11076
11077 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11078 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11079 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11080
11081 if (n & (4 * sizeof(uint8_t))) {
11082 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
11083 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
11084 output += 4;
11085 }
11086 if (n & (2 * sizeof(uint8_t))) {
11087 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
11088 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11089 output += 2;
11090 }
11091 if (n & (1 * sizeof(uint8_t))) {
11092 *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
11093 }
11094 }
11095 }
11096 }
11097
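// QU8 requantization (conversion between uint8 quantization parameters): inputs are widened
// to 16 bits, multiplied by the precomputed fixed-point multiplier, offset by a bias that
// folds in the zero points and rounding (per params), shifted right by 8, and packed back
// to uint8.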
11098 void xnn_qu8_vcvt_ukernel__sse2_x32(
11099 size_t n,
11100 const uint8_t* x,
11101 uint8_t* y,
11102 const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11103 {
11104 assert(n != 0);
11105 assert(n % sizeof(uint8_t) == 0);
11106 assert(x != NULL);
11107 assert(y != NULL);
11108
11109 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
11110 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
11111 const __m128i vzero = _mm_setzero_si128();
11112 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
11113 const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
11114 const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
11115 x += 32;
11116
11117 const __m128i vextx0 = _mm_unpacklo_epi8(vx0, vzero);
11118 const __m128i vextx1 = _mm_unpackhi_epi8(vx0, vzero);
11119 const __m128i vextx2 = _mm_unpacklo_epi8(vx1, vzero);
11120 const __m128i vextx3 = _mm_unpackhi_epi8(vx1, vzero);
11121
11122 const __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier);
11123 const __m128i vprodhi0 = _mm_mulhi_epu16(vextx0, vmultiplier);
11124 const __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier);
11125 const __m128i vprodhi1 = _mm_mulhi_epu16(vextx1, vmultiplier);
11126 const __m128i vprodlo2 = _mm_mullo_epi16(vextx2, vmultiplier);
11127 const __m128i vprodhi2 = _mm_mulhi_epu16(vextx2, vmultiplier);
11128 const __m128i vprodlo3 = _mm_mullo_epi16(vextx3, vmultiplier);
11129 const __m128i vprodhi3 = _mm_mulhi_epu16(vextx3, vmultiplier);
11130
11131 __m128i vacc0 = _mm_unpacklo_epi16(vprodlo0, vprodhi0);
11132 __m128i vacc1 = _mm_unpackhi_epi16(vprodlo0, vprodhi0);
11133 __m128i vacc2 = _mm_unpacklo_epi16(vprodlo1, vprodhi1);
11134 __m128i vacc3 = _mm_unpackhi_epi16(vprodlo1, vprodhi1);
11135 __m128i vacc4 = _mm_unpacklo_epi16(vprodlo2, vprodhi2);
11136 __m128i vacc5 = _mm_unpackhi_epi16(vprodlo2, vprodhi2);
11137 __m128i vacc6 = _mm_unpacklo_epi16(vprodlo3, vprodhi3);
11138 __m128i vacc7 = _mm_unpackhi_epi16(vprodlo3, vprodhi3);
11139
11140 vacc0 = _mm_add_epi32(vacc0, vbias);
11141 vacc1 = _mm_add_epi32(vacc1, vbias);
11142 vacc2 = _mm_add_epi32(vacc2, vbias);
11143 vacc3 = _mm_add_epi32(vacc3, vbias);
11144 vacc4 = _mm_add_epi32(vacc4, vbias);
11145 vacc5 = _mm_add_epi32(vacc5, vbias);
11146 vacc6 = _mm_add_epi32(vacc6, vbias);
11147 vacc7 = _mm_add_epi32(vacc7, vbias);
11148
11149 vacc0 = _mm_srai_epi32(vacc0, 8);
11150 vacc1 = _mm_srai_epi32(vacc1, 8);
11151 vacc2 = _mm_srai_epi32(vacc2, 8);
11152 vacc3 = _mm_srai_epi32(vacc3, 8);
11153 vacc4 = _mm_srai_epi32(vacc4, 8);
11154 vacc5 = _mm_srai_epi32(vacc5, 8);
11155 vacc6 = _mm_srai_epi32(vacc6, 8);
11156 vacc7 = _mm_srai_epi32(vacc7, 8);
11157
11158 vacc0 = _mm_packs_epi32(vacc0, vacc1);
11159 vacc1 = _mm_packs_epi32(vacc2, vacc3);
11160 vacc2 = _mm_packs_epi32(vacc4, vacc5);
11161 vacc3 = _mm_packs_epi32(vacc6, vacc7);
11162
11163 const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
11164 const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
11165
11166 _mm_storeu_si128((__m128i*) y, vy0);
11167 _mm_storeu_si128((__m128i*) (y + 16), vy1);
11168 y += 32;
11169 }
11170 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
11171 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
11172 x += 16;
11173
11174 const __m128i vextx_lo = _mm_unpacklo_epi8(vx, vzero);
11175 const __m128i vextx_hi = _mm_unpackhi_epi8(vx, vzero);
11176
11177 const __m128i vprodlo_lo = _mm_mullo_epi16(vextx_lo, vmultiplier);
11178 const __m128i vprodlo_hi = _mm_mullo_epi16(vextx_hi, vmultiplier);
11179 const __m128i vprodhi_lo = _mm_mulhi_epu16(vextx_lo, vmultiplier);
11180 const __m128i vprodhi_hi = _mm_mulhi_epu16(vextx_hi, vmultiplier);
11181
11182 __m128i vacc_ll = _mm_unpacklo_epi16(vprodlo_lo, vprodhi_lo);
11183 __m128i vacc_lh = _mm_unpackhi_epi16(vprodlo_lo, vprodhi_lo);
11184 __m128i vacc_hl = _mm_unpacklo_epi16(vprodlo_hi, vprodhi_hi);
11185 __m128i vacc_hh = _mm_unpackhi_epi16(vprodlo_hi, vprodhi_hi);
11186
11187 vacc_ll = _mm_add_epi32(vacc_ll, vbias);
11188 vacc_lh = _mm_add_epi32(vacc_lh, vbias);
11189 vacc_hl = _mm_add_epi32(vacc_hl, vbias);
11190 vacc_hh = _mm_add_epi32(vacc_hh, vbias);
11191
11192 vacc_ll = _mm_srai_epi32(vacc_ll, 8);
11193 vacc_lh = _mm_srai_epi32(vacc_lh, 8);
11194 vacc_hl = _mm_srai_epi32(vacc_hl, 8);
11195 vacc_hh = _mm_srai_epi32(vacc_hh, 8);
11196
11197 const __m128i vacc_lo = _mm_packs_epi32(vacc_ll, vacc_lh);
11198 const __m128i vacc_hi = _mm_packs_epi32(vacc_hl, vacc_hh);
11199
11200 const __m128i vy = _mm_packus_epi16(vacc_lo, vacc_hi);
11201 _mm_storeu_si128((__m128i*) y, vy);
11202 y += 16;
11203 }
11204 if XNN_UNLIKELY(n != 0) {
11205 assert(n >= 1 * sizeof(uint8_t));
11206 assert(n <= 15 * sizeof(uint8_t));
11207
11208 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
11209
11210 const __m128i vextx_lo = _mm_unpacklo_epi8(vx, vzero);
11211 const __m128i vextx_hi = _mm_unpackhi_epi8(vx, vzero);
11212
11213 const __m128i vprodlo_lo = _mm_mullo_epi16(vextx_lo, vmultiplier);
11214 const __m128i vprodlo_hi = _mm_mullo_epi16(vextx_hi, vmultiplier);
11215 const __m128i vprodhi_lo = _mm_mulhi_epu16(vextx_lo, vmultiplier);
11216 const __m128i vprodhi_hi = _mm_mulhi_epu16(vextx_hi, vmultiplier);
11217
11218 __m128i vacc_ll = _mm_unpacklo_epi16(vprodlo_lo, vprodhi_lo);
11219 __m128i vacc_lh = _mm_unpackhi_epi16(vprodlo_lo, vprodhi_lo);
11220 __m128i vacc_hl = _mm_unpacklo_epi16(vprodlo_hi, vprodhi_hi);
11221 __m128i vacc_hh = _mm_unpackhi_epi16(vprodlo_hi, vprodhi_hi);
11222
11223 vacc_ll = _mm_add_epi32(vacc_ll, vbias);
11224 vacc_lh = _mm_add_epi32(vacc_lh, vbias);
11225 vacc_hl = _mm_add_epi32(vacc_hl, vbias);
11226 vacc_hh = _mm_add_epi32(vacc_hh, vbias);
11227
11228 vacc_ll = _mm_srai_epi32(vacc_ll, 8);
11229 vacc_lh = _mm_srai_epi32(vacc_lh, 8);
11230 vacc_hl = _mm_srai_epi32(vacc_hl, 8);
11231 vacc_hh = _mm_srai_epi32(vacc_hh, 8);
11232
11233 const __m128i vacc_lo = _mm_packs_epi32(vacc_ll, vacc_lh);
11234 const __m128i vacc_hi = _mm_packs_epi32(vacc_hl, vacc_hh);
11235
11236 __m128i vy = _mm_packus_epi16(vacc_lo, vacc_hi);
11237 if (n & (8 * sizeof(uint8_t))) {
11238 _mm_storel_epi64((__m128i*) y, vy);
11239 vy = _mm_unpackhi_epi64(vy, vy);
11240 y += 8;
11241 }
11242 if (n & (4 * sizeof(uint8_t))) {
11243 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
11244 vy = _mm_srli_epi64(vy, 32);
11245 y += 4;
11246 }
11247 uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
11248 if (n & (2 * sizeof(uint8_t))) {
11249 unaligned_store_u16(y, (uint16_t) vy_lo);
11250 vy_lo >>= 16;
11251 y += 2;
11252 }
11253 if (n & (1 * sizeof(uint8_t))) {
11254 *y = (uint8_t) vy_lo;
11255 }
11256 }
11257 }
11258
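// QU8 leaky ReLU: each element is compared against the input zero point to select one of
// two fixed-point slopes (multiplier_base, optionally XOR-ed with multiplier_diff for
// inputs above the zero point), which is applied to (input_zero_point - x) with rounding
// before the output zero point is re-added.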
11259 void xnn_qu8_vlrelu_ukernel__sse2_x32(
11260 size_t n,
11261 const uint8_t* x,
11262 uint8_t* y,
11263 const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11264 {
11265 assert(n != 0);
11266 assert(n % sizeof(uint8_t) == 0);
11267 assert(x != NULL);
11268 assert(y != NULL);
11269
11270 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->sse2.input_zero_point);
11271 const __m128i vmultiplier_diff = _mm_load_si128((const __m128i*) params->sse2.multiplier_diff);
11272 const __m128i vmultiplier_base = _mm_load_si128((const __m128i*) params->sse2.multiplier_base);
11273 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
11274 const __m128i vzero = _mm_setzero_si128();
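  // Per element: pick the slope (base vs. base ^ diff) from the comparison against the
  // input zero point, multiply the zero-point-relative value by it in fixed point with
  // rounding, then re-add the output zero point and saturate to uint8.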
11275 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
11276 const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
11277 const __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
11278 x += 32;
11279
11280 __m128i vextx0 = _mm_unpacklo_epi8(vx0, vzero);
11281 __m128i vextx1 = _mm_unpackhi_epi8(vx0, vzero);
11282 __m128i vextx2 = _mm_unpacklo_epi8(vx1, vzero);
11283 __m128i vextx3 = _mm_unpackhi_epi8(vx1, vzero);
11284
11285 __m128i vmultiplier0 = _mm_cmpgt_epi16(vextx0, vinput_zero_point);
11286 vextx0 = _mm_sub_epi16(vinput_zero_point, vextx0);
11287 __m128i vmultiplier1 = _mm_cmpgt_epi16(vextx1, vinput_zero_point);
11288 vextx1 = _mm_sub_epi16(vinput_zero_point, vextx1);
11289 __m128i vmultiplier2 = _mm_cmpgt_epi16(vextx2, vinput_zero_point);
11290 vextx2 = _mm_sub_epi16(vinput_zero_point, vextx2);
11291 __m128i vmultiplier3 = _mm_cmpgt_epi16(vextx3, vinput_zero_point);
11292 vextx3 = _mm_sub_epi16(vinput_zero_point, vextx3);
11293
11294 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
11295 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
11296 vmultiplier2 = _mm_and_si128(vmultiplier2, vmultiplier_diff);
11297 vmultiplier3 = _mm_and_si128(vmultiplier3, vmultiplier_diff);
11298
11299 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
11300 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
11301 vmultiplier2 = _mm_xor_si128(vmultiplier2, vmultiplier_base);
11302 vmultiplier3 = _mm_xor_si128(vmultiplier3, vmultiplier_base);
11303
11304 __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier0);
11305 __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier1);
11306 __m128i vprodlo2 = _mm_mullo_epi16(vextx2, vmultiplier2);
11307 __m128i vprodlo3 = _mm_mullo_epi16(vextx3, vmultiplier3);
11308
11309 vprodlo0 = _mm_srli_epi16(vprodlo0, 7);
11310 __m128i vprodhi0 = _mm_mulhi_epi16(vextx0, vmultiplier0);
11311 vprodlo1 = _mm_srli_epi16(vprodlo1, 7);
11312 __m128i vprodhi1 = _mm_mulhi_epi16(vextx1, vmultiplier1);
11313 vprodlo2 = _mm_srli_epi16(vprodlo2, 7);
11314 __m128i vprodhi2 = _mm_mulhi_epi16(vextx2, vmultiplier2);
11315 vprodlo3 = _mm_srli_epi16(vprodlo3, 7);
11316 __m128i vprodhi3 = _mm_mulhi_epi16(vextx3, vmultiplier3);
11317
11318 vprodhi0 = _mm_slli_epi16(vprodhi0, 8);
11319 vprodlo0 = _mm_avg_epu16(vprodlo0, vzero);
11320 vprodhi1 = _mm_slli_epi16(vprodhi1, 8);
11321 vprodlo1 = _mm_avg_epu16(vprodlo1, vzero);
11322 vprodhi2 = _mm_slli_epi16(vprodhi2, 8);
11323 vprodlo2 = _mm_avg_epu16(vprodlo2, vzero);
11324 vprodhi3 = _mm_slli_epi16(vprodhi3, 8);
11325 vprodlo3 = _mm_avg_epu16(vprodlo3, vzero);
11326
11327 __m128i vacc0 = _mm_add_epi16(vprodlo0, vprodhi0);
11328 __m128i vacc1 = _mm_add_epi16(vprodlo1, vprodhi1);
11329 __m128i vacc2 = _mm_add_epi16(vprodlo2, vprodhi2);
11330 __m128i vacc3 = _mm_add_epi16(vprodlo3, vprodhi3);
11331
11332 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
11333 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
11334 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
11335 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
11336
11337 const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
11338 const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
11339
11340 _mm_storeu_si128((__m128i*) y, vy0);
11341 _mm_storeu_si128((__m128i*) (y + 16), vy1);
11342 y += 32;
11343 }
11344 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
11345 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
11346 x += 16;
11347
11348 __m128i vextx0 = _mm_unpacklo_epi8(vx, vzero);
11349 __m128i vextx1 = _mm_unpackhi_epi8(vx, vzero);
11350
11351 __m128i vmultiplier0 = _mm_cmpgt_epi16(vextx0, vinput_zero_point);
11352 __m128i vmultiplier1 = _mm_cmpgt_epi16(vextx1, vinput_zero_point);
11353 vextx0 = _mm_sub_epi16(vinput_zero_point, vextx0);
11354 vextx1 = _mm_sub_epi16(vinput_zero_point, vextx1);
11355
11356 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
11357 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
11358
11359 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
11360 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
11361
11362 __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier0);
11363 __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier1);
11364
11365 vprodlo0 = _mm_srli_epi16(vprodlo0, 7);
11366 vprodlo1 = _mm_srli_epi16(vprodlo1, 7);
11367 __m128i vprodhi0 = _mm_mulhi_epi16(vextx0, vmultiplier0);
11368 __m128i vprodhi1 = _mm_mulhi_epi16(vextx1, vmultiplier1);
11369
11370 vprodhi0 = _mm_slli_epi16(vprodhi0, 8);
11371 vprodhi1 = _mm_slli_epi16(vprodhi1, 8);
11372 vprodlo0 = _mm_avg_epu16(vprodlo0, vzero);
11373 vprodlo1 = _mm_avg_epu16(vprodlo1, vzero);
11374
11375 __m128i vacc0 = _mm_add_epi16(vprodlo0, vprodhi0);
11376 __m128i vacc1 = _mm_add_epi16(vprodlo1, vprodhi1);
11377
11378 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
11379 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
11380
11381 const __m128i vy = _mm_packus_epi16(vacc0, vacc1);
11382 _mm_storeu_si128((__m128i*) y, vy);
11383 y += 16;
11384 }
11385 if XNN_UNLIKELY(n != 0) {
11386 assert(n >= 1 * sizeof(uint8_t));
11387 assert(n <= 15 * sizeof(uint8_t));
11388
11389 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
11390
11391 __m128i vextx0 = _mm_unpacklo_epi8(vx, vzero);
11392 __m128i vextx1 = _mm_unpackhi_epi8(vx, vzero);
11393
11394 __m128i vmultiplier0 = _mm_cmpgt_epi16(vextx0, vinput_zero_point);
11395 __m128i vmultiplier1 = _mm_cmpgt_epi16(vextx1, vinput_zero_point);
11396 vextx0 = _mm_sub_epi16(vinput_zero_point, vextx0);
11397 vextx1 = _mm_sub_epi16(vinput_zero_point, vextx1);
11398
11399 vmultiplier0 = _mm_and_si128(vmultiplier0, vmultiplier_diff);
11400 vmultiplier1 = _mm_and_si128(vmultiplier1, vmultiplier_diff);
11401
11402 vmultiplier0 = _mm_xor_si128(vmultiplier0, vmultiplier_base);
11403 vmultiplier1 = _mm_xor_si128(vmultiplier1, vmultiplier_base);
11404
11405 __m128i vprodlo0 = _mm_mullo_epi16(vextx0, vmultiplier0);
11406 __m128i vprodlo1 = _mm_mullo_epi16(vextx1, vmultiplier1);
11407
11408 vprodlo0 = _mm_srli_epi16(vprodlo0, 7);
11409 vprodlo1 = _mm_srli_epi16(vprodlo1, 7);
11410 __m128i vprodhi0 = _mm_mulhi_epi16(vextx0, vmultiplier0);
11411 __m128i vprodhi1 = _mm_mulhi_epi16(vextx1, vmultiplier1);
11412
11413 vprodhi0 = _mm_slli_epi16(vprodhi0, 8);
11414 vprodhi1 = _mm_slli_epi16(vprodhi1, 8);
11415 vprodlo0 = _mm_avg_epu16(vprodlo0, vzero);
11416 vprodlo1 = _mm_avg_epu16(vprodlo1, vzero);
11417
11418 __m128i vacc0 = _mm_add_epi16(vprodlo0, vprodhi0);
11419 __m128i vacc1 = _mm_add_epi16(vprodlo1, vprodhi1);
11420
11421 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
11422 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
11423
11424 __m128i vy = _mm_packus_epi16(vacc0, vacc1);
11425 if (n & (8 * sizeof(uint8_t))) {
11426 _mm_storel_epi64((__m128i*) y, vy);
11427 vy = _mm_unpackhi_epi64(vy, vy);
11428 y += 8;
11429 }
11430 if (n & (4 * sizeof(uint8_t))) {
11431 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vy));
11432 vy = _mm_srli_epi64(vy, 32);
11433 y += 4;
11434 }
11435 uint32_t vy0 = (uint32_t) _mm_cvtsi128_si32(vy);
11436 if (n & (2 * sizeof(uint8_t))) {
11437 unaligned_store_u16(y, (uint16_t) vy0);
11438 vy0 >>= 16;
11439 y += 2;
11440 }
11441 if (n & (1 * sizeof(uint8_t))) {
11442 *y = (uint8_t) vy0;
11443 }
11444 }
11445 }
11446
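// QU8 element-wise multiplication with fp32 requantization: zero points are subtracted,
// the 16-bit products are widened to 32 bits via mullo/mulhi interleaving, scaled in float,
// rounded back to int32, offset by the output zero point, and saturated/clamped to uint8.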
11447 void xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8(
11448 size_t n,
11449 const uint8_t* input_a,
11450 const uint8_t* input_b,
11451 uint8_t* output,
11452 const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11453
11454 {
11455 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
11456 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point);
11457 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11458 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11459 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
11460 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
11461
11462 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11463 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
11464 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
11465 input_a += 8;
11466 input_b += 8;
11467
11468 const __m128i vzero = _mm_setzero_si128();
11469 va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
11470 vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
11471
11472 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11473 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
11474
11475 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
11476 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
11477
11478 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11479 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11480
11481 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11482 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11483
11484 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11485 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11486
11487 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11488 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11489
11490 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11491
11492
11493 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11494
11495 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11496
11497 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11498
11499 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
11500 output += 8;
11501 }
11502 if XNN_UNLIKELY(n != 0) {
11503 {
11504 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
11505 __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
11506
11507 const __m128i vzero = _mm_setzero_si128();
11508 va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
11509 vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
11510
11511 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11512 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
11513
11514 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
11515 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
11516
11517 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11518 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11519
11520 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11521 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11522
11523 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11524 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11525
11526 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11527 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11528
11529 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11530
11531 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11532 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11533 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11534
11535 if (n & (4 * sizeof(uint8_t))) {
11536 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
11537 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
11538 output += 4;
11539 }
11540 if (n & (2 * sizeof(uint8_t))) {
11541 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
11542 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11543 output += 2;
11544 }
11545 if (n & (1 * sizeof(uint8_t))) {
11546 *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
11547 }
11548 }
11549 }
11550 }
11551
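// Variant of the QU8 multiplication kernel for a constant second operand: *input_b is
// broadcast and zero-point-adjusted once (vxb) before the loop.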
11552 void xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8(
11553 size_t n,
11554 const uint8_t* input_a,
11555 const uint8_t* input_b,
11556 uint8_t* output,
11557 const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11558
11559 {
11560 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
11561 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11562 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11563 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
11564 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
11565
11566 __m128i vxb = _mm_sub_epi16(
11567 _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
11568 _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point));
11569 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11570 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
11571 input_a += 8;
11572
11573 const __m128i vzero = _mm_setzero_si128();
11574 va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
11575
11576 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11577
11578 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
11579 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
11580
11581 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11582 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11583
11584 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11585 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11586
11587 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11588 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11589
11590 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11591 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11592
11593 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11594
11595
11596 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11597
11598 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11599
11600 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11601
11602 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
11603 output += 8;
11604 }
11605 if XNN_UNLIKELY(n != 0) {
11606 {
11607 __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
11608
11609 const __m128i vzero = _mm_setzero_si128();
11610 va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
11611
11612 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11613
11614 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
11615 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
11616
11617 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11618 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11619
11620 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11621 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11622
11623 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11624 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11625
11626 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11627 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11628
11629 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11630
11631 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11632 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11633 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11634
11635 if (n & (4 * sizeof(uint8_t))) {
11636 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
11637 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
11638 output += 4;
11639 }
11640 if (n & (2 * sizeof(uint8_t))) {
11641 unaligned_store_u16(output, (uint16_t) _mm_cvtsi128_si32(vout0123456701234567));
11642 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11643 output += 2;
11644 }
11645 if (n & (1 * sizeof(uint8_t))) {
11646 *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
11647 }
11648 }
11649 }
11650 }
11651
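// S8 bilinear interpolation, 8 channels per iteration: the four corner rows (top-left/right,
// bottom-left/right) are sign-extended to 16 bits, blended horizontally with _mm_madd_epi16
// using paired horizontal weights, then blended vertically by scaling the bottom-minus-top
// difference with the vertical weight, with fixed-point rounding before the final pack.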
11652 void xnn_s8_ibilinear_ukernel__sse2_c8(
11653 size_t output_pixels,
11654 size_t channels,
11655 const int8_t**restrict input,
11656 size_t input_offset,
11657 const int16_t*restrict weights,
11658 int8_t*restrict output,
11659 size_t output_increment) XNN_OOB_READS
11660 {
11661 assert(output_pixels != 0);
11662 assert(channels != 0);
11663
11664 do {
11665 const int8_t* i0 = (const int8_t*) ((uintptr_t) input[0] + input_offset);
11666 const int8_t* i1 = (const int8_t*) ((uintptr_t) input[1] + input_offset);
11667 const int8_t* i2 = (const int8_t*) ((uintptr_t) input[2] + input_offset);
11668 const int8_t* i3 = (const int8_t*) ((uintptr_t) input[3] + input_offset);
11669 input += 4;
11670
11671 const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights));
11672 weights += 2;
11673 __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0));
11674 valphah = _mm_unpacklo_epi64(valphah, valphah);
11675 __m128i valphav = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(1, 1, 1, 1));
11676 valphav = _mm_unpacklo_epi64(valphav, valphav);
11677
11678 valphah = _mm_xor_si128(valphah, _mm_set1_epi32(0xFFFF0000));
11679 valphah = _mm_add_epi16(valphah, _mm_set1_epi32(0x08010000));
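    // After the XOR/add fix-up, each 32-bit lane of valphah holds the pair
    // (alpha_h, 2048 - alpha_h) -- the weights appear to be 11-bit fixed point -- so the
    // _mm_madd_epi16 calls below blend the right/left samples in a single instruction.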
11680
11681 const __m128i vrounding = _mm_set1_epi32(0x00200000);
11682
11683 size_t c = channels;
11684 for (; c >= 8 * sizeof(int8_t); c -= 8 * sizeof(int8_t)) {
11685 __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0);
11686 i0 += 8;
11687 __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1);
11688 i1 += 8;
11689 __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2);
11690 i2 += 8;
11691 __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3);
11692 i3 += 8;
11693
11694 vtl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtl01234567, vtl01234567), 8);
11695 vtr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtr01234567, vtr01234567), 8);
11696 vbl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbl01234567, vbl01234567), 8);
11697 vbr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbr01234567, vbr01234567), 8);
11698
11699 const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
11700 const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
11701 const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
11702 const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
11703
11704 const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
11705 const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
11706
11707 __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16);
11708 __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16);
11709
11710 vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123);
11711 vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567);
11712
11713 vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
11714 vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
11715
11716 vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
11717 vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
11718
11719 const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
11720
11721 const __m128i vo01234567 = _mm_packs_epi16(vacc01234567, vacc01234567);
11722
11723 _mm_storel_epi64((__m128i*) output, vo01234567);
11724 output += 8;
11725 }
11726 if XNN_UNLIKELY(c != 0) {
11727 __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0);
11728 __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1);
11729 __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2);
11730 __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3);
11731
11732 vtl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtl01234567, vtl01234567), 8);
11733 vtr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vtr01234567, vtr01234567), 8);
11734 vbl01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbl01234567, vbl01234567), 8);
11735 vbr01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vbr01234567, vbr01234567), 8);
11736
11737 const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
11738 const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
11739 const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
11740 const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
11741
11742 const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
11743 const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
11744
11745 __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16);
11746 __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16);
11747
11748 vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123);
11749 vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567);
11750
11751 vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
11752 vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
11753
11754 vacc0123 = _mm_srai_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
11755 vacc4567 = _mm_srai_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
11756
11757 const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
11758
11759 __m128i vo01234567 = _mm_packs_epi16(vacc01234567, vacc01234567);
11760
11761 if (c & (4 * sizeof(int8_t))) {
11762 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567));
11763 output += 4;
11764 vo01234567 = _mm_srli_epi64(vo01234567, 32);
11765 }
11766 uint32_t vo0123 = (uint32_t) _mm_cvtsi128_si32(vo01234567);
11767 if (c & (2 * sizeof(int8_t))) {
11768 unaligned_store_u16(output, (uint16_t) vo0123);
11769 output += 2;
11770 vo0123 >>= 16;
11771 }
11772 if (c & (1 * sizeof(int8_t))) {
11773 *output++ = (uint8_t) vo0123;
11774 }
11775 }
11776
11777 output = (int8_t*) ((uintptr_t) output + output_increment);
11778 } while (--output_pixels != 0);
11779 }
11780
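// S8 max pooling (first pass covers up to 9 pooling elements, each additional pass covers 8),
// 16 channels per vector: the signed maximum is computed with unsigned _mm_max_epu8 by
// XOR-ing the inputs with a sign-flip bias from params, clamping, and XOR-ing back before
// the store.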
11781 void xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16(
11782 size_t output_pixels,
11783 size_t kernel_elements,
11784 size_t channels,
11785 const int8_t** input,
11786 size_t input_offset,
11787 int8_t* output,
11788 size_t input_increment,
11789 size_t output_increment,
11790 const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11791 {
11792 assert(output_pixels != 0);
11793 assert(kernel_elements != 0);
11794 assert(channels != 0);
11795
11796 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
11797 const __m128i voutput_max_with_bias = _mm_load_si128((const __m128i*) params->sse2.max_with_bias);
11798 const __m128i voutput_min_with_bias = _mm_load_si128((const __m128i*) params->sse2.min_with_bias);
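  // Note: the bias is a sign-flip constant, and the min/max bounds are pre-biased with it,
  // so all comparisons below can use the unsigned byte min/max instructions.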
11799
11800 do {
11801 int8_t* o = output;
11802 {
11803 const int8_t* i0 = *input++;
11804 const int8_t* i1 = *input++;
11805 const int8_t* i2 = *input++;
11806 const int8_t* i3 = *input++;
11807 const int8_t* i4 = *input++;
11808 const int8_t* i5 = *input++;
11809 const int8_t* i6 = *input++;
11810 const int8_t* i7 = *input++;
11811 const int8_t* i8 = *input++;
11812 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
11813 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
11814 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
11815 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
11816 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
11817 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
11818 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
11819 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
11820 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
11821 if (kernel_elements < 2) {
11822 i1 = i0;
11823 }
11824 if (kernel_elements <= 2) {
11825 i2 = i0;
11826 }
11827 if (kernel_elements < 4) {
11828 i3 = i0;
11829 }
11830 if (kernel_elements <= 4) {
11831 i4 = i0;
11832 }
11833 if (kernel_elements < 6) {
11834 i5 = i0;
11835 }
11836 if (kernel_elements <= 6) {
11837 i6 = i0;
11838 }
11839 if (kernel_elements < 8) {
11840 i7 = i0;
11841 }
11842 if (kernel_elements <= 8) {
11843 i8 = i0;
11844 }
11845
11846 size_t c = channels;
11847 for (; c >= 16; c -= 16) {
11848 const __m128i vi0 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i0), vbias);
11849 i0 += 16;
11850 const __m128i vi1 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i1), vbias);
11851 i1 += 16;
11852 const __m128i vi2 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i2), vbias);
11853 i2 += 16;
11854 const __m128i vi3 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i3), vbias);
11855 i3 += 16;
11856 const __m128i vi4 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i4), vbias);
11857 i4 += 16;
11858 const __m128i vi5 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i5), vbias);
11859 i5 += 16;
11860 const __m128i vi6 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i6), vbias);
11861 i6 += 16;
11862 const __m128i vi7 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i7), vbias);
11863 i7 += 16;
11864 const __m128i vi8 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i8), vbias);
11865 i8 += 16;
11866
11867 const __m128i vmax018 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vi8);
11868 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
11869 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
11870 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
11871
11872 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
11873 const __m128i vmax01678 = _mm_max_epu8(vmax018, vmax67);
11874 __m128i vout = _mm_max_epu8(vmax2345, vmax01678);
11875 vout = _mm_max_epu8(vout, voutput_min_with_bias);
11876 vout = _mm_min_epu8(vout, voutput_max_with_bias);
11877 vout = _mm_xor_si128(vout, vbias);
11878
11879 _mm_storeu_si128((__m128i*) o, vout); o += 16;
11880 }
11881 if (c != 0) {
11882 const __m128i vi0 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i0), vbias);
11883 const __m128i vi1 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i1), vbias);
11884 const __m128i vi2 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i2), vbias);
11885 const __m128i vi3 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i3), vbias);
11886 const __m128i vi4 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i4), vbias);
11887 const __m128i vi5 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i5), vbias);
11888 const __m128i vi6 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i6), vbias);
11889 const __m128i vi7 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i7), vbias);
11890 const __m128i vi8 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i8), vbias);
11891
11892 const __m128i vmax018 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vi8);
11893 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
11894 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
11895 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
11896
11897 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
11898 const __m128i vmax01678 = _mm_max_epu8(vmax018, vmax67);
11899 __m128i vout = _mm_max_epu8(vmax2345, vmax01678);
11900 vout = _mm_max_epu8(vout, voutput_min_with_bias);
11901 vout = _mm_min_epu8(vout, voutput_max_with_bias);
11902 vout = _mm_xor_si128(vout, vbias);
11903
11904 if (c & 8) {
11905 _mm_storel_epi64((__m128i*) o, vout);
11906 vout = _mm_unpackhi_epi64(vout, vout);
11907 o += 8;
11908 }
11909 if (c & 4) {
11910 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
11911 vout = _mm_srli_epi64(vout, 32);
11912 o += 4;
11913 }
11914 if (c & 2) {
11915 unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
11916 vout = _mm_srli_epi32(vout, 16);
11917 o += 2;
11918 }
11919 if (c & 1) {
11920 *o = (int8_t) _mm_cvtsi128_si32(vout);
11921 o += 1;
11922 }
11923 }
11924 }
11925
11926 for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
11927 const int8_t* i0 = *input++;
11928 const int8_t* i1 = *input++;
11929 const int8_t* i2 = *input++;
11930 const int8_t* i3 = *input++;
11931 const int8_t* i4 = *input++;
11932 const int8_t* i5 = *input++;
11933 const int8_t* i6 = *input++;
11934 const int8_t* i7 = *input++;
11935 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
11936 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
11937 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
11938 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
11939 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
11940 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
11941 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
11942 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
11943 if (k < 2) {
11944 i1 = i0;
11945 }
11946 if (k <= 2) {
11947 i2 = i0;
11948 }
11949 if (k < 4) {
11950 i3 = i0;
11951 }
11952 if (k <= 4) {
11953 i4 = i0;
11954 }
11955 if (k < 6) {
11956 i5 = i0;
11957 }
11958 if (k <= 6) {
11959 i6 = i0;
11960 }
11961 if (k < 8) {
11962 i7 = i0;
11963 }
11964
11965 o = output;
11966 size_t c = channels;
11967 for (; c >= 16; c -= 16) {
11968 const __m128i vi0 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i0), vbias);
11969 i0 += 16;
11970 const __m128i vi1 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i1), vbias);
11971 i1 += 16;
11972 const __m128i vi2 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i2), vbias);
11973 i2 += 16;
11974 const __m128i vi3 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i3), vbias);
11975 i3 += 16;
11976 const __m128i vi4 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i4), vbias);
11977 i4 += 16;
11978 const __m128i vi5 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i5), vbias);
11979 i5 += 16;
11980 const __m128i vi6 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i6), vbias);
11981 i6 += 16;
11982 const __m128i vi7 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i7), vbias);
11983 i7 += 16;
11984 const __m128i vo = _mm_xor_si128(_mm_loadu_si128((const __m128i*) o), vbias);
11985
11986 const __m128i vmax01 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vo);
11987 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
11988 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
11989 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
11990
11991 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
11992 const __m128i vmax0167 = _mm_max_epu8(vmax01, vmax67);
11993 __m128i vout = _mm_max_epu8(vmax2345, vmax0167);
11994 vout = _mm_max_epu8(vout, voutput_min_with_bias);
11995 vout = _mm_min_epu8(vout, voutput_max_with_bias);
11996 vout = _mm_xor_si128(vout, vbias);
11997
11998 _mm_storeu_si128((__m128i*) o, vout);
11999 o += 16;
12000 }
12001 if (c != 0) {
12002 const __m128i vi0 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i0), vbias);
12003 const __m128i vi1 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i1), vbias);
12004 const __m128i vi2 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i2), vbias);
12005 const __m128i vi3 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i3), vbias);
12006 const __m128i vi4 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i4), vbias);
12007 const __m128i vi5 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i5), vbias);
12008 const __m128i vi6 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i6), vbias);
12009 const __m128i vi7 = _mm_xor_si128(_mm_loadu_si128((const __m128i*) i7), vbias);
12010 const __m128i vo = _mm_xor_si128(_mm_loadu_si128((const __m128i*) o), vbias);
12011
12012 const __m128i vmax01 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vo);
12013 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
12014 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
12015 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
12016
12017 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
12018 const __m128i vmax0167 = _mm_max_epu8(vmax01, vmax67);
12019 __m128i vout = _mm_max_epu8(vmax2345, vmax0167);
12020 vout = _mm_max_epu8(vout, voutput_min_with_bias);
12021 vout = _mm_min_epu8(vout, voutput_max_with_bias);
12022 vout = _mm_xor_si128(vout, vbias);
12023
12024 if (c & 8) {
12025 _mm_storel_epi64((__m128i*) o, vout);
12026 vout = _mm_unpackhi_epi64(vout, vout);
12027 o += 8;
12028 }
12029 if (c & 4) {
12030 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
12031 vout = _mm_srli_epi64(vout, 32);
12032 o += 4;
12033 }
12034 if (c & 2) {
12035 unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
12036 vout = _mm_srli_epi32(vout, 16);
12037 o += 2;
12038 }
12039 if (c & 1) {
12040 *o = (int8_t) _mm_cvtsi128_si32(vout);
12041 o += 1;
12042 }
12043 }
12044 }
12045 input = (const int8_t**) ((uintptr_t) input + input_increment);
12046 output = (int8_t*) ((uintptr_t) o + output_increment);
12047 } while (--output_pixels != 0);
12048 }
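// The s8 maxpool kernel above works around SSE2 having no signed-byte max
// (_mm_max_epi8 only arrives with SSE4.1): XOR-ing every byte with the sign
// bit (the 0x80 "bias" loaded from params) maps signed values onto unsigned
// values with the same ordering, so _mm_max_epu8/_mm_min_epu8 implement the
// signed max and clamp, and a final XOR restores the sign. A minimal scalar
// sketch of the per-element result, without the 9-then-8 multipass structure
// (hypothetical helper, not part of the library; assumes <stdint.h> via the
// includes above):
static inline int8_t s8_maxpool_scalar_ref(
    const int8_t** rows,    // kernel_elements pointers into the input rows
    size_t kernel_elements,
    size_t channel,         // channel index within each row
    int8_t output_min,
    int8_t output_max)
{
  int8_t vmax = INT8_MIN;
  for (size_t k = 0; k < kernel_elements; k++) {
    const int8_t vi = rows[k][channel];
    vmax = vi > vmax ? vi : vmax;
  }
  if (vmax < output_min) vmax = output_min;
  if (vmax > output_max) vmax = output_max;
  return vmax;
}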
12049
12050 void xnn_s8_vclamp_ukernel__sse2_x64(
12051 size_t n,
12052 const int8_t* x,
12053 int8_t* y,
12054 const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
12055 {
12056 assert(n != 0);
12057
12058 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
12059 const __m128i voutput_max_with_bias = _mm_load_si128((const __m128i*) params->sse2.max_with_bias);
12060 const __m128i voutput_min_with_bias = _mm_load_si128((const __m128i*) params->sse2.min_with_bias);
12061 for (; n >= 64; n -= 64) {
12062 __m128i vacc0 = _mm_loadu_si128((const __m128i*) x);
12063 __m128i vacc1 = _mm_loadu_si128((const __m128i*) x + 1);
12064 __m128i vacc2 = _mm_loadu_si128((const __m128i*) x + 2);
12065 __m128i vacc3 = _mm_loadu_si128((const __m128i*) x + 3);
12066 x += 64;
12067
12068 vacc0 = _mm_xor_si128(vacc0, vbias);
12069 vacc1 = _mm_xor_si128(vacc1, vbias);
12070 vacc2 = _mm_xor_si128(vacc2, vbias);
12071 vacc3 = _mm_xor_si128(vacc3, vbias);
12072
12073 vacc0 = _mm_max_epu8(vacc0, voutput_min_with_bias);
12074 vacc1 = _mm_max_epu8(vacc1, voutput_min_with_bias);
12075 vacc2 = _mm_max_epu8(vacc2, voutput_min_with_bias);
12076 vacc3 = _mm_max_epu8(vacc3, voutput_min_with_bias);
12077
12078 vacc0 = _mm_min_epu8(vacc0, voutput_max_with_bias);
12079 vacc1 = _mm_min_epu8(vacc1, voutput_max_with_bias);
12080 vacc2 = _mm_min_epu8(vacc2, voutput_max_with_bias);
12081 vacc3 = _mm_min_epu8(vacc3, voutput_max_with_bias);
12082
12083 vacc0 = _mm_xor_si128(vacc0, vbias);
12084 vacc1 = _mm_xor_si128(vacc1, vbias);
12085 vacc2 = _mm_xor_si128(vacc2, vbias);
12086 vacc3 = _mm_xor_si128(vacc3, vbias);
12087
12088 _mm_storeu_si128((__m128i*) y, vacc0);
12089 _mm_storeu_si128((__m128i*) y + 1, vacc1);
12090 _mm_storeu_si128((__m128i*) y + 2, vacc2);
12091 _mm_storeu_si128((__m128i*) y + 3, vacc3);
12092 y += 64;
12093 }
12094 for (; n >= 16; n -= 16) {
12095 __m128i vacc = _mm_loadu_si128((const __m128i*) x);
12096 x += 16;
12097
12098 vacc = _mm_xor_si128(vacc, vbias);
12099 vacc = _mm_min_epu8(vacc, voutput_max_with_bias);
12100 vacc = _mm_max_epu8(vacc, voutput_min_with_bias);
12101 vacc = _mm_xor_si128(vacc, vbias);
12102
12103 _mm_storeu_si128((__m128i*) y, vacc);
12104 y += 16;
12105 }
12106 if XNN_UNLIKELY(n != 0) {
12107 __m128i vacc = _mm_loadu_si128((const __m128i*) x);
12108
12109 vacc = _mm_xor_si128(vacc, vbias);
12110 vacc = _mm_min_epu8(vacc, voutput_max_with_bias);
12111 vacc = _mm_max_epu8(vacc, voutput_min_with_bias);
12112 vacc = _mm_xor_si128(vacc, vbias);
12113
12114 if (n & 8) {
12115 _mm_storel_epi64((__m128i*) y, vacc);
12116 y += 8;
12117 vacc = _mm_unpackhi_epi64(vacc, vacc);
12118 }
12119 if (n & 4) {
12120 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vacc));
12121 y += 4;
12122 vacc = _mm_srli_epi64(vacc, 32);
12123 }
12124 if (n & 2) {
12125 unaligned_store_u16(y, (uint16_t) _mm_cvtsi128_si32(vacc));
12126 y += 2;
12127 vacc = _mm_srli_epi32(vacc, 16);
12128 }
12129 if (n & 1) {
12130 *y = (int8_t) _mm_cvtsi128_si32(vacc);
12131 }
12132 }
12133 }
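// The s8 vclamp kernel uses the same XOR-with-0x80 trick: x ^ 0x80 is an
// order-preserving mapping from int8 to uint8, so clamping the biased values
// with unsigned min/max and XOR-ing back is a signed clamp. An illustrative
// scalar equivalent (hypothetical helper, not a library API):
static inline int8_t s8_clamp_scalar_ref(int8_t x, int8_t vmin, int8_t vmax) {
  const uint8_t bx   = (uint8_t) x    ^ UINT8_C(0x80);
  const uint8_t bmin = (uint8_t) vmin ^ UINT8_C(0x80);
  const uint8_t bmax = (uint8_t) vmax ^ UINT8_C(0x80);
  uint8_t by = bx < bmin ? bmin : bx;
  by = by > bmax ? bmax : by;
  return (int8_t) (by ^ UINT8_C(0x80));
}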
12134
12135 void xnn_u8_ibilinear_ukernel__sse2_c8(
12136 size_t output_pixels,
12137 size_t channels,
12138 const uint8_t**restrict input,
12139 size_t input_offset,
12140 const int16_t*restrict weights,
12141 uint8_t*restrict output,
12142 size_t output_increment) XNN_OOB_READS
12143 {
12144 assert(output_pixels != 0);
12145 assert(channels != 0);
12146
12147 do {
12148 const uint8_t* i0 = (const uint8_t*) ((uintptr_t) input[0] + input_offset);
12149 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) input[1] + input_offset);
12150 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) input[2] + input_offset);
12151 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) input[3] + input_offset);
12152 input += 4;
12153
12154 const __m128i valpha = _mm_cvtsi32_si128(*((const int*) weights));
12155 weights += 2;
12156 __m128i valphah = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(0, 0, 0, 0));
12157 valphah = _mm_unpacklo_epi64(valphah, valphah);
12158 __m128i valphav = _mm_shufflelo_epi16(valpha, _MM_SHUFFLE(1, 1, 1, 1));
12159 valphav = _mm_unpacklo_epi64(valphav, valphav);
12160
12161 valphah = _mm_xor_si128(valphah, _mm_set1_epi32(0xFFFF0000));
12162 valphah = _mm_add_epi16(valphah, _mm_set1_epi32(0x08010000));
12163
12164 const __m128i vrounding = _mm_set1_epi32(0x00200000);
12165
12166 size_t c = channels;
12167 for (; c >= 8 * sizeof(uint8_t); c -= 8 * sizeof(uint8_t)) {
12168 __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0);
12169 i0 += 8;
12170 __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1);
12171 i1 += 8;
12172 __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2);
12173 i2 += 8;
12174 __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3);
12175 i3 += 8;
12176
12177 __m128i vzero = _mm_setzero_si128();
12178 vtl01234567 = _mm_unpacklo_epi8(vtl01234567, vzero);
12179 vtr01234567 = _mm_unpacklo_epi8(vtr01234567, vzero);
12180 vbl01234567 = _mm_unpacklo_epi8(vbl01234567, vzero);
12181 vbr01234567 = _mm_unpacklo_epi8(vbr01234567, vzero);
12182
12183 const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
12184 const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
12185 const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
12186 const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
12187
12188 const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
12189 const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
12190
12191 __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16);
12192 __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16);
12193
12194 vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123);
12195 vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567);
12196
12197 vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
12198 vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
12199
12200 vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
12201 vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
12202
12203 const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
12204
12205 const __m128i vo01234567 = _mm_packus_epi16(vacc01234567, vacc01234567);
12206
12207 _mm_storel_epi64((__m128i*) output, vo01234567);
12208 output += 8;
12209 }
12210 if XNN_UNLIKELY(c != 0) {
12211 __m128i vtl01234567 = _mm_loadl_epi64((const __m128i*) i0);
12212 __m128i vtr01234567 = _mm_loadl_epi64((const __m128i*) i1);
12213 __m128i vbl01234567 = _mm_loadl_epi64((const __m128i*) i2);
12214 __m128i vbr01234567 = _mm_loadl_epi64((const __m128i*) i3);
12215
12216 __m128i vzero = _mm_setzero_si128();
12217 vtl01234567 = _mm_unpacklo_epi8(vtl01234567, vzero);
12218 vtr01234567 = _mm_unpacklo_epi8(vtr01234567, vzero);
12219 vbl01234567 = _mm_unpacklo_epi8(vbl01234567, vzero);
12220 vbr01234567 = _mm_unpacklo_epi8(vbr01234567, vzero);
12221
12222 const __m128i vdr01234567 = _mm_sub_epi16(vbr01234567, vtr01234567);
12223 const __m128i vt0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vtr01234567, vtl01234567), valphah);
12224 const __m128i vdl01234567 = _mm_sub_epi16(vbl01234567, vtl01234567);
12225 const __m128i vt4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vtr01234567, vtl01234567), valphah);
12226
12227 const __m128i vd0123 = _mm_madd_epi16(_mm_unpacklo_epi16(vdr01234567, vdl01234567), valphah);
12228 const __m128i vd4567 = _mm_madd_epi16(_mm_unpackhi_epi16(vdr01234567, vdl01234567), valphah);
12229
12230 __m128i vacc0123 = _mm_slli_epi32(_mm_mulhi_epu16(vd0123, valphav), 16);
12231 __m128i vacc4567 = _mm_slli_epi32(_mm_mulhi_epu16(vd4567, valphav), 16);
12232
12233 vacc0123 = _mm_add_epi16(_mm_mullo_epi16(vd0123, valphav), vacc0123);
12234 vacc4567 = _mm_add_epi16(_mm_mullo_epi16(vd4567, valphav), vacc4567);
12235
12236 vacc0123 = _mm_add_epi32(_mm_slli_epi32(vt0123, 11), vacc0123);
12237 vacc4567 = _mm_add_epi32(_mm_slli_epi32(vt4567, 11), vacc4567);
12238
12239 vacc0123 = _mm_srli_epi32(_mm_add_epi16(vacc0123, vrounding), 22);
12240 vacc4567 = _mm_srli_epi32(_mm_add_epi16(vacc4567, vrounding), 22);
12241
12242 const __m128i vacc01234567 = _mm_packs_epi32(vacc0123, vacc4567);
12243
12244 __m128i vo01234567 = _mm_packus_epi16(vacc01234567, vacc01234567);
12245
12246 if (c & (4 * sizeof(uint8_t))) {
12247 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vo01234567));
12248 output += 4;
12249 vo01234567 = _mm_srli_epi64(vo01234567, 32);
12250 }
12251 uint32_t vo0123 = (uint32_t) _mm_cvtsi128_si32(vo01234567);
12252 if (c & (2 * sizeof(uint8_t))) {
12253 unaligned_store_u16(output, (uint16_t) vo0123);
12254 output += 2;
12255 vo0123 >>= 16;
12256 }
12257 if (c & (1 * sizeof(uint8_t))) {
12258 *output++ = (uint8_t) vo0123;
12259 }
12260 }
12261
12262 output = (uint8_t*) ((uintptr_t) output + output_increment);
12263 } while (--output_pixels != 0);
12264 }
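// The u8 bilinear kernel works in fixed point: the two int16 weights per
// pixel appear to be Q11 horizontal/vertical coefficients (0..2048). The
// horizontal pass uses _mm_madd_epi16 with the packed pair
// (alpha_h, 2048 - alpha_h) built by the XOR/ADD above, the vertical pass
// multiplies the horizontal difference by alpha_v with a full 32-bit product
// assembled from _mm_mullo_epi16/_mm_mulhi_epu16, and the Q22 result is
// rounded with +2^21 and a 22-bit shift. A scalar sketch of the per-channel
// arithmetic under those assumptions (hypothetical helper, illustrative only):
static inline uint8_t u8_ibilinear_scalar_ref(
    uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
    int32_t alpha_h, int32_t alpha_v)  // Q11 weights in [0, 2048]
{
  const int32_t top    = (int32_t) tl * (2048 - alpha_h) + (int32_t) tr * alpha_h;
  const int32_t bottom = (int32_t) bl * (2048 - alpha_h) + (int32_t) br * alpha_h;
  const int32_t acc = top * (2048 - alpha_v) + bottom * alpha_v;  // Q22
  return (uint8_t) ((acc + (1 << 21)) >> 22);
}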
12265
12266 void xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16(
12267 size_t output_pixels,
12268 size_t kernel_elements,
12269 size_t channels,
12270 const uint8_t** input,
12271 size_t input_offset,
12272 uint8_t* output,
12273 size_t input_increment,
12274 size_t output_increment,
12275 const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
12276 {
12277 assert(output_pixels != 0);
12278 assert(kernel_elements != 0);
12279 assert(channels != 0);
12280
12281 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.max);
12282 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.min);
12283
12284 do {
12285 uint8_t* o = output;
12286 {
12287 const uint8_t* i0 = *input++;
12288 const uint8_t* i1 = *input++;
12289 const uint8_t* i2 = *input++;
12290 const uint8_t* i3 = *input++;
12291 const uint8_t* i4 = *input++;
12292 const uint8_t* i5 = *input++;
12293 const uint8_t* i6 = *input++;
12294 const uint8_t* i7 = *input++;
12295 const uint8_t* i8 = *input++;
12296 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
12297 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
12298 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
12299 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
12300 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
12301 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
12302 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
12303 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
12304 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
12305 if (kernel_elements < 2) {
12306 i1 = i0;
12307 }
12308 if (kernel_elements <= 2) {
12309 i2 = i0;
12310 }
12311 if (kernel_elements < 4) {
12312 i3 = i0;
12313 }
12314 if (kernel_elements <= 4) {
12315 i4 = i0;
12316 }
12317 if (kernel_elements < 6) {
12318 i5 = i0;
12319 }
12320 if (kernel_elements <= 6) {
12321 i6 = i0;
12322 }
12323 if (kernel_elements < 8) {
12324 i7 = i0;
12325 }
12326 if (kernel_elements <= 8) {
12327 i8 = i0;
12328 }
12329
12330 size_t c = channels;
12331 for (; c >= 16; c -= 16) {
12332 const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
12333 const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
12334 const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
12335 const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); i3 += 16;
12336 const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); i4 += 16;
12337 const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); i5 += 16;
12338 const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); i6 += 16;
12339 const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); i7 += 16;
12340 const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8); i8 += 16;
12341
12342 const __m128i vmax018 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vi8);
12343 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
12344 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
12345 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
12346
12347 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
12348 const __m128i vmax01678 = _mm_max_epu8(vmax018, vmax67);
12349 __m128i vout = _mm_max_epu8(vmax2345, vmax01678);
12350 vout = _mm_max_epu8(vout, voutput_min);
12351 vout = _mm_min_epu8(vout, voutput_max);
12352
12353 _mm_storeu_si128((__m128i*) o, vout); o += 16;
12354 }
12355 if (c != 0) {
12356 const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
12357 const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
12358 const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
12359 const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
12360 const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
12361 const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
12362 const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
12363 const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
12364 const __m128i vi8 = _mm_loadu_si128((const __m128i*) i8);
12365
12366 const __m128i vmax018 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vi8);
12367 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
12368 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
12369 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
12370
12371 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
12372 const __m128i vmax01678 = _mm_max_epu8(vmax018, vmax67);
12373 __m128i vout = _mm_max_epu8(vmax2345, vmax01678);
12374 vout = _mm_max_epu8(vout, voutput_min);
12375 vout = _mm_min_epu8(vout, voutput_max);
12376
12377 if (c & 8) {
12378 _mm_storel_epi64((__m128i*) o, vout);
12379 vout = _mm_unpackhi_epi64(vout, vout);
12380 o += 8;
12381 }
12382 if (c & 4) {
12383 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
12384 vout = _mm_srli_epi64(vout, 32);
12385 o += 4;
12386 }
12387 if (c & 2) {
12388 unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
12389 vout = _mm_srli_epi32(vout, 16);
12390 o += 2;
12391 }
12392 if (c & 1) {
12393 *o = (uint8_t) _mm_cvtsi128_si32(vout);
12394 o += 1;
12395 }
12396 }
12397 }
12398
12399 for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
12400 const uint8_t* i0 = *input++;
12401 const uint8_t* i1 = *input++;
12402 const uint8_t* i2 = *input++;
12403 const uint8_t* i3 = *input++;
12404 const uint8_t* i4 = *input++;
12405 const uint8_t* i5 = *input++;
12406 const uint8_t* i6 = *input++;
12407 const uint8_t* i7 = *input++;
12408 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
12409 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
12410 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
12411 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
12412 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
12413 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
12414 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
12415 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
12416 if (k < 2) {
12417 i1 = i0;
12418 }
12419 if (k <= 2) {
12420 i2 = i0;
12421 }
12422 if (k < 4) {
12423 i3 = i0;
12424 }
12425 if (k <= 4) {
12426 i4 = i0;
12427 }
12428 if (k < 6) {
12429 i5 = i0;
12430 }
12431 if (k <= 6) {
12432 i6 = i0;
12433 }
12434 if (k < 8) {
12435 i7 = i0;
12436 }
12437
12438 o = output;
12439 size_t c = channels;
12440 for (; c >= 16; c -= 16) {
12441 const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
12442 const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
12443 const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
12444 const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3); i3 += 16;
12445 const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4); i4 += 16;
12446 const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5); i5 += 16;
12447 const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6); i6 += 16;
12448 const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7); i7 += 16;
12449 const __m128i vo = _mm_loadu_si128((const __m128i*) o);
12450
12451 const __m128i vmax01 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vo);
12452 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
12453 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
12454 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
12455
12456 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
12457 const __m128i vmax0167 = _mm_max_epu8(vmax01, vmax67);
12458 __m128i vout = _mm_max_epu8(vmax2345, vmax0167);
12459 vout = _mm_max_epu8(vout, voutput_min);
12460 vout = _mm_min_epu8(vout, voutput_max);
12461
12462 _mm_storeu_si128((__m128i*) o, vout);
12463 o += 16;
12464 }
12465 if (c != 0) {
12466 const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
12467 const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
12468 const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
12469 const __m128i vi3 = _mm_loadu_si128((const __m128i*) i3);
12470 const __m128i vi4 = _mm_loadu_si128((const __m128i*) i4);
12471 const __m128i vi5 = _mm_loadu_si128((const __m128i*) i5);
12472 const __m128i vi6 = _mm_loadu_si128((const __m128i*) i6);
12473 const __m128i vi7 = _mm_loadu_si128((const __m128i*) i7);
12474 const __m128i vo = _mm_loadu_si128((const __m128i*) o);
12475
12476 const __m128i vmax01 = _mm_max_epu8(_mm_max_epu8(vi0, vi1), vo);
12477 const __m128i vmax23 = _mm_max_epu8(vi2, vi3);
12478 const __m128i vmax45 = _mm_max_epu8(vi4, vi5);
12479 const __m128i vmax67 = _mm_max_epu8(vi6, vi7);
12480
12481 const __m128i vmax2345 = _mm_max_epu8(vmax23, vmax45);
12482 const __m128i vmax0167 = _mm_max_epu8(vmax01, vmax67);
12483 __m128i vout = _mm_max_epu8(vmax2345, vmax0167);
12484 vout = _mm_max_epu8(vout, voutput_min);
12485 vout = _mm_min_epu8(vout, voutput_max);
12486
12487 if (c & 8) {
12488 _mm_storel_epi64((__m128i*) o, vout);
12489 vout = _mm_unpackhi_epi64(vout, vout);
12490 o += 8;
12491 }
12492 if (c & 4) {
12493 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vout));
12494 vout = _mm_srli_epi64(vout, 32);
12495 o += 4;
12496 }
12497 if (c & 2) {
12498 unaligned_store_u16(o, (uint16_t) _mm_extract_epi16(vout, 0));
12499 vout = _mm_srli_epi32(vout, 16);
12500 o += 2;
12501 }
12502 if (c & 1) {
12503 *o = (uint8_t) _mm_cvtsi128_si32(vout);
12504 o += 1;
12505 }
12506 }
12507 }
12508 input = (const uint8_t**) ((uintptr_t) input + input_increment);
12509 output = (uint8_t*) ((uintptr_t) o + output_increment);
12510 } while (--output_pixels != 0);
12511 }
12512
12513 void xnn_u8_rmax_ukernel__sse2(
12514 size_t n,
12515 const uint8_t* x,
12516 uint8_t* y)
12517 {
12518 assert(n != 0);
12519
12520 if XNN_LIKELY(n >= 16) {
12521 __m128i vmax = _mm_setzero_si128();
12522 do {
12523 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
12524 x += 16;
12525 vmax = _mm_max_epu8(vmax, vx);
12526 n -= 16;
12527 } while (n >= 16);
12528 if (n != 0) {
12529 const size_t x_increment = n - 16;
12530 x = (const uint8_t*) ((uintptr_t) x + x_increment);
12531 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
12532 vmax = _mm_max_epu8(vmax, vx);
12533 }
12534 vmax = _mm_max_epu8(vmax, _mm_unpackhi_epi64(vmax, vmax));
12535 vmax = _mm_max_epu8(vmax, _mm_srli_epi64(vmax, 32));
12536 vmax = _mm_max_epu8(vmax, _mm_srli_epi32(vmax, 16));
12537 vmax = _mm_max_epu8(vmax, _mm_srli_epi16(vmax, 8));
12538 *y = (uint8_t) _mm_cvtsi128_si32(vmax);
12539 } else {
12540 uint8_t vmax = 0;
12541 do {
12542 const uint8_t vx = *x++;
12543 vmax = vx > vmax ? vx : vmax;
12544 } while (--n != 0);
12545 *y = vmax;
12546 }
12547 }
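// After the vector loop, u8 rmax handles a non-multiple-of-16 tail by
// stepping the pointer back so the final 16-byte load ends exactly at the
// end of the buffer (re-reading up to 15 already-visited bytes, which is
// harmless for a max), then reduces the 16-lane running maximum by folding
// the vector in half four times (64-, 32-, 16-, 8-bit shifts). A scalar
// sketch of that folding reduction over one 16-byte block (hypothetical
// helper, illustrative only):
static inline uint8_t u8_rmax_fold16_scalar_ref(const uint8_t block[16]) {
  uint8_t tmp[16];
  for (size_t i = 0; i < 16; i++) {
    tmp[i] = block[i];
  }
  for (size_t half = 8; half != 0; half /= 2) {
    for (size_t i = 0; i < half; i++) {
      tmp[i] = tmp[i] > tmp[i + half] ? tmp[i] : tmp[i + half];
    }
  }
  return tmp[0];
}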
12548
12549 void xnn_u8_vclamp_ukernel__sse2_x64(
12550 size_t n,
12551 const uint8_t* x,
12552 uint8_t* y,
12553 const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
12554 {
12555 assert(n != 0);
12556
12557 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.max);
12558 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.min);
12559 for (; n >= 64; n -= 64) {
12560 __m128i vacc0 = _mm_loadu_si128((const __m128i*) x);
12561 __m128i vacc1 = _mm_loadu_si128((const __m128i*) x + 1);
12562 __m128i vacc2 = _mm_loadu_si128((const __m128i*) x + 2);
12563 __m128i vacc3 = _mm_loadu_si128((const __m128i*) x + 3);
12564 x += 64;
12565
12566 vacc0 = _mm_max_epu8(vacc0, voutput_min);
12567 vacc1 = _mm_max_epu8(vacc1, voutput_min);
12568 vacc2 = _mm_max_epu8(vacc2, voutput_min);
12569 vacc3 = _mm_max_epu8(vacc3, voutput_min);
12570
12571 vacc0 = _mm_min_epu8(vacc0, voutput_max);
12572 vacc1 = _mm_min_epu8(vacc1, voutput_max);
12573 vacc2 = _mm_min_epu8(vacc2, voutput_max);
12574 vacc3 = _mm_min_epu8(vacc3, voutput_max);
12575
12576 _mm_storeu_si128((__m128i*) y, vacc0);
12577 _mm_storeu_si128((__m128i*) y + 1, vacc1);
12578 _mm_storeu_si128((__m128i*) y + 2, vacc2);
12579 _mm_storeu_si128((__m128i*) y + 3, vacc3);
12580 y += 64;
12581 }
12582 for (; n >= 16; n -= 16) {
12583 __m128i vacc = _mm_loadu_si128((const __m128i*) x);
12584 x += 16;
12585
12586 vacc = _mm_min_epu8(vacc, voutput_max);
12587 vacc = _mm_max_epu8(vacc, voutput_min);
12588
12589 _mm_storeu_si128((__m128i*) y, vacc);
12590 y += 16;
12591 }
12592 if XNN_UNLIKELY(n != 0) {
12593 __m128i vacc = _mm_loadu_si128((const __m128i*) x);
12594
12595 vacc = _mm_min_epu8(vacc, voutput_max);
12596 vacc = _mm_max_epu8(vacc, voutput_min);
12597
12598 if (n & 8) {
12599 _mm_storel_epi64((__m128i*) y, vacc);
12600 y += 8;
12601 vacc = _mm_unpackhi_epi64(vacc, vacc);
12602 }
12603 if (n & 4) {
12604 unaligned_store_u32(y, (uint32_t) _mm_cvtsi128_si32(vacc));
12605 y += 4;
12606 vacc = _mm_srli_epi64(vacc, 32);
12607 }
12608 if (n & 2) {
12609 unaligned_store_u16(y, (uint16_t) _mm_cvtsi128_si32(vacc));
12610 y += 2;
12611 vacc = _mm_srli_epi32(vacc, 16);
12612 }
12613 if (n & 1) {
12614 *y = (uint8_t) _mm_cvtsi128_si32(vacc);
12615 }
12616 }
12617 }
12618
12619 void xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2(
12620 const uint16_t* input,
12621 uint16_t* output,
12622 size_t input_stride,
12623 size_t output_stride,
12624 size_t block_width,
12625 size_t block_height) XNN_OOB_READS
12626 {
12627 assert(output_stride >= block_height * sizeof(uint16_t));
12628 assert(input_stride >= block_width * sizeof(uint16_t));
12629
12630 const size_t tile_height = 8;
12631 const size_t tile_width = 8;
12632 const size_t tile_hbytes = tile_height * sizeof(uint16_t);
12633 const size_t tile_wbytes = tile_width * sizeof(uint16_t);
12634 const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
12635 const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t);
12636
12637 const uint16_t* i0 = input;
12638 uint16_t* o0 = (uint16_t*) output;
12639 uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
12640 uint16_t* o2 = (uint16_t*) ((uintptr_t) o1 + output_stride);
12641 uint16_t* o3 = (uint16_t*) ((uintptr_t) o2 + output_stride);
12642 uint16_t* o4 = (uint16_t*) ((uintptr_t) o3 + output_stride);
12643 uint16_t* o5 = (uint16_t*) ((uintptr_t) o4 + output_stride);
12644 uint16_t* o6 = (uint16_t*) ((uintptr_t) o5 + output_stride);
12645 uint16_t* o7 = (uint16_t*) ((uintptr_t) o6 + output_stride);
12646
12647 do {
12648 if XNN_UNPREDICTABLE(block_width < 2) {
12649 o1 = o0;
12650 }
12651 if XNN_UNPREDICTABLE(block_width <= 2) {
12652 o2 = o0;
12653 }
12654 if XNN_UNPREDICTABLE(block_width < 4) {
12655 o3 = o0;
12656 }
12657 if XNN_UNPREDICTABLE(block_width <= 4) {
12658 o4 = o0;
12659 }
12660 if XNN_UNPREDICTABLE(block_width < 6) {
12661 o5 = o0;
12662 }
12663 if XNN_UNPREDICTABLE(block_width <= 6) {
12664 o6 = o0;
12665 }
12666 if XNN_UNPREDICTABLE(block_width < 8) {
12667 o7 = o0;
12668 }
12669 size_t bh = block_height;
12670 for (; bh >= 8; bh -= 8) {
12671 const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
12672 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12673 const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i0);
12674 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12675 const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i0);
12676 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12677 const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i0);
12678 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12679 const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i0);
12680 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12681 const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i0);
12682 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12683 const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i0);
12684 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12685 const __m128i v3_7 = _mm_loadu_si128((const __m128i*) i0);
12686 i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
12687
12688 const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
12689 const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
12690 const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
12691 const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
12692 const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
12693 const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
12694 const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
12695 const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);
12696
12697 const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
12698 const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
12699 const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
12700 const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
12701 const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
12702 const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
12703 const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
12704 const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);
12705
12706 const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
12707 const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
12708 const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
12709 const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
12710 const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
12711 const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
12712 const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
12713 const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);
12714
12715
12716 _mm_storeu_si128((__m128i*) o7, v0_7);
12717 o7 = (uint16_t*) ((uintptr_t) o7 + tile_hbytes);
12718 _mm_storeu_si128((__m128i*) o6, v0_6);
12719 o6 = (uint16_t*) ((uintptr_t) o6 + tile_hbytes);
12720 _mm_storeu_si128((__m128i*) o5, v0_5);
12721 o5 = (uint16_t*) ((uintptr_t) o5 + tile_hbytes);
12722 _mm_storeu_si128((__m128i*) o4, v0_4);
12723 o4 = (uint16_t*) ((uintptr_t) o4 + tile_hbytes);
12724 _mm_storeu_si128((__m128i*) o3, v0_3);
12725 o3 = (uint16_t*) ((uintptr_t) o3 + tile_hbytes);
12726 _mm_storeu_si128((__m128i*) o2, v0_2);
12727 o2 = (uint16_t*) ((uintptr_t) o2 + tile_hbytes);
12728 _mm_storeu_si128((__m128i*) o1, v0_1);
12729 o1 = (uint16_t*) ((uintptr_t) o1 + tile_hbytes);
12730 _mm_storeu_si128((__m128i*) o0, v0_0);
12731 o0 = (uint16_t*) ((uintptr_t) o0 + tile_hbytes);
12732 }
12733 if (bh != 0) {
12734 const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
12735 const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
12736 if XNN_UNPREDICTABLE(bh < 2) {
12737 i1 = i0;
12738 }
12739 const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i1);
12740 const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
12741 if XNN_UNPREDICTABLE(bh <= 2) {
12742 i2 = i1;
12743 }
12744 const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i2);
12745 const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
12746 if XNN_UNPREDICTABLE(bh < 4) {
12747 i3 = i2;
12748 }
12749 const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i3);
12750 const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
12751 if XNN_UNPREDICTABLE(bh <= 4) {
12752 i4 = i3;
12753 }
12754 const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i4);
12755 const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
12756 if XNN_UNPREDICTABLE(bh < 6) {
12757 i5 = i4;
12758 }
12759 const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i5);
12760 const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
12761 if XNN_UNPREDICTABLE(bh <= 6) {
12762 i6 = i5;
12763 }
12764 const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i6);
12765 const __m128i v3_7 = _mm_undefined_si128();
12766
12767 const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
12768 const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
12769 const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
12770 const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
12771 const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
12772 const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
12773 const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
12774 const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);
12775
12776 const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
12777 const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
12778 const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
12779 const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
12780 const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
12781 const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
12782 const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
12783 const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);
12784
12785 __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
12786 __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
12787 __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
12788 __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
12789 __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
12790 __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
12791 __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
12792 __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);
12793
12794
12795 if (bh & 4) {
12796 _mm_storel_epi64((__m128i*) o7, v0_7);
12797 o7 += 4;
12798 _mm_storel_epi64((__m128i*) o6, v0_6);
12799 o6 += 4;
12800 _mm_storel_epi64((__m128i*) o5, v0_5);
12801 o5 += 4;
12802 _mm_storel_epi64((__m128i*) o4, v0_4);
12803 o4 += 4;
12804 _mm_storel_epi64((__m128i*) o3, v0_3);
12805 o3 += 4;
12806 _mm_storel_epi64((__m128i*) o2, v0_2);
12807 o2 += 4;
12808 _mm_storel_epi64((__m128i*) o1, v0_1);
12809 o1 += 4;
12810 _mm_storel_epi64((__m128i*) o0, v0_0);
12811 o0 += 4;
12812 v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
12813 v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
12814 v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
12815 v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
12816 v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
12817 v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
12818 v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
12819 v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
12820 }
12821
12822 if (bh & 2) {
12823 unaligned_store_u32(o7, (uint32_t) _mm_cvtsi128_si32(v0_7));
12824 o7 += 2;
12825 unaligned_store_u32(o6, (uint32_t) _mm_cvtsi128_si32(v0_6));
12826 o6 += 2;
12827 unaligned_store_u32(o5, (uint32_t) _mm_cvtsi128_si32(v0_5));
12828 o5 += 2;
12829 unaligned_store_u32(o4, (uint32_t) _mm_cvtsi128_si32(v0_4));
12830 o4 += 2;
12831 unaligned_store_u32(o3, (uint32_t) _mm_cvtsi128_si32(v0_3));
12832 o3 += 2;
12833 unaligned_store_u32(o2, (uint32_t) _mm_cvtsi128_si32(v0_2));
12834 o2 += 2;
12835 unaligned_store_u32(o1, (uint32_t) _mm_cvtsi128_si32(v0_1));
12836 o1 += 2;
12837 unaligned_store_u32(o0, (uint32_t) _mm_cvtsi128_si32(v0_0));
12838 o0 += 2;
12839 v0_0 = _mm_srli_epi64(v0_0, 32);
12840 v0_1 = _mm_srli_epi64(v0_1, 32);
12841 v0_2 = _mm_srli_epi64(v0_2, 32);
12842 v0_3 = _mm_srli_epi64(v0_3, 32);
12843 v0_4 = _mm_srli_epi64(v0_4, 32);
12844 v0_5 = _mm_srli_epi64(v0_5, 32);
12845 v0_6 = _mm_srli_epi64(v0_6, 32);
12846 v0_7 = _mm_srli_epi64(v0_7, 32);
12847 }
12848 if (bh & 1) {
12849 unaligned_store_u16(o7, (uint16_t) _mm_cvtsi128_si32(v0_7));
12850 unaligned_store_u16(o6, (uint16_t) _mm_cvtsi128_si32(v0_6));
12851 unaligned_store_u16(o5, (uint16_t) _mm_cvtsi128_si32(v0_5));
12852 unaligned_store_u16(o4, (uint16_t) _mm_cvtsi128_si32(v0_4));
12853 unaligned_store_u16(o3, (uint16_t) _mm_cvtsi128_si32(v0_3));
12854 unaligned_store_u16(o2, (uint16_t) _mm_cvtsi128_si32(v0_2));
12855 unaligned_store_u16(o1, (uint16_t) _mm_cvtsi128_si32(v0_1));
12856 unaligned_store_u16(o0, (uint16_t) _mm_cvtsi128_si32(v0_0));
12857 }
12858 }
12859
12860 i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
12861 o0 = (uint16_t*) ((uintptr_t) o0 + output_reset);
12862 o1 = (uint16_t*) ((uintptr_t) o1 + output_reset);
12863 o2 = (uint16_t*) ((uintptr_t) o2 + output_reset);
12864 o3 = (uint16_t*) ((uintptr_t) o3 + output_reset);
12865 o4 = (uint16_t*) ((uintptr_t) o4 + output_reset);
12866 o5 = (uint16_t*) ((uintptr_t) o5 + output_reset);
12867 o6 = (uint16_t*) ((uintptr_t) o6 + output_reset);
12868 o7 = (uint16_t*) ((uintptr_t) o7 + output_reset);
12869 block_width = doz(block_width, tile_width);
12870 } while (block_width != 0);
12871 }
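// The x16 transpose kernel handles one 8x8 tile of uint16 per iteration:
// three rounds of unpacklo/unpackhi at 16-, 32- and 64-bit granularity
// permute the 64 elements so that each final register holds one column of
// the tile, which is then stored as a row of the transposed output. A plain
// scalar sketch of one 8x8 tile (hypothetical helper; strides in bytes, as
// in the kernel):
static inline void x16_transpose_8x8_scalar_ref(
    const uint16_t* input, size_t input_stride,
    uint16_t* output, size_t output_stride)
{
  for (size_t i = 0; i < 8; i++) {
    const uint16_t* row = (const uint16_t*) ((uintptr_t) input + i * input_stride);
    for (size_t j = 0; j < 8; j++) {
      uint16_t* col = (uint16_t*) ((uintptr_t) output + j * output_stride);
      col[i] = row[j];
    }
  }
}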
12872
12873 void xnn_x32_unpool_ukernel__sse2(
12874 size_t kernel_elements,
12875 size_t channels,
12876 uint32_t fill,
12877 const uint32_t* input,
12878 const uint32_t* index,
12879 uint32_t** output)
12880 {
12881 // Pre-initialize outputs with constant.
12882 const __m128i vfill = _mm_set1_epi32((int) fill);
12883 uint32_t** os = output;
12884 do {
12885 uint32_t* o = *os++;
12886 size_t c = channels;
12887 for (; c >= 4; c -= 4) {
12888 _mm_storeu_si128((__m128i*) o, vfill);
12889 o += 4;
12890 }
12891 if (c != 0) {
12892 if (c & 2) {
12893 _mm_storel_epi64((__m128i*) o, vfill);
12894 o += 2;
12895 }
12896 if (c & 1) {
12897 *((int*) o) = _mm_cvtsi128_si32(vfill);
12898 }
12899 }
12900 } while (--kernel_elements != 0);
12901
12902 // Copy indexed elements to output.
12903 size_t offset = 0;
12904 do {
12905 const uint32_t i = *index++;
12906 *((uint32_t*) ((uintptr_t) output[i] + offset)) = *input++;
12907 offset += sizeof(uint32_t);
12908 } while (--channels != 0);
12909 }
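// The unpool kernel runs in two phases: every output row is first filled
// with the constant, then each input value is scattered to the row selected
// by its per-channel index. A scalar sketch of the same behavior
// (hypothetical helper, illustrative only):
static inline void x32_unpool_scalar_ref(
    size_t kernel_elements, size_t channels, uint32_t fill,
    const uint32_t* input, const uint32_t* index, uint32_t** output)
{
  for (size_t k = 0; k < kernel_elements; k++) {
    for (size_t c = 0; c < channels; c++) {
      output[k][c] = fill;
    }
  }
  for (size_t c = 0; c < channels; c++) {
    output[index[c]][c] = input[c];
  }
}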
12910
12911 void xnn_x32_zip_x2_ukernel__sse2(
12912 size_t n,
12913 const uint32_t* input,
12914 uint32_t* output)
12915 {
12916 assert(n != 0);
12917 assert(n % 4 == 0);
12918
12919 const uint32_t* x = input;
12920 const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
12921 uint32_t* o = output;
12922
12923 while (n >= 16) {
12924 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
12925 x += 4;
12926 const __m128i vy = _mm_loadu_si128((const __m128i*) y);
12927 y += 4;
12928 const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy);
12929 const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy);
12930 _mm_storeu_si128((__m128i*) o, vxy_lo);
12931 _mm_storeu_si128((__m128i*) (o + 4), vxy_hi);
12932 o += 8;
12933 n -= 16;
12934 }
12935 if XNN_UNLIKELY(n != 0) {
12936 if (n & 8) {
12937 const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
12938 x += 2;
12939 const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
12940 y += 2;
12941 const __m128i vxy = _mm_unpacklo_epi32(vx, vy);
12942 _mm_storeu_si128((__m128i*) o, vxy);
12943 o += 4;
12944 }
12945 if (n & 4) {
12946 const uint32_t vx = *x;
12947 const uint32_t vy = *y;
12948 o[0] = vx;
12949 o[1] = vy;
12950 }
12951 }
12952 }
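// x32 zip-x2 interleaves two adjacent groups of n bytes of 32-bit values
// (x followed by y inside `input`) into pairs (x[i], y[i]); n is a byte
// count, which is why the vector loop steps by 16 bytes = 4 values. A
// scalar sketch (hypothetical helper):
static inline void x32_zip_x2_scalar_ref(size_t n, const uint32_t* input, uint32_t* output) {
  const size_t count = n / sizeof(uint32_t);
  const uint32_t* x = input;
  const uint32_t* y = input + count;
  for (size_t i = 0; i < count; i++) {
    output[2 * i + 0] = x[i];
    output[2 * i + 1] = y[i];
  }
}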
12953
12954 void xnn_x32_zip_x3_ukernel__sse2(
12955 size_t n,
12956 const uint32_t* input,
12957 uint32_t* output)
12958 {
12959 assert(n != 0);
12960 assert(n % 4 == 0);
12961
12962 const float* x = (const float*) input;
12963 const float* y = (const float*) ((uintptr_t) x + n);
12964 const float* z = (const float*) ((uintptr_t) y + n);
12965 float* o = (float*) output;
12966
12967 while (n >= 16) {
12968 // vx = ( x3, x2, x1, x0 )
12969 const __m128 vx = _mm_loadu_ps(x);
12970 x += 4;
12971 // vy = ( y3, y2, y1, y0 )
12972 const __m128 vy = _mm_loadu_ps(y);
12973 y += 4;
12974 // vz = ( z3, z2, z1, z0 )
12975 const __m128 vz = _mm_loadu_ps(z);
12976 z += 4;
12977
12978 // vxy = ( y2, y0, x2, x0 )
12979 const __m128 vxy = _mm_shuffle_ps(vx, vy, _MM_SHUFFLE(2, 0, 2, 0));
12980 // vyz = ( z3, z1, y3, y1 )
12981 const __m128 vyz = _mm_shuffle_ps(vy, vz, _MM_SHUFFLE(3, 1, 3, 1));
12982 // vzx = ( x3, x1, z2, z0 )
12983 const __m128 vzx = _mm_shuffle_ps(vz, vx, _MM_SHUFFLE(3, 1, 2, 0));
12984
12985 // vxyz0 = ( x1, z0, y0, x0 )
12986 const __m128 vxyz0 = _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(2, 0, 2, 0));
12987 // vxyz1 = ( y2, x2, z1, y1 )
12988 const __m128 vxyz1 = _mm_shuffle_ps(vyz, vxy, _MM_SHUFFLE(3, 1, 2, 0));
12989 // vxyz2 = ( z3, y3, x3, z2 )
12990 const __m128 vxyz2 = _mm_shuffle_ps(vzx, vyz, _MM_SHUFFLE(3, 1, 3, 1));
12991
12992 _mm_storeu_ps(o, vxyz0);
12993 _mm_storeu_ps(o + 4, vxyz1);
12994 _mm_storeu_ps(o + 8, vxyz2);
12995 o += 12;
12996 n -= 16;
12997 }
12998 if XNN_UNLIKELY(n != 0) {
12999 if (n & 8) {
13000 // vx = ( -, -, x1, x0 )
13001 const __m128 vx = _mm_castpd_ps(_mm_load_sd((const double*) x));
13002 x += 2;
13003 // vy = ( -, -, y1, y0 )
13004 const __m128 vy = _mm_castpd_ps(_mm_load_sd((const double*) y));
13005 y += 2;
13006 // vz = ( -, -, z1, z0 )
13007 const __m128 vz = _mm_castpd_ps(_mm_load_sd((const double*) z));
13008 z += 2;
13009
13010 // vxy = ( y1, x1, y0, x0 )
13011 const __m128 vxy = _mm_unpacklo_ps(vx, vy);
13012 // vzx = ( x1, z1, x0, z0 )
13013 const __m128 vzx = _mm_unpacklo_ps(vz, vx);
13014 // vyz = ( z1, y1, z0, y0 )
13015 const __m128 vyz = _mm_unpacklo_ps(vy, vz);
13016
13017 _mm_storeu_ps(o, _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(3, 0, 1, 0)));
13018 _mm_storeh_pi((__m64*) (o + 4), vyz);
13019 o += 6;
13020 }
13021 if (n & 4) {
13022 const __m128 vx = _mm_load_ss(x);
13023 const __m128 vy = _mm_load_ss(y);
13024 const __m128 vz = _mm_load_ss(z);
13025 _mm_store_ss(o, vx);
13026 _mm_store_ss(o + 1, vy);
13027 _mm_store_ss(o + 2, vz);
13028 }
13029 }
13030 }
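// x32 zip-x3 interleaves three adjacent groups of n bytes into triples
// (x[i], y[i], z[i]); since SSE2 has no 3-way interleave, the kernel builds
// the three output vectors with the _mm_shuffle_ps patterns documented in
// the lane comments above. A scalar sketch (hypothetical helper):
static inline void x32_zip_x3_scalar_ref(size_t n, const uint32_t* input, uint32_t* output) {
  const size_t count = n / sizeof(uint32_t);
  const uint32_t* x = input;
  const uint32_t* y = input + count;
  const uint32_t* z = input + 2 * count;
  for (size_t i = 0; i < count; i++) {
    output[3 * i + 0] = x[i];
    output[3 * i + 1] = y[i];
    output[3 * i + 2] = z[i];
  }
}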
13031
13032 void xnn_x32_zip_x4_ukernel__sse2(
13033 size_t n,
13034 const uint32_t* input,
13035 uint32_t* output)
13036 {
13037 assert(n != 0);
13038 assert(n % 4 == 0);
13039
13040 const uint32_t* x = input;
13041 const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
13042 const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
13043 const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n);
13044 uint32_t* o = output;
13045
13046 while (n >= 16) {
13047 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
13048 x += 4;
13049 const __m128i vy = _mm_loadu_si128((const __m128i*) y);
13050 y += 4;
13051 const __m128i vz = _mm_loadu_si128((const __m128i*) z);
13052 z += 4;
13053 const __m128i vw = _mm_loadu_si128((const __m128i*) w);
13054 w += 4;
13055
13056 const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy);
13057 const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy);
13058 const __m128i vzw_lo = _mm_unpacklo_epi32(vz, vw);
13059 const __m128i vzw_hi = _mm_unpackhi_epi32(vz, vw);
13060
13061 const __m128i vxyzw0 = _mm_unpacklo_epi64(vxy_lo, vzw_lo);
13062 const __m128i vxyzw1 = _mm_unpackhi_epi64(vxy_lo, vzw_lo);
13063 const __m128i vxyzw2 = _mm_unpacklo_epi64(vxy_hi, vzw_hi);
13064 const __m128i vxyzw3 = _mm_unpackhi_epi64(vxy_hi, vzw_hi);
13065
13066 _mm_storeu_si128((__m128i*) o, vxyzw0);
13067 _mm_storeu_si128((__m128i*) (o + 4), vxyzw1);
13068 _mm_storeu_si128((__m128i*) (o + 8), vxyzw2);
13069 _mm_storeu_si128((__m128i*) (o + 12), vxyzw3);
13070 o += 16;
13071 n -= 16;
13072 }
13073 if XNN_UNLIKELY(n != 0) {
13074 if (n & 8) {
13075 const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
13076 x += 2;
13077 const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
13078 y += 2;
13079 const __m128i vz = _mm_loadl_epi64((const __m128i*) z);
13080 z += 2;
13081 const __m128i vw = _mm_loadl_epi64((const __m128i*) w);
13082 w += 2;
13083
13084 const __m128i vxy = _mm_unpacklo_epi32(vx, vy);
13085 const __m128i vzw = _mm_unpacklo_epi32(vz, vw);
13086
13087 const __m128i vxyzw_lo = _mm_unpacklo_epi64(vxy, vzw);
13088 const __m128i vxyzw_hi = _mm_unpackhi_epi64(vxy, vzw);
13089
13090 _mm_storeu_si128((__m128i*) o, vxyzw_lo);
13091 _mm_storeu_si128((__m128i*) (o + 4), vxyzw_hi);
13092 o += 8;
13093 }
13094 if (n & 4) {
13095 const uint32_t vx = *x;
13096 const uint32_t vy = *y;
13097 const uint32_t vz = *z;
13098 const uint32_t vw = *w;
13099 o[0] = vx;
13100 o[1] = vy;
13101 o[2] = vz;
13102 o[3] = vw;
13103 }
13104 }
13105 }
13106
13107 void xnn_x32_zip_xm_ukernel__sse2(
13108 size_t n,
13109 size_t m,
13110 const uint32_t* input,
13111 uint32_t* output)
13112 {
13113 assert(n != 0);
13114 assert(n % 4 == 0);
13115 assert(m >= 4);
13116
13117 const uint32_t* w = input;
13118 const size_t group_increment = m * 4;
13119 const size_t input_increment = n * 3;
13120 const size_t output_increment = 16 - m * n;
13121 const uint32_t* last_input = (const uint32_t*) ((uintptr_t) input + n * (m - 1));
13122 uint32_t* last_output = (uint32_t*) ((uintptr_t) output + (m * 4 - 16));
13123
13124 for (size_t i = 0; i < m; i += 4) {
13125 w = (const uint32_t*) ((uintptr_t) w + input_increment);
13126 if (w >= last_input) {
13127 w = last_input;
13128 }
13129 const uint32_t* z = (const uint32_t*) ((uintptr_t) w - n);
13130 const uint32_t* y = (const uint32_t*) ((uintptr_t) z - n);
13131 const uint32_t* x = (const uint32_t*) ((uintptr_t) y - n);
13132
13133 size_t k = n;
13134 while (k >= 16) {
13135 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
13136 x += 4;
13137 const __m128i vy = _mm_loadu_si128((const __m128i*) y);
13138 y += 4;
13139 const __m128i vz = _mm_loadu_si128((const __m128i*) z);
13140 z += 4;
13141 const __m128i vw = _mm_loadu_si128((const __m128i*) w);
13142 w += 4;
13143
13144 const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy);
13145 const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy);
13146 const __m128i vzw_lo = _mm_unpacklo_epi32(vz, vw);
13147 const __m128i vzw_hi = _mm_unpackhi_epi32(vz, vw);
13148
13149 const __m128i vxyzw0 = _mm_unpacklo_epi64(vxy_lo, vzw_lo);
13150 const __m128i vxyzw1 = _mm_unpackhi_epi64(vxy_lo, vzw_lo);
13151 const __m128i vxyzw2 = _mm_unpacklo_epi64(vxy_hi, vzw_hi);
13152 const __m128i vxyzw3 = _mm_unpackhi_epi64(vxy_hi, vzw_hi);
13153
13154 _mm_storeu_si128((__m128i*) output, vxyzw0);
13155 output = (uint32_t*) ((uintptr_t) output + group_increment);
13156
13157 _mm_storeu_si128((__m128i*) output, vxyzw1);
13158 output = (uint32_t*) ((uintptr_t) output + group_increment);
13159
13160 _mm_storeu_si128((__m128i*) output, vxyzw2);
13161 output = (uint32_t*) ((uintptr_t) output + group_increment);
13162
13163 _mm_storeu_si128((__m128i*) output, vxyzw3);
13164 output = (uint32_t*) ((uintptr_t) output + group_increment);
13165
13166 k -= 16;
13167 }
13168 if XNN_UNLIKELY(k != 0) {
13169 if (k & 8) {
13170 const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
13171 x += 2;
13172 const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
13173 y += 2;
13174 const __m128i vz = _mm_loadl_epi64((const __m128i*) z);
13175 z += 2;
13176 const __m128i vw = _mm_loadl_epi64((const __m128i*) w);
13177 w += 2;
13178
13179 const __m128i vxy = _mm_unpacklo_epi32(vx, vy);
13180 const __m128i vzw = _mm_unpacklo_epi32(vz, vw);
13181
13182 const __m128i vxyzw_lo = _mm_unpacklo_epi64(vxy, vzw);
13183 const __m128i vxyzw_hi = _mm_unpackhi_epi64(vxy, vzw);
13184
13185 _mm_storeu_si128((__m128i*) output, vxyzw_lo);
13186 output = (uint32_t*) ((uintptr_t) output + group_increment);
13187
13188 _mm_storeu_si128((__m128i*) output, vxyzw_hi);
13189 output = (uint32_t*) ((uintptr_t) output + group_increment);
13190 }
13191 if (k & 4) {
13192 const uint32_t vx = *x;
13193 const uint32_t vy = *y;
13194 const uint32_t vz = *z;
13195 const uint32_t vw = *w++;
13196
13197 output[0] = vx;
13198 output[1] = vy;
13199 output[2] = vz;
13200 output[3] = vw;
13201 output = (uint32_t*) ((uintptr_t) output + group_increment);
13202 }
13203 }
13204 output = (uint32_t*) ((uintptr_t) output + output_increment);
13205 if (output > last_output) {
13206 output = last_output;
13207 }
13208 }
13209 }
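// The variable-m zip transposes m streams of n bytes each: element j of
// stream i lands at output[j*m + i]. The SIMD kernel walks the streams four
// at a time and clamps the last group (and the matching output columns) so
// it always has four valid rows, overlapping the previous group when m is
// not a multiple of 4 but still producing the same result. A scalar sketch
// of the overall effect (hypothetical helper):
static inline void x32_zip_xm_scalar_ref(size_t n, size_t m, const uint32_t* input, uint32_t* output) {
  const size_t count = n / sizeof(uint32_t);
  for (size_t i = 0; i < m; i++) {
    for (size_t j = 0; j < count; j++) {
      output[j * m + i] = input[i * count + j];
    }
  }
}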
13210
13211 void xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2(
13212 const uint8_t* input,
13213 uint8_t* output,
13214 size_t input_stride,
13215 size_t output_stride,
13216 size_t block_width,
13217 size_t block_height) XNN_OOB_READS
13218 {
13219 assert(output_stride >= block_height * sizeof(uint8_t));
13220 assert(input_stride >= block_width * sizeof(uint8_t));
13221
13222 const size_t tile_height = 16;
13223 const size_t tile_width = 16;
13224 const size_t tile_hbytes = tile_height * sizeof(uint8_t);
13225 const size_t tile_wbytes = tile_width * sizeof(uint8_t);
13226 const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
13227 const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t) - tile_hbytes;
13228
13229 const uint8_t* i0 = input;
13230 uint8_t* o = (uint8_t*) ((uintptr_t) output - tile_hbytes);
13231 const size_t minus_output_stride = -output_stride;
13232
13233 do {
13234 const size_t rem = min(block_width - 1, 15);
13235 const size_t oN_stride = rem * output_stride;
13236 const size_t oN_offset = oN_stride + tile_hbytes;
13237 size_t bh = block_height;
13238 for (; bh >= 16; bh -= 16) {
13239 const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
13240 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13241 const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i0);
13242 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13243 const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i0);
13244 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13245 const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i0);
13246 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13247 const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i0);
13248 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13249 const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i0);
13250 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13251 const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i0);
13252 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13253 const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i0);
13254 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13255 const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i0);
13256 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13257 const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i0);
13258 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13259 const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i0);
13260 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13261 const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i0);
13262 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13263 const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i0);
13264 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13265 const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i0);
13266 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13267 const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i0);
13268 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13269 const __m128i v4_15 = _mm_loadu_si128((const __m128i*) i0);
13270 i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
13271
13272 const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
13273 const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
13274 const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
13275 const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
13276 const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
13277 const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
13278 const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
13279 const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
13280 const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
13281 const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
13282 const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
13283 const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
13284 const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
13285 const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
13286 const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
13287 const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);
13288
13289 const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
13290 const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
13291 const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
13292 const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
13293 const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
13294 const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
13295 const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
13296 const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
13297 const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
13298 const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
13299 const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
13300 const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
13301 const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
13302 const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
13303 const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
13304 const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);
13305
13306 const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
13307 const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
13308 const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
13309 const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
13310 const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
13311 const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
13312 const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
13313 const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
13314 const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
13315 const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
13316 const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
13317 const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
13318 const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
13319 const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
13320 const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
13321 const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);
13322
13323 const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
13324 const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
13325 const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
13326 const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
13327 const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
13328 const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
13329 const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
13330 const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
13331 const __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
13332 const __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
13333 const __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
13334 const __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
13335 const __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
13336 const __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
13337 const __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
13338 const __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);
13339
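      // Scatter the 16 transposed vectors to the output rows, last lane first.
      // oN_offset positions o at the highest in-range output row for this block;
      // minus_output_stride then steps one row back per lane, but only while
      // block_width exceeds the lane index, so out-of-range lanes store to the
      // same address and are overwritten by the last in-range store.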
13340 o = (uint8_t*) ((uintptr_t) o + oN_offset);
13341 _mm_storeu_si128((__m128i*) o, v0_15);
13342 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13343 if XNN_UNPREDICTABLE(block_width > 15) {
13344 o = oN;
13345 }
13346 _mm_storeu_si128((__m128i*) o, v0_14);
13347 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13348 if XNN_UNPREDICTABLE(block_width >= 15) {
13349 o = oN;
13350 }
13351 _mm_storeu_si128((__m128i*) o, v0_13);
13352 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13353 if XNN_UNPREDICTABLE(block_width > 13) {
13354 o = oN;
13355 }
13356 _mm_storeu_si128((__m128i*) o, v0_12);
13357 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13358 if XNN_UNPREDICTABLE(block_width >= 13) {
13359 o = oN;
13360 }
13361 _mm_storeu_si128((__m128i*) o, v0_11);
13362 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13363 if XNN_UNPREDICTABLE(block_width > 11) {
13364 o = oN;
13365 }
13366 _mm_storeu_si128((__m128i*) o, v0_10);
13367 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13368 if XNN_UNPREDICTABLE(block_width >= 11) {
13369 o = oN;
13370 }
13371 _mm_storeu_si128((__m128i*) o, v0_9);
13372 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13373 if XNN_UNPREDICTABLE(block_width > 9) {
13374 o = oN;
13375 }
13376 _mm_storeu_si128((__m128i*) o, v0_8);
13377 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13378 if XNN_UNPREDICTABLE(block_width >= 9) {
13379 o = oN;
13380 }
13381 _mm_storeu_si128((__m128i*) o, v0_7);
13382 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13383 if XNN_UNPREDICTABLE(block_width > 7) {
13384 o = oN;
13385 }
13386 _mm_storeu_si128((__m128i*) o, v0_6);
13387 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13388 if XNN_UNPREDICTABLE(block_width >= 7) {
13389 o = oN;
13390 }
13391 _mm_storeu_si128((__m128i*) o, v0_5);
13392 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13393 if XNN_UNPREDICTABLE(block_width > 5) {
13394 o = oN;
13395 }
13396 _mm_storeu_si128((__m128i*) o, v0_4);
13397 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13398 if XNN_UNPREDICTABLE(block_width >= 5) {
13399 o = oN;
13400 }
13401 _mm_storeu_si128((__m128i*) o, v0_3);
13402 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13403 if XNN_UNPREDICTABLE(block_width > 3) {
13404 o = oN;
13405 }
13406 _mm_storeu_si128((__m128i*) o, v0_2);
13407 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13408 if XNN_UNPREDICTABLE(block_width >= 3) {
13409 o = oN;
13410 }
13411 _mm_storeu_si128((__m128i*) o, v0_1);
13412 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13413 if XNN_UNPREDICTABLE(block_width > 1) {
13414 o = oN;
13415 }
13416 _mm_storeu_si128((__m128i*) o, v0_0);
13417 }
13418 o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
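    // Handle the remaining 0 < bh < 16 rows of the current block: row pointers
    // past the valid height are clamped to the previous row so every load stays
    // in bounds, and only bh bytes per output row are stored below.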
13419 if (bh != 0) {
13420 const __m128i v4_0 = _mm_loadu_si128((const __m128i*) i0);
13421 const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
13422 if XNN_UNPREDICTABLE(bh < 2) {
13423 i1 = i0;
13424 }
13425 const __m128i v4_1 = _mm_loadu_si128((const __m128i*) i1);
13426 const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
13427 if XNN_UNPREDICTABLE(bh <= 2) {
13428 i2 = i1;
13429 }
13430 const __m128i v4_2 = _mm_loadu_si128((const __m128i*) i2);
13431 const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
13432 if XNN_UNPREDICTABLE(bh < 4) {
13433 i3 = i2;
13434 }
13435 const __m128i v4_3 = _mm_loadu_si128((const __m128i*) i3);
13436 const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
13437 if XNN_UNPREDICTABLE(bh <= 4) {
13438 i4 = i3;
13439 }
13440 const __m128i v4_4 = _mm_loadu_si128((const __m128i*) i4);
13441 const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
13442 if XNN_UNPREDICTABLE(bh < 6) {
13443 i5 = i4;
13444 }
13445 const __m128i v4_5 = _mm_loadu_si128((const __m128i*) i5);
13446 const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
13447 if XNN_UNPREDICTABLE(bh <= 6) {
13448 i6 = i5;
13449 }
13450 const __m128i v4_6 = _mm_loadu_si128((const __m128i*) i6);
13451 const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
13452 if XNN_UNPREDICTABLE(bh < 8) {
13453 i7 = i6;
13454 }
13455 const __m128i v4_7 = _mm_loadu_si128((const __m128i*) i7);
13456 const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
13457 if XNN_UNPREDICTABLE(bh <= 8) {
13458 i8 = i7;
13459 }
13460 const __m128i v4_8 = _mm_loadu_si128((const __m128i*) i8);
13461 const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
13462 if XNN_UNPREDICTABLE(bh < 10) {
13463 i9 = i8;
13464 }
13465 const __m128i v4_9 = _mm_loadu_si128((const __m128i*) i9);
13466 const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
13467 if XNN_UNPREDICTABLE(bh <= 10) {
13468 i10 = i9;
13469 }
13470 const __m128i v4_10 = _mm_loadu_si128((const __m128i*) i10);
13471 const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
13472 if XNN_UNPREDICTABLE(bh < 12) {
13473 i11 = i10;
13474 }
13475 const __m128i v4_11 = _mm_loadu_si128((const __m128i*) i11);
13476 const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
13477 if XNN_UNPREDICTABLE(bh <= 12) {
13478 i12 = i11;
13479 }
13480 const __m128i v4_12 = _mm_loadu_si128((const __m128i*) i12);
13481 const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
13482 if XNN_UNPREDICTABLE(bh < 14) {
13483 i13 = i12;
13484 }
13485 const __m128i v4_13 = _mm_loadu_si128((const __m128i*) i13);
13486 const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
13487 if XNN_UNPREDICTABLE(bh <= 14) {
13488 i14 = i13;
13489 }
13490 const __m128i v4_14 = _mm_loadu_si128((const __m128i*) i14);
13491 const __m128i v4_15 = _mm_undefined_si128();
13492
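      // 16x16 byte transpose as four interleave stages (8-, 16-, 32-, then
      // 64-bit unpacks); v4_15 is a placeholder for the missing 16th row and
      // its bytes only reach lane 15, which is never stored because bh < 16.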
13493 const __m128i v3_0 = _mm_unpacklo_epi8(v4_0, v4_1);
13494 const __m128i v3_1 = _mm_unpackhi_epi8(v4_0, v4_1);
13495 const __m128i v3_2 = _mm_unpacklo_epi8(v4_2, v4_3);
13496 const __m128i v3_3 = _mm_unpackhi_epi8(v4_2, v4_3);
13497 const __m128i v3_4 = _mm_unpacklo_epi8(v4_4, v4_5);
13498 const __m128i v3_5 = _mm_unpackhi_epi8(v4_4, v4_5);
13499 const __m128i v3_6 = _mm_unpacklo_epi8(v4_6, v4_7);
13500 const __m128i v3_7 = _mm_unpackhi_epi8(v4_6, v4_7);
13501 const __m128i v3_8 = _mm_unpacklo_epi8(v4_8, v4_9);
13502 const __m128i v3_9 = _mm_unpackhi_epi8(v4_8, v4_9);
13503 const __m128i v3_10 = _mm_unpacklo_epi8(v4_10, v4_11);
13504 const __m128i v3_11 = _mm_unpackhi_epi8(v4_10, v4_11);
13505 const __m128i v3_12 = _mm_unpacklo_epi8(v4_12, v4_13);
13506 const __m128i v3_13 = _mm_unpackhi_epi8(v4_12, v4_13);
13507 const __m128i v3_14 = _mm_unpacklo_epi8(v4_14, v4_15);
13508 const __m128i v3_15 = _mm_unpackhi_epi8(v4_14, v4_15);
13509
13510 const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_2);
13511 const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_2);
13512 const __m128i v2_2 = _mm_unpacklo_epi16(v3_1, v3_3);
13513 const __m128i v2_3 = _mm_unpackhi_epi16(v3_1, v3_3);
13514 const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_6);
13515 const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_6);
13516 const __m128i v2_6 = _mm_unpacklo_epi16(v3_5, v3_7);
13517 const __m128i v2_7 = _mm_unpackhi_epi16(v3_5, v3_7);
13518 const __m128i v2_8 = _mm_unpacklo_epi16(v3_8, v3_10);
13519 const __m128i v2_9 = _mm_unpackhi_epi16(v3_8, v3_10);
13520 const __m128i v2_10 = _mm_unpacklo_epi16(v3_9, v3_11);
13521 const __m128i v2_11 = _mm_unpackhi_epi16(v3_9, v3_11);
13522 const __m128i v2_12 = _mm_unpacklo_epi16(v3_12, v3_14);
13523 const __m128i v2_13 = _mm_unpackhi_epi16(v3_12, v3_14);
13524 const __m128i v2_14 = _mm_unpacklo_epi16(v3_13, v3_15);
13525 const __m128i v2_15 = _mm_unpackhi_epi16(v3_13, v3_15);
13526
13527 const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_4);
13528 const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_4);
13529 const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_5);
13530 const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_5);
13531 const __m128i v1_4 = _mm_unpacklo_epi32(v2_2, v2_6);
13532 const __m128i v1_5 = _mm_unpackhi_epi32(v2_2, v2_6);
13533 const __m128i v1_6 = _mm_unpacklo_epi32(v2_3, v2_7);
13534 const __m128i v1_7 = _mm_unpackhi_epi32(v2_3, v2_7);
13535 const __m128i v1_8 = _mm_unpacklo_epi32(v2_8, v2_12);
13536 const __m128i v1_9 = _mm_unpackhi_epi32(v2_8, v2_12);
13537 const __m128i v1_10 = _mm_unpacklo_epi32(v2_9, v2_13);
13538 const __m128i v1_11 = _mm_unpackhi_epi32(v2_9, v2_13);
13539 const __m128i v1_12 = _mm_unpacklo_epi32(v2_10, v2_14);
13540 const __m128i v1_13 = _mm_unpackhi_epi32(v2_10, v2_14);
13541 const __m128i v1_14 = _mm_unpacklo_epi32(v2_11, v2_15);
13542 const __m128i v1_15 = _mm_unpackhi_epi32(v2_11, v2_15);
13543
13544 __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_8);
13545 __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_8);
13546 __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_9);
13547 __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_9);
13548 __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_10);
13549 __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_10);
13550 __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_11);
13551 __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_11);
13552 __m128i v0_8 = _mm_unpacklo_epi64(v1_4, v1_12);
13553 __m128i v0_9 = _mm_unpackhi_epi64(v1_4, v1_12);
13554 __m128i v0_10 = _mm_unpacklo_epi64(v1_5, v1_13);
13555 __m128i v0_11 = _mm_unpackhi_epi64(v1_5, v1_13);
13556 __m128i v0_12 = _mm_unpacklo_epi64(v1_6, v1_14);
13557 __m128i v0_13 = _mm_unpackhi_epi64(v1_6, v1_14);
13558 __m128i v0_14 = _mm_unpacklo_epi64(v1_7, v1_15);
13559 __m128i v0_15 = _mm_unpackhi_epi64(v1_7, v1_15);
13560
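      // Store bh bytes per output row: 8 bytes if bh & 8, then 4, 2, and 1,
      // shifting each vector down after a pass so the next pass sees the
      // following input rows.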
13561 if (bh & 8) {
13562 o = (uint8_t*) ((uintptr_t) o + oN_stride);
13563 _mm_storel_epi64((__m128i*) o, v0_15);
13564 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13565 if XNN_UNPREDICTABLE(block_width > 15) {
13566 o = oN;
13567 }
13568 _mm_storel_epi64((__m128i*) o, v0_14);
13569 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13570 if XNN_UNPREDICTABLE(block_width >= 15) {
13571 o = oN;
13572 }
13573 _mm_storel_epi64((__m128i*) o, v0_13);
13574 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13575 if XNN_UNPREDICTABLE(block_width > 13) {
13576 o = oN;
13577 }
13578 _mm_storel_epi64((__m128i*) o, v0_12);
13579 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13580 if XNN_UNPREDICTABLE(block_width >= 13) {
13581 o = oN;
13582 }
13583 _mm_storel_epi64((__m128i*) o, v0_11);
13584 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13585 if XNN_UNPREDICTABLE(block_width > 11) {
13586 o = oN;
13587 }
13588 _mm_storel_epi64((__m128i*) o, v0_10);
13589 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13590 if XNN_UNPREDICTABLE(block_width >= 11) {
13591 o = oN;
13592 }
13593 _mm_storel_epi64((__m128i*) o, v0_9);
13594 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13595 if XNN_UNPREDICTABLE(block_width > 9) {
13596 o = oN;
13597 }
13598 _mm_storel_epi64((__m128i*) o, v0_8);
13599 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13600 if XNN_UNPREDICTABLE(block_width >= 9) {
13601 o = oN;
13602 }
13603 _mm_storel_epi64((__m128i*) o, v0_7);
13604 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13605 if XNN_UNPREDICTABLE(block_width > 7) {
13606 o = oN;
13607 }
13608 _mm_storel_epi64((__m128i*) o, v0_6);
13609 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13610 if XNN_UNPREDICTABLE(block_width >= 7) {
13611 o = oN;
13612 }
13613 _mm_storel_epi64((__m128i*) o, v0_5);
13614 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13615 if XNN_UNPREDICTABLE(block_width > 5) {
13616 o = oN;
13617 }
13618 _mm_storel_epi64((__m128i*) o, v0_4);
13619 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13620 if XNN_UNPREDICTABLE(block_width >= 5) {
13621 o = oN;
13622 }
13623 _mm_storel_epi64((__m128i*) o, v0_3);
13624 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13625 if XNN_UNPREDICTABLE(block_width > 3) {
13626 o = oN;
13627 }
13628 _mm_storel_epi64((__m128i*) o, v0_2);
13629 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13630 if XNN_UNPREDICTABLE(block_width >= 3) {
13631 o = oN;
13632 }
13633 _mm_storel_epi64((__m128i*) o, v0_1);
13634 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13635 if XNN_UNPREDICTABLE(block_width > 1) {
13636 o = oN;
13637 }
13638 _mm_storel_epi64((__m128i*) o, v0_0);
13639 o += 8;
13640 v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
13641 v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
13642 v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
13643 v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
13644 v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
13645 v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
13646 v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
13647 v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
13648 v0_8 = _mm_unpackhi_epi64(v0_8, v0_8);
13649 v0_9 = _mm_unpackhi_epi64(v0_9, v0_9);
13650 v0_10 = _mm_unpackhi_epi64(v0_10, v0_10);
13651 v0_11 = _mm_unpackhi_epi64(v0_11, v0_11);
13652 v0_12 = _mm_unpackhi_epi64(v0_12, v0_12);
13653 v0_13 = _mm_unpackhi_epi64(v0_13, v0_13);
13654 v0_14 = _mm_unpackhi_epi64(v0_14, v0_14);
13655 v0_15 = _mm_unpackhi_epi64(v0_15, v0_15);
13656 }
13657
13658 if (bh & 4) {
13659 o = (uint8_t*) ((uintptr_t) o + oN_stride);
13660 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_15));
13661 uint8_t *oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13662 if XNN_UNPREDICTABLE(block_width > 15) {
13663 o = oN;
13664 }
13665 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_14));
13666 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13667 if XNN_UNPREDICTABLE(block_width >= 15) {
13668 o = oN;
13669 }
13670 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_13));
13671 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13672 if XNN_UNPREDICTABLE(block_width > 13) {
13673 o = oN;
13674 }
13675 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_12));
13676 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13677 if XNN_UNPREDICTABLE(block_width >= 13) {
13678 o = oN;
13679 }
13680 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_11));
13681 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13682 if XNN_UNPREDICTABLE(block_width > 11) {
13683 o = oN;
13684 }
13685 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_10));
13686 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13687 if XNN_UNPREDICTABLE(block_width >= 11) {
13688 o = oN;
13689 }
13690 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_9));
13691 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13692 if XNN_UNPREDICTABLE(block_width > 9) {
13693 o = oN;
13694 }
13695 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_8));
13696 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13697 if XNN_UNPREDICTABLE(block_width >= 9) {
13698 o = oN;
13699 }
13700 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_7));
13701 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13702 if XNN_UNPREDICTABLE(block_width > 7) {
13703 o = oN;
13704 }
13705 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_6));
13706 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13707 if XNN_UNPREDICTABLE(block_width >= 7) {
13708 o = oN;
13709 }
13710 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_5));
13711 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13712 if XNN_UNPREDICTABLE(block_width > 5) {
13713 o = oN;
13714 }
13715 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_4));
13716 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13717 if XNN_UNPREDICTABLE(block_width >= 5) {
13718 o = oN;
13719 }
13720 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_3));
13721 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13722 if XNN_UNPREDICTABLE(block_width > 3) {
13723 o = oN;
13724 }
13725 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_2));
13726 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13727 if XNN_UNPREDICTABLE(block_width >= 3) {
13728 o = oN;
13729 }
13730 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_1));
13731 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13732 if XNN_UNPREDICTABLE(block_width > 1) {
13733 o = oN;
13734 }
13735 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(v0_0));
13736 o += 4;
13737 v0_0 = _mm_srli_epi64(v0_0, 32);
13738 v0_1 = _mm_srli_epi64(v0_1, 32);
13739 v0_2 = _mm_srli_epi64(v0_2, 32);
13740 v0_3 = _mm_srli_epi64(v0_3, 32);
13741 v0_4 = _mm_srli_epi64(v0_4, 32);
13742 v0_5 = _mm_srli_epi64(v0_5, 32);
13743 v0_6 = _mm_srli_epi64(v0_6, 32);
13744 v0_7 = _mm_srli_epi64(v0_7, 32);
13745 v0_8 = _mm_srli_epi64(v0_8, 32);
13746 v0_9 = _mm_srli_epi64(v0_9, 32);
13747 v0_10 = _mm_srli_epi64(v0_10, 32);
13748 v0_11 = _mm_srli_epi64(v0_11, 32);
13749 v0_12 = _mm_srli_epi64(v0_12, 32);
13750 v0_13 = _mm_srli_epi64(v0_13, 32);
13751 v0_14 = _mm_srli_epi64(v0_14, 32);
13752 v0_15 = _mm_srli_epi64(v0_15, 32);
13753 }
13754 if (bh & 2) {
13755 o = (uint8_t*) ((uintptr_t) o + oN_stride);
13756 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_15));
13757 uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13758 if XNN_UNPREDICTABLE(block_width > 15) {
13759 o = oN;
13760 }
13761 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_14));
13762 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13763 if XNN_UNPREDICTABLE(block_width >= 15) {
13764 o = oN;
13765 }
13766 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_13));
13767 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13768 if XNN_UNPREDICTABLE(block_width > 13) {
13769 o = oN;
13770 }
13771 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_12));
13772 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13773 if XNN_UNPREDICTABLE(block_width >= 13) {
13774 o = oN;
13775 }
13776 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_11));
13777 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13778 if XNN_UNPREDICTABLE(block_width > 11) {
13779 o = oN;
13780 }
13781 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_10));
13782 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13783 if XNN_UNPREDICTABLE(block_width >= 11) {
13784 o = oN;
13785 }
13786 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_9));
13787 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13788 if XNN_UNPREDICTABLE(block_width > 9) {
13789 o = oN;
13790 }
13791 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_8));
13792 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13793 if XNN_UNPREDICTABLE(block_width >= 9) {
13794 o = oN;
13795 }
13796 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_7));
13797 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13798 if XNN_UNPREDICTABLE(block_width > 7) {
13799 o = oN;
13800 }
13801 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_6));
13802 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13803 if XNN_UNPREDICTABLE(block_width >= 7) {
13804 o = oN;
13805 }
13806 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_5));
13807 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13808 if XNN_UNPREDICTABLE(block_width > 5) {
13809 o = oN;
13810 }
13811 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_4));
13812 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13813 if XNN_UNPREDICTABLE(block_width >= 5) {
13814 o = oN;
13815 }
13816 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_3));
13817 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13818 if XNN_UNPREDICTABLE(block_width > 3) {
13819 o = oN;
13820 }
13821 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_2));
13822 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13823 if XNN_UNPREDICTABLE(block_width >= 3) {
13824 o = oN;
13825 }
13826 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_1));
13827 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13828 if XNN_UNPREDICTABLE(block_width > 1) {
13829 o = oN;
13830 }
13831 unaligned_store_u16(o, (uint16_t) _mm_cvtsi128_si32(v0_0));
13832 o += 2;
13833 v0_0 = _mm_srli_epi32(v0_0, 16);
13834 v0_1 = _mm_srli_epi32(v0_1, 16);
13835 v0_2 = _mm_srli_epi32(v0_2, 16);
13836 v0_3 = _mm_srli_epi32(v0_3, 16);
13837 v0_4 = _mm_srli_epi32(v0_4, 16);
13838 v0_5 = _mm_srli_epi32(v0_5, 16);
13839 v0_6 = _mm_srli_epi32(v0_6, 16);
13840 v0_7 = _mm_srli_epi32(v0_7, 16);
13841 v0_8 = _mm_srli_epi32(v0_8, 16);
13842 v0_9 = _mm_srli_epi32(v0_9, 16);
13843 v0_10 = _mm_srli_epi32(v0_10, 16);
13844 v0_11 = _mm_srli_epi32(v0_11, 16);
13845 v0_12 = _mm_srli_epi32(v0_12, 16);
13846 v0_13 = _mm_srli_epi32(v0_13, 16);
13847 v0_14 = _mm_srli_epi32(v0_14, 16);
13848 v0_15 = _mm_srli_epi32(v0_15, 16);
13849 }
13850 if (bh & 1) {
13851 o = (uint8_t*) ((uintptr_t) o + oN_stride);
13852 *o = (uint8_t) _mm_cvtsi128_si32(v0_15);
13853 uint8_t* oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13854 if XNN_UNPREDICTABLE(block_width > 15) {
13855 o = oN;
13856 }
13857 *o = (uint8_t) _mm_cvtsi128_si32(v0_14);
13858 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13859 if XNN_UNPREDICTABLE(block_width >= 15) {
13860 o = oN;
13861 }
13862 *o = (uint8_t) _mm_cvtsi128_si32(v0_13);
13863 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13864 if XNN_UNPREDICTABLE(block_width > 13) {
13865 o = oN;
13866 }
13867 *o = (uint8_t) _mm_cvtsi128_si32(v0_12);
13868 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13869 if XNN_UNPREDICTABLE(block_width >= 13) {
13870 o = oN;
13871 }
13872 *o = (uint8_t) _mm_cvtsi128_si32(v0_11);
13873 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13874 if XNN_UNPREDICTABLE(block_width > 11) {
13875 o = oN;
13876 }
13877 *o = (uint8_t) _mm_cvtsi128_si32(v0_10);
13878 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13879 if XNN_UNPREDICTABLE(block_width >= 11) {
13880 o = oN;
13881 }
13882 *o = (uint8_t) _mm_cvtsi128_si32(v0_9);
13883 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13884 if XNN_UNPREDICTABLE(block_width > 9) {
13885 o = oN;
13886 }
13887 *o = (uint8_t) _mm_cvtsi128_si32(v0_8);
13888 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13889 if XNN_UNPREDICTABLE(block_width >= 9) {
13890 o = oN;
13891 }
13892 *o = (uint8_t) _mm_cvtsi128_si32(v0_7);
13893 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13894 if XNN_UNPREDICTABLE(block_width > 7) {
13895 o = oN;
13896 }
13897 *o = (uint8_t) _mm_cvtsi128_si32(v0_6);
13898 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13899 if XNN_UNPREDICTABLE(block_width >= 7) {
13900 o = oN;
13901 }
13902 *o = (uint8_t) _mm_cvtsi128_si32(v0_5);
13903 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13904 if XNN_UNPREDICTABLE(block_width > 5) {
13905 o = oN;
13906 }
13907 *o = (uint8_t) _mm_cvtsi128_si32(v0_4);
13908 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13909 if XNN_UNPREDICTABLE(block_width >= 5) {
13910 o = oN;
13911 }
13912 *o = (uint8_t) _mm_cvtsi128_si32(v0_3);
13913 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13914 if XNN_UNPREDICTABLE(block_width > 3) {
13915 o = oN;
13916 }
13917 *o = (uint8_t) _mm_cvtsi128_si32(v0_2);
13918 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13919 if XNN_UNPREDICTABLE(block_width >= 3) {
13920 o = oN;
13921 }
13922 *o = (uint8_t) _mm_cvtsi128_si32(v0_1);
13923 oN = (uint8_t*) ((uintptr_t) o + minus_output_stride);
13924 if XNN_UNPREDICTABLE(block_width > 1) {
13925 o = oN;
13926 }
13927 *o = (uint8_t) _mm_cvtsi128_si32(v0_0);
13928 }
13929 }
13930
13931 i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
13932 o = (uint8_t*) ((uintptr_t) o + output_reset);
13933 block_width = doz(block_width, tile_width);
13934 } while (block_width != 0);
13935 }
13936
13937 void xnn_x8_zip_x2_ukernel__sse2(
13938 size_t n,
13939 const uint8_t* input,
13940 uint8_t* output)
13941 {
13942 const uint8_t* x = input;
13943 const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
13944 uint8_t* o = output;
13945
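  // x and y are the two n-byte halves of `input` laid out back to back.
  // The vector path interleaves 16 bytes of each per iteration; a tail of
  // fewer than 16 bytes is handled by re-reading the last 16 bytes of each
  // half and rewriting the overlapping output.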
13946 if (n >= 16) {
13947 do {
13948 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
13949 x += 16;
13950 const __m128i vy = _mm_loadu_si128((const __m128i*) y);
13951 y += 16;
13952 const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
13953 const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
13954 _mm_storeu_si128((__m128i*) o, vxy_lo);
13955 _mm_storeu_si128((__m128i*) (o + 16), vxy_hi);
13956 o = (void*) ((uintptr_t) o + 32);
13957 n -= 16;
13958 } while (n >= 16);
13959 if (n != 0) {
13960 const size_t address_increment = n - 16;
13961 const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
13962 const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
13963 const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
13964 const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
13965 o = (void*) ((uintptr_t) o + address_increment * 2);
13966 _mm_storeu_si128((__m128i*) o, vxy_lo);
13967 _mm_storeu_si128((__m128i*) o + 1, vxy_hi);
13968 }
13969 } else {
13970 do {
13971 const uint8_t vx = *x++;
13972 const uint8_t vy = *y++;
13973 o[0] = vx;
13974 o[1] = vy;
13975 o += 2;
13976 } while (--n != 0);
13977 }
13978 }
13979
13980 void xnn_x8_zip_x3_ukernel__sse2(
13981 size_t n,
13982 const uint8_t* input,
13983 uint8_t* output)
13984 {
13985 const uint8_t* x = input;
13986 const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
13987 const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
13988 uint8_t* o = output;
13989
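  // SSE2 has no 3-way byte interleave, so the vector path assembles (x, y, z)
  // triples with masked ORs over even/odd bytes, a second round of 16-bit
  // merges, and _mm_shuffle_ps reorderings that yield three 16-byte vectors
  // (48 output bytes) per 16 input bytes of each stream.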
13990 if (n >= 16) {
13991 const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF);
13992 const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF);
13993 do {
13994 // vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 )
13995 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
13996 x += 16;
13997 // vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 )
13998 const __m128i vy = _mm_loadu_si128((const __m128i*) y);
13999 y += 16;
14000 // vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 )
14001 const __m128i vz = _mm_loadu_si128((const __m128i*) z);
14002 z += 16;
14003
14004 // vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 )
14005 const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
14006 // vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 )
14007 const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
14008       // vzexo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 )
14009 const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));
14010
14011 // vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 )
14012 const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
14013 // vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 )
14014 const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
14015 // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 )
14016 const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));
14017
14018 // vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 )
14019 const __m128i vtemp0 = _mm_castps_si128(
14020 _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
14021 // vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 )
14022 const __m128i vtemp1 = _mm_castps_si128(
14023 _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
14024 // vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 )
14025 const __m128i vtemp2 = _mm_castps_si128(
14026 _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));
14027
14028 // vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 )
14029 const __m128i vxyz0 = _mm_castps_si128(
14030 _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
14031 // vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 )
14032 const __m128i vxyz1 = _mm_castps_si128(
14033 _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
14034 // vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
14035 const __m128i vxyz2 = _mm_castps_si128(
14036 _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));
14037
14038 _mm_storeu_si128((__m128i*) o, vxyz0);
14039 _mm_storeu_si128((__m128i*) o + 1, vxyz1);
14040 _mm_storeu_si128((__m128i*) o + 2, vxyz2);
14041 o += 48;
14042 n -= 16;
14043 } while (n >= 16);
14044 if (n != 0) {
14045 const size_t address_increment = n - 16;
14046 // vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 )
14047 const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
14048 // vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 )
14049 const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
14050 // vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 )
14051 const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment));
14052
14053 // vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 )
14054 const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
14055 // vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 )
14056 const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
14057       // vzexo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 )
14058 const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));
14059
14060 // vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 )
14061 const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
14062 // vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 )
14063 const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
14064 // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 )
14065 const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));
14066
14067 // vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 )
14068 const __m128i vtemp0 = _mm_castps_si128(
14069 _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
14070 // vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 )
14071 const __m128i vtemp1 = _mm_castps_si128(
14072 _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
14073 // vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 )
14074 const __m128i vtemp2 = _mm_castps_si128(
14075 _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));
14076
14077 // vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 )
14078 const __m128i vxyz0 = _mm_castps_si128(
14079 _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
14080 // vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 )
14081 const __m128i vxyz1 = _mm_castps_si128(
14082 _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
14083 // vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
14084 const __m128i vxyz2 = _mm_castps_si128(
14085 _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));
14086
14087 o = (uint8_t*) ((uintptr_t) o + address_increment * 3);
14088 _mm_storeu_si128((__m128i*) o, vxyz0);
14089 _mm_storeu_si128((__m128i*) o + 1, vxyz1);
14090 _mm_storeu_si128((__m128i*) o + 2, vxyz2);
14091 }
14092 } else {
14093 do {
14094 const uint8_t vx = *x++;
14095 const uint8_t vy = *y++;
14096 const uint8_t vz = *z++;
14097 o[0] = vx;
14098 o[1] = vy;
14099 o[2] = vz;
14100 o += 3;
14101 } while (--n != 0);
14102 }
14103 }
14104
14105 void xnn_x8_zip_x4_ukernel__sse2(
14106 size_t n,
14107 const uint8_t* input,
14108 uint8_t* output)
14109 {
14110 const uint8_t* x = input;
14111 const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
14112 const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
14113 const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n);
14114 uint8_t* o = output;
14115
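  // Four streams interleave with two unpack stages: bytes of (x, y) and
  // (z, w) are paired first, then the 16-bit pairs are merged into
  // (x, y, z, w) quadruples.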
14116 if (n >= 16) {
14117 do {
14118 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
14119 x += 16;
14120 const __m128i vy = _mm_loadu_si128((const __m128i*) y);
14121 y += 16;
14122 const __m128i vz = _mm_loadu_si128((const __m128i*) z);
14123 z += 16;
14124 const __m128i vw = _mm_loadu_si128((const __m128i*) w);
14125 w += 16;
14126 const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
14127 const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
14128 const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw);
14129 const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw);
14130 const __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo);
14131 const __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo);
14132 const __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi);
14133 const __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi);
14134 _mm_storeu_si128((__m128i*) o, vxyzw0);
14135 _mm_storeu_si128((__m128i*) o + 1, vxyzw1);
14136 _mm_storeu_si128((__m128i*) o + 2, vxyzw2);
14137 _mm_storeu_si128((__m128i*) o + 3, vxyzw3);
14138 o = (void*) ((uintptr_t) o + 64);
14139 n -= 16;
14140 } while (n >= 16);
14141 if (n != 0) {
14142 const size_t address_increment = n - 16;
14143 const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
14144 const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
14145 const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment));
14146 const __m128i vw = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + address_increment));
14147 const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
14148 const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
14149 const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw);
14150 const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw);
14151 const __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo);
14152 const __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo);
14153 const __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi);
14154 const __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi);
14155 o = (void*) ((uintptr_t) o + address_increment * 4);
14156 _mm_storeu_si128((__m128i*) o, vxyzw0);
14157 _mm_storeu_si128((__m128i*) o + 1, vxyzw1);
14158 _mm_storeu_si128((__m128i*) o + 2, vxyzw2);
14159 _mm_storeu_si128((__m128i*) o + 3, vxyzw3);
14160 }
14161 } else {
14162 do {
14163 const uint8_t vx = *x++;
14164 const uint8_t vy = *y++;
14165 const uint8_t vz = *z++;
14166 const uint8_t vw = *w++;
14167 o[0] = vx;
14168 o[1] = vy;
14169 o[2] = vz;
14170 o[3] = vw;
14171 o += 4;
14172 } while (--n != 0);
14173 }
14174 }
14175
14176 void xnn_x8_zip_xm_ukernel__sse2(
14177 size_t n,
14178 size_t m,
14179 const uint8_t* input,
14180 uint8_t* output)
14181 {
14182 const uint8_t* w = input;
14183 const size_t input_increment = n * 3;
14184 const size_t output_increment = 4 - m * n;
14185 const uint8_t* last_input = w + n * (m - 1);
14186 uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4));
14187
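  // Rows are zipped four at a time; w is clamped to last_input and the output
  // pointer to last_output, so when m is not a multiple of 4 the final group
  // overlaps rows that were already written.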
14188 if (n >= 8) {
14189 for (size_t i = 0; i < m; i += 4) {
14190 size_t k = n;
14191 w = (const uint8_t*) ((uintptr_t) w + input_increment);
14192 if (w >= last_input) {
14193 w = last_input;
14194 }
14195 const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n);
14196 const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n);
14197 const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n);
14198 while (k >= 16) {
14199 const __m128i vx = _mm_loadu_si128((const __m128i*) x);
14200 x += 16;
14201 const __m128i vy = _mm_loadu_si128((const __m128i*) y);
14202 y += 16;
14203 const __m128i vz = _mm_loadu_si128((const __m128i*) z);
14204 z += 16;
14205 const __m128i vw = _mm_loadu_si128((const __m128i*) w);
14206 w += 16;
14207 const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
14208 const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
14209 const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw);
14210 const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw);
14211 __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo);
14212 __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo);
14213 __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi);
14214 __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi);
14215
14216 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14217 output = (uint8_t*) ((uintptr_t) output + m);
14218 vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
14219 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14220 output = (uint8_t*) ((uintptr_t) output + m);
14221 vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
14222 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14223 output = (uint8_t*) ((uintptr_t) output + m);
14224 vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
14225 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14226 output = (uint8_t*) ((uintptr_t) output + m);
14227
14228 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14229 output = (uint8_t*) ((uintptr_t) output + m);
14230 vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
14231 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14232 output = (uint8_t*) ((uintptr_t) output + m);
14233 vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1);
14234 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14235 output = (uint8_t*) ((uintptr_t) output + m);
14236 vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
14237 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14238 output = (uint8_t*) ((uintptr_t) output + m);
14239
14240 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
14241 output = (uint8_t*) ((uintptr_t) output + m);
14242 vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2));
14243 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
14244 output = (uint8_t*) ((uintptr_t) output + m);
14245 vxyzw2 = _mm_unpackhi_epi64(vxyzw2, vxyzw2);
14246 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
14247 output = (uint8_t*) ((uintptr_t) output + m);
14248 vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2));
14249 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
14250 output = (uint8_t*) ((uintptr_t) output + m);
14251
14252 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
14253 output = (uint8_t*) ((uintptr_t) output + m);
14254 vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2));
14255 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
14256 output = (uint8_t*) ((uintptr_t) output + m);
14257 vxyzw3 = _mm_unpackhi_epi64(vxyzw3, vxyzw3);
14258 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
14259 output = (uint8_t*) ((uintptr_t) output + m);
14260 vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2));
14261 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
14262 output = (uint8_t*) ((uintptr_t) output + m);
14263 k -= 16;
14264       }
14265 if (k >= 8) {
14266 const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
14267 x += 8;
14268 const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
14269 y += 8;
14270 const __m128i vz = _mm_loadl_epi64((const __m128i*) z);
14271 z += 8;
14272 const __m128i vw = _mm_loadl_epi64((const __m128i*) w);
14273 w += 8;
14274 const __m128i vxy = _mm_unpacklo_epi8(vx, vy);
14275 const __m128i vzw = _mm_unpacklo_epi8(vz, vw);
14276 __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw);
14277 __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw);
14278
14279 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14280 output = (uint8_t*) ((uintptr_t) output + m);
14281 vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
14282 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14283 output = (uint8_t*) ((uintptr_t) output + m);
14284 vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
14285 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14286 output = (uint8_t*) ((uintptr_t) output + m);
14287 vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
14288 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14289 output = (uint8_t*) ((uintptr_t) output + m);
14290
14291 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14292 output = (uint8_t*) ((uintptr_t) output + m);
14293 vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
14294 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14295 output = (uint8_t*) ((uintptr_t) output + m);
14296 vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1);
14297 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14298 output = (uint8_t*) ((uintptr_t) output + m);
14299 vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
14300 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
14301 output = (uint8_t*) ((uintptr_t) output + m);
14302 k -= 8;
14303 }
14304 if (k != 0) {
14305 const size_t address_decrement = 8 - k;
14306 x -= address_decrement;
14307 y -= address_decrement;
14308 z -= address_decrement;
14309 w -= address_decrement;
14310 const __m128i vshift = _mm_cvtsi32_si128((int) address_decrement * 8);
14311
14312 const __m128i vx = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) x), vshift);
14313 const __m128i vy = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) y), vshift);
14314 const __m128i vz = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) z), vshift);
14315 const __m128i vw = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) w), vshift);
14316 w += 8;
14317 const __m128i vxy = _mm_unpacklo_epi8(vx, vy);
14318 const __m128i vzw = _mm_unpacklo_epi8(vz, vw);
14319 __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw);
14320 __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw);
14321
14322 if (k & 4) {
14323 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14324 output = (uint8_t*) ((uintptr_t) output + m);
14325 vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
14326 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14327 output = (uint8_t*) ((uintptr_t) output + m);
14328 vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
14329 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14330 output = (uint8_t*) ((uintptr_t) output + m);
14331 vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
14332 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14333 output = (uint8_t*) ((uintptr_t) output + m);
14334 vxyzw0 = vxyzw1;
14335 }
14336
14337 if (k & 2) {
14338 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14339 output = (uint8_t*) ((uintptr_t) output + m);
14340 vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
14341 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14342 output = (uint8_t*) ((uintptr_t) output + m);
14343 vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
14344 }
14345 if (k & 1) {
14346 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
14347 output = (uint8_t*) ((uintptr_t) output + m);
14348 }
14349 }
14350 output = (uint8_t*) ((uintptr_t) output + output_increment);
14351 if (output > last_output) {
14352 output = last_output;
14353 }
14354 }
14355 } else {
14356 const uint8_t* i = input;
14357 uint8_t* o = output;
14358 size_t k = n;
14359 do {
14360 size_t l = m;
14361 const uint8_t* ii = i++;
14362 do {
14363 *o++ = *ii;
14364 ii += n;
14365 } while (--l != 0);
14366 } while (--k != 0);
14367 }
14368 }
14369
14370 void xnn_xx_fill_ukernel__sse2_x64(
14371 size_t rows,
14372 size_t channels,
14373 void* output,
14374 size_t output_stride,
14375 const uint32_t fill_pattern)
14376 {
14377 assert(rows != 0);
14378 assert(channels != 0);
14379
14380 const size_t output_increment = output_stride - channels;
14381
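  // Broadcast the 32-bit fill pattern and store 64 bytes per iteration, then
  // 16, then an 8/4/2/1-byte tail that walks through the pattern bytes.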
14382   const __m128i vfill = _mm_shuffle_epi32(_mm_cvtsi32_si128((int) fill_pattern), _MM_SHUFFLE(0, 0, 0, 0));
14383 do {
14384 size_t c = channels;
14385 for (; c >= 64 * sizeof(uint8_t); c -= 64 * sizeof(uint8_t)) {
14386 _mm_storeu_si128((__m128i*) output, vfill);
14387 _mm_storeu_si128((__m128i*) output + 1, vfill);
14388 _mm_storeu_si128((__m128i*) output + 2, vfill);
14389 _mm_storeu_si128((__m128i*) output + 3, vfill);
14390 output = ((uint8_t*) output + 64);
14391 }
14392 for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
14393 _mm_storeu_si128((__m128i*) output, vfill);
14394 output = ((uint8_t*) output + 16);
14395 }
14396 if XNN_UNLIKELY(c != 0) {
14397 if XNN_LIKELY(c & (8 * sizeof(uint8_t))) {
14398         _mm_storel_epi64((__m128i*) output, vfill);
14399 output = ((uint8_t*) output + 8);
14400 }
14401 if XNN_LIKELY(c & (4 * sizeof(uint8_t))) {
14402 unaligned_store_u32(output, fill_pattern);
14403 output = ((uint8_t*) output + 4);
14404 }
14405 uint32_t vfill_subpattern = fill_pattern;
14406 if XNN_LIKELY(c & (2 * sizeof(uint8_t))) {
14407 unaligned_store_u16(output, (uint16_t) vfill_subpattern);
14408 vfill_subpattern >>= 16;
14409 output = ((uint8_t*) output + 2);
14410 }
14411 if XNN_LIKELY(c & (1 * sizeof(uint8_t))) {
14412 *((uint8_t*) output) = (uint8_t) vfill_subpattern;
14413 output = ((uint8_t*) output + 1);
14414 }
14415 }
14416 output = (void*) ((uintptr_t) output + output_increment);
14417 } while (--rows != 0);
14418 }
14419
14420 void xnn_xx_pad_ukernel__sse2(
14421 size_t rows,
14422 size_t channels,
14423 size_t pre_padding,
14424 size_t post_padding,
14425 const void* input,
14426 size_t input_stride,
14427 void* output,
14428 size_t output_stride,
14429 const uint32_t fill_pattern) XNN_OOB_READS
14430 {
14431 const size_t input_increment = input_stride - channels;
14432 const size_t output_increment = output_stride - (pre_padding + channels + post_padding);
14433
14434 const __m128i vfill_pattern = _mm_shuffle_epi32(_mm_cvtsi32_si128((int) fill_pattern), _MM_SHUFFLE(0, 0, 0, 0));
14435 do {
14436 // Pre-pad input channels.
14437 size_t l = pre_padding;
14438 if XNN_LIKELY(l != 0) {
14439 for (; l >= 16 * sizeof(uint8_t); l -= 16 * sizeof(uint8_t)) {
14440 _mm_storeu_si128((__m128i*) output, vfill_pattern);
14441 output = (uint8_t*) output + 16;
14442 }
14443 if (l & (8 * sizeof(uint8_t))) {
14444 _mm_storel_epi64((__m128i*) output, vfill_pattern);
14445 output = (uint8_t*) output + 8;
14446 }
14447 uint32_t vfill_subpattern = fill_pattern;
14448 if (l & (4 * sizeof(uint8_t))) {
14449 unaligned_store_u32(output, vfill_subpattern);
14450 output = (uint8_t*) output + 4;
14451 }
14452 if (l & (2 * sizeof(uint8_t))) {
14453         unaligned_store_u16(output, (uint16_t) vfill_subpattern);
14454 vfill_subpattern >>= 16;
14455 output = (uint8_t*) output + 2;
14456 }
14457 if (l & (1 * sizeof(uint8_t))) {
14458 *((uint8_t*) output) = (uint8_t) vfill_subpattern;
14459 output = (uint8_t*) output + 1;
14460 }
14461 }
14462
14463 // Copy input channels.
14464 size_t c = channels;
14465 for (; c >= 16 * sizeof(uint8_t); c -= 16 * sizeof(uint8_t)) {
14466 const __m128i vdata = _mm_loadu_si128((const __m128i*) input);
14467 input = (const uint8_t*) input + 16;
14468
14469 _mm_storeu_si128((__m128i*) output, vdata);
14470 output = (uint8_t*) output + 16;
14471 }
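    // Fewer than 16 channel bytes remain: load a full 16-byte vector (reading
    // past the row is permitted, see XNN_OOB_READS) and store only the c valid
    // bytes in 8/4/2/1-byte pieces.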
14472 if XNN_UNLIKELY(c != 0) {
14473 __m128i vdata = _mm_loadu_si128((const __m128i*) input);
14474 input = (const void*) ((uintptr_t) input + c);
14475 if (c & (8 * sizeof(uint8_t))) {
14476 _mm_storel_epi64((__m128i*) output, vdata);
14477 vdata = _mm_unpackhi_epi64(vdata, vdata);
14478 output = (uint8_t*) output + 8;
14479 }
14480 if (c & (4 * sizeof(uint8_t))) {
14481 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vdata));
14482 vdata = _mm_srli_epi64(vdata, 32);
14483 output = (uint8_t*) output + 4;
14484 }
14485 uint32_t vsubdata = (uint32_t) _mm_cvtsi128_si32(vdata);
14486 if (c & (2 * sizeof(uint8_t))) {
14487 unaligned_store_u16(output, (uint16_t) vsubdata);
14488 vsubdata >>= 16;
14489 output = (uint8_t*) output + 2;
14490 }
14491 if (c & (1 * sizeof(uint8_t))) {
14492 *((uint8_t*) output) = (uint8_t) vsubdata;
14493 output = (uint8_t*) output + 1;
14494 }
14495 }
14496
14497 // Post-pad input channels.
14498 size_t r = post_padding;
14499 if XNN_LIKELY(r != 0) {
14500 for (; r >= 16 * sizeof(uint8_t); r -= 16 * sizeof(uint8_t)) {
14501 _mm_storeu_si128((__m128i*) output, vfill_pattern);
14502 output = (uint8_t*) output + 16;
14503 }
14504 if (r & (8 * sizeof(uint8_t))) {
14505 _mm_storel_epi64((__m128i*) output, vfill_pattern);
14506 output = (uint8_t*) output + 8;
14507 }
14508 uint32_t vfill_subpattern = fill_pattern;
14509 if (r & (4 * sizeof(uint8_t))) {
14510 unaligned_store_u32(output, vfill_subpattern);
14511 output = (uint8_t*) output + 4;
14512 }
14513 if (r & (2 * sizeof(uint8_t))) {
14514 unaligned_store_u16(output, (uint16_t) vfill_subpattern);
14515 vfill_subpattern >>= 16;
14516 output = (uint8_t*) output + 2;
14517 }
14518 if (r & (1 * sizeof(uint8_t))) {
14519 *((uint8_t*) output) = (uint8_t) vfill_subpattern;
14520 output = (uint8_t*) output + 1;
14521 }
14522 }
14523
14524 input = (const void*) ((uintptr_t) input + input_increment);
14525 output = (void*) ((uintptr_t) output + output_increment);
14526 } while (--rows != 0);
14527 }
14528