1 /*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
9 #include <immintrin.h>
10
11 #include <qnnpack/q8dwconv.h>
12
13 void pytorch_q8dwconv_ukernel_mp8x25__sse2(
14 size_t channels,
15 size_t output_width,
16 const uint8_t** input,
17 const void* weights,
18 int32_t* outacc32,
19 uint8_t* output,
20 size_t input_stride,
21 size_t output_increment,
22 const union pytorch_qnnp_conv_quantization_params
23 quantization_params[RESTRICT_STATIC 1]) {
24 const __m128i vinput_zero_point = _mm_load_si128(
25 (const __m128i*)quantization_params->sse2.input_zero_point);
26 const __m128i vkernel_zero_point = _mm_set1_epi16(
27 quantization_params->sse2.kernel_zero_points[0]);
28 const __m128i vzero = _mm_setzero_si128();
29
30 do {
31 int32_t* outacc = outacc32;
32 const void* w = weights;
33 {
34 const uint8_t* i00 = input[0];
35 const uint8_t* i01 = input[1];
36 const uint8_t* i02 = input[2];
37 const uint8_t* i10 = input[3];
38 const uint8_t* i11 = input[4];
39 const uint8_t* i12 = input[5];
40 const uint8_t* i20 = input[6];
41 const uint8_t* i21 = input[7];
42 const uint8_t* i22 = input[8];
43 const uint8_t* i23 = input[9];
44
45 size_t c = channels;
46 for (; c >= 8; c -= 8) {
47 __m128i vacc_lo = _mm_loadu_si128((const __m128i*)w);
48 __m128i vacc_hi = _mm_loadu_si128((const __m128i*)((uintptr_t)w + 16));
49
50 const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
51 i00 += 8;
52 const __m128i vxi00 =
53 _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
54 const __m128i vk00 =
55 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
56 const __m128i vxk00 =
57 _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
58 const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
59 const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
60 vacc_lo = _mm_add_epi32(
61 vacc_lo, _mm_unpacklo_epi16(vprod00_odd, vprod00_even));
62 vacc_hi = _mm_add_epi32(
63 vacc_hi, _mm_unpackhi_epi16(vprod00_odd, vprod00_even));
64
65 const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
66 i01 += 8;
67 const __m128i vxi01 =
68 _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
69 const __m128i vk01 =
70 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
71 const __m128i vxk01 =
72 _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
73 const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
74 const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
75 vacc_lo = _mm_add_epi32(
76 vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
77 vacc_hi = _mm_add_epi32(
78 vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));
79
80 const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
81 i02 += 8;
82 const __m128i vxi02 =
83 _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
84 const __m128i vk02 =
85 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
86 const __m128i vxk02 =
87 _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
88 const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
89 const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
90 vacc_lo = _mm_add_epi32(
91 vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
92 vacc_hi = _mm_add_epi32(
93 vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));
94
95 const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
96 i10 += 8;
97 const __m128i vxi10 =
98 _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
99 const __m128i vk10 =
100 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
101 const __m128i vxk10 =
102 _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
103 const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
104 const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
105 vacc_lo = _mm_add_epi32(
106 vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
107 vacc_hi = _mm_add_epi32(
108 vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));
109
110 const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
111 i11 += 8;
112 const __m128i vxi11 =
113 _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
114 const __m128i vk11 =
115 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
116 const __m128i vxk11 =
117 _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
118 const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
119 const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
120 vacc_lo = _mm_add_epi32(
121 vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
122 vacc_hi = _mm_add_epi32(
123 vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));
124
125 const __m128i vi12 = _mm_loadl_epi64((const __m128i*)i12);
126 i12 += 8;
127 const __m128i vxi12 =
128 _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
129 const __m128i vk12 =
130 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
131 const __m128i vxk12 =
132 _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
133 const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
134 const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
135 vacc_lo = _mm_add_epi32(
136 vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
137 vacc_hi = _mm_add_epi32(
138 vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));
139
140 const __m128i vi20 = _mm_loadl_epi64((const __m128i*)i20);
141 i20 += 8;
142 const __m128i vxi20 =
143 _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
144 const __m128i vk20 =
145 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 80));
146 const __m128i vxk20 =
147 _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
148 const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
149 const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
150 vacc_lo = _mm_add_epi32(
151 vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
152 vacc_hi = _mm_add_epi32(
153 vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));
154
155 const __m128i vi21 = _mm_loadl_epi64((const __m128i*)i21);
156 i21 += 8;
157 const __m128i vxi21 =
158 _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
159 const __m128i vk21 =
160 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 88));
161 const __m128i vxk21 =
162 _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
163 const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
164 const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
165 vacc_lo = _mm_add_epi32(
166 vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
167 vacc_hi = _mm_add_epi32(
168 vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));
169
170 const __m128i vi22 = _mm_loadl_epi64((const __m128i*)i22);
171 i22 += 8;
172 const __m128i vxi22 =
173 _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
174 const __m128i vk22 =
175 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 96));
176 const __m128i vxk22 =
177 _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
178 const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
179 const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
180 vacc_lo = _mm_add_epi32(
181 vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
182 vacc_hi = _mm_add_epi32(
183 vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));
184
185 const __m128i vi23 = _mm_loadl_epi64((const __m128i*)i23);
186 i23 += 8;
187 const __m128i vxi23 =
188 _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
189 const __m128i vk23 =
190 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 104));
191 const __m128i vxk23 =
192 _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
193 const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
194 const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
195 vacc_lo = _mm_add_epi32(
196 vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
197 vacc_hi = _mm_add_epi32(
198 vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));
199
200 w = (const void*)((uintptr_t)w + 112);
201 _mm_storeu_si128((__m128i*)outacc, vacc_lo);
202 outacc += 4;
203 _mm_storeu_si128((__m128i*)outacc, vacc_hi);
204 outacc += 4;
205 }
206 if (c != 0) {
207 const size_t i_predecrement = 8 - c;
208 const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
209 i00 -= i_predecrement;
210 i01 -= i_predecrement;
211 i02 -= i_predecrement;
212 i10 -= i_predecrement;
213 i11 -= i_predecrement;
214 i12 -= i_predecrement;
215 i20 -= i_predecrement;
216 i21 -= i_predecrement;
217 i22 -= i_predecrement;
218 i23 -= i_predecrement;
219
220 __m128i vacc_lo = _mm_loadu_si128((const __m128i*)w);
221 __m128i vacc_hi = _mm_loadu_si128((const __m128i*)((uintptr_t)w + 16));
222
223 const __m128i vi00 =
224 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
225 const __m128i vxi00 =
226 _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
227 const __m128i vk00 =
228 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
229 const __m128i vxk00 =
230 _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
231 const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
232 const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
233 vacc_lo = _mm_add_epi32(
234 vacc_lo, _mm_unpacklo_epi16(vprod00_odd, vprod00_even));
235 vacc_hi = _mm_add_epi32(
236 vacc_hi, _mm_unpackhi_epi16(vprod00_odd, vprod00_even));
237
238 const __m128i vi01 =
239 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
240 const __m128i vxi01 =
241 _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
242 const __m128i vk01 =
243 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
244 const __m128i vxk01 =
245 _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
246 const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
247 const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
248 vacc_lo = _mm_add_epi32(
249 vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
250 vacc_hi = _mm_add_epi32(
251 vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));
252
253 const __m128i vi02 =
254 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
255 const __m128i vxi02 =
256 _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
257 const __m128i vk02 =
258 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
259 const __m128i vxk02 =
260 _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
261 const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
262 const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
263 vacc_lo = _mm_add_epi32(
264 vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
265 vacc_hi = _mm_add_epi32(
266 vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));
267
268 const __m128i vi10 =
269 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
270 const __m128i vxi10 =
271 _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
272 const __m128i vk10 =
273 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
274 const __m128i vxk10 =
275 _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
276 const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
277 const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
278 vacc_lo = _mm_add_epi32(
279 vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
280 vacc_hi = _mm_add_epi32(
281 vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));
282
283 const __m128i vi11 =
284 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
285 const __m128i vxi11 =
286 _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
287 const __m128i vk11 =
288 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
289 const __m128i vxk11 =
290 _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
291 const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
292 const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
293 vacc_lo = _mm_add_epi32(
294 vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
295 vacc_hi = _mm_add_epi32(
296 vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));
297
298 const __m128i vi12 =
299 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i12), vi_shift);
300 const __m128i vxi12 =
301 _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
302 const __m128i vk12 =
303 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
304 const __m128i vxk12 =
305 _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
306 const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
307 const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
308 vacc_lo = _mm_add_epi32(
309 vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
310 vacc_hi = _mm_add_epi32(
311 vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));
312
313 const __m128i vi20 =
314 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i20), vi_shift);
315 const __m128i vxi20 =
316 _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
317 const __m128i vk20 =
318 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 80));
319 const __m128i vxk20 =
320 _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
321 const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
322 const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
323 vacc_lo = _mm_add_epi32(
324 vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
325 vacc_hi = _mm_add_epi32(
326 vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));
327
328 const __m128i vi21 =
329 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i21), vi_shift);
330 const __m128i vxi21 =
331 _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
332 const __m128i vk21 =
333 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 88));
334 const __m128i vxk21 =
335 _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
336 const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
337 const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
338 vacc_lo = _mm_add_epi32(
339 vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
340 vacc_hi = _mm_add_epi32(
341 vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));
342
343 const __m128i vi22 =
344 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i22), vi_shift);
345 const __m128i vxi22 =
346 _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
347 const __m128i vk22 =
348 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 96));
349 const __m128i vxk22 =
350 _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
351 const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
352 const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
353 vacc_lo = _mm_add_epi32(
354 vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
355 vacc_hi = _mm_add_epi32(
356 vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));
357
358 const __m128i vi23 =
359 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i23), vi_shift);
360 const __m128i vxi23 =
361 _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
362 const __m128i vk23 =
363 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 104));
364 const __m128i vxk23 =
365 _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
366 const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
367 const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
368 vacc_lo = _mm_add_epi32(
369 vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
370 vacc_hi = _mm_add_epi32(
371 vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));
372
373 w = (const void*)((uintptr_t)w + 112);
374 _mm_storeu_si128((__m128i*)outacc, vacc_lo);
375 outacc += 4;
376 _mm_storeu_si128((__m128i*)outacc, vacc_hi);
377 outacc += 4;
378 }
379 }
380 {
381 const uint8_t* i00 = input[10];
382 const uint8_t* i01 = input[11];
383 const uint8_t* i02 = input[12];
384 const uint8_t* i10 = input[13];
385 const uint8_t* i11 = input[14];
386 const uint8_t* i12 = input[15];
387 const uint8_t* i20 = input[16];
388 const uint8_t* i21 = input[17];
389 const uint8_t* i22 = input[18];
390 const uint8_t* i23 = input[19];
391 outacc = outacc32;
392
393 size_t c = channels;
394 for (; c >= 8; c -= 8) {
395 const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
396 i00 += 8;
397 const __m128i vxi00 =
398 _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
399 const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
400 const __m128i vxk00 =
401 _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
402 const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
403 const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
404 __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
405 __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);
406
407 const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
408 i01 += 8;
409 const __m128i vxi01 =
410 _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
411 const __m128i vk01 =
412 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
413 const __m128i vxk01 =
414 _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
415 const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
416 const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
417 vacc_lo = _mm_add_epi32(
418 vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
419 vacc_hi = _mm_add_epi32(
420 vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));
421
422 const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
423 i02 += 8;
424 const __m128i vxi02 =
425 _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
426 const __m128i vk02 =
427 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
428 const __m128i vxk02 =
429 _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
430 const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
431 const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
432 vacc_lo = _mm_add_epi32(
433 vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
434 vacc_hi = _mm_add_epi32(
435 vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));
436
437 const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
438 i10 += 8;
439 const __m128i vxi10 =
440 _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
441 const __m128i vk10 =
442 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
443 const __m128i vxk10 =
444 _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
445 const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
446 const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
447 vacc_lo = _mm_add_epi32(
448 vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
449 vacc_hi = _mm_add_epi32(
450 vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));
451
452 const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
453 i11 += 8;
454 const __m128i vxi11 =
455 _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
456 const __m128i vk11 =
457 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
458 const __m128i vxk11 =
459 _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
460 const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
461 const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
462 vacc_lo = _mm_add_epi32(
463 vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
464 vacc_hi = _mm_add_epi32(
465 vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));
466
467 const __m128i vi12 = _mm_loadl_epi64((const __m128i*)i12);
468 i12 += 8;
469 const __m128i vxi12 =
470 _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
471 const __m128i vk12 =
472 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
473 const __m128i vxk12 =
474 _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
475 const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
476 const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
477 vacc_lo = _mm_add_epi32(
478 vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
479 vacc_hi = _mm_add_epi32(
480 vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));
481
482 const __m128i vi20 = _mm_loadl_epi64((const __m128i*)i20);
483 i20 += 8;
484 const __m128i vxi20 =
485 _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
486 const __m128i vk20 =
487 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
488 const __m128i vxk20 =
489 _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
490 const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
491 const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
492 vacc_lo = _mm_add_epi32(
493 vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
494 vacc_hi = _mm_add_epi32(
495 vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));
496
497 const __m128i vi21 = _mm_loadl_epi64((const __m128i*)i21);
498 i21 += 8;
499 const __m128i vxi21 =
500 _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
501 const __m128i vk21 =
502 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
503 const __m128i vxk21 =
504 _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
505 const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
506 const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
507 vacc_lo = _mm_add_epi32(
508 vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
509 vacc_hi = _mm_add_epi32(
510 vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));
511
512 const __m128i vi22 = _mm_loadl_epi64((const __m128i*)i22);
513 i22 += 8;
514 const __m128i vxi22 =
515 _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
516 const __m128i vk22 =
517 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
518 const __m128i vxk22 =
519 _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
520 const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
521 const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
522 vacc_lo = _mm_add_epi32(
523 vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
524 vacc_hi = _mm_add_epi32(
525 vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));
526
527 const __m128i vi23 = _mm_loadl_epi64((const __m128i*)i23);
528 i23 += 8;
529 const __m128i vxi23 =
530 _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
531 const __m128i vk23 =
532 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
533 const __m128i vxk23 =
534 _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
535 const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
536 const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
537 vacc_lo = _mm_add_epi32(
538 vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
539 vacc_hi = _mm_add_epi32(
540 vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));
541
542 w = (const void*)((uintptr_t)w + 80);
543 vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
544 vacc_hi =
545 _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
546 _mm_storeu_si128((__m128i*)outacc, vacc_lo);
547 outacc += 4;
548 _mm_storeu_si128((__m128i*)outacc, vacc_hi);
549 outacc += 4;
550 }
551 if (c != 0) {
552 const size_t i_predecrement = 8 - c;
553 const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
554 i00 -= i_predecrement;
555 i01 -= i_predecrement;
556 i02 -= i_predecrement;
557 i10 -= i_predecrement;
558 i11 -= i_predecrement;
559 i12 -= i_predecrement;
560 i20 -= i_predecrement;
561 i21 -= i_predecrement;
562 i22 -= i_predecrement;
563 i23 -= i_predecrement;
564
565 const __m128i vi00 =
566 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
567 const __m128i vxi00 =
568 _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
569 const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
570 const __m128i vxk00 =
571 _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
572 const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
573 const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
574 __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
575 __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);
576
577 const __m128i vi01 =
578 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
579 const __m128i vxi01 =
580 _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
581 const __m128i vk01 =
582 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
583 const __m128i vxk01 =
584 _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
585 const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
586 const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
587 vacc_lo = _mm_add_epi32(
588 vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
589 vacc_hi = _mm_add_epi32(
590 vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));
591
592 const __m128i vi02 =
593 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
594 const __m128i vxi02 =
595 _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
596 const __m128i vk02 =
597 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
598 const __m128i vxk02 =
599 _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
600 const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
601 const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
602 vacc_lo = _mm_add_epi32(
603 vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
604 vacc_hi = _mm_add_epi32(
605 vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));
606
607 const __m128i vi10 =
608 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
609 const __m128i vxi10 =
610 _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
611 const __m128i vk10 =
612 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
613 const __m128i vxk10 =
614 _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
615 const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
616 const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
617 vacc_lo = _mm_add_epi32(
618 vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
619 vacc_hi = _mm_add_epi32(
620 vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));
621
622 const __m128i vi11 =
623 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
624 const __m128i vxi11 =
625 _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
626 const __m128i vk11 =
627 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
628 const __m128i vxk11 =
629 _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
630 const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
631 const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
632 vacc_lo = _mm_add_epi32(
633 vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
634 vacc_hi = _mm_add_epi32(
635 vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));
636
637 const __m128i vi12 =
638 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i12), vi_shift);
639 const __m128i vxi12 =
640 _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
641 const __m128i vk12 =
642 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
643 const __m128i vxk12 =
644 _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
645 const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
646 const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
647 vacc_lo = _mm_add_epi32(
648 vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
649 vacc_hi = _mm_add_epi32(
650 vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));
651
652 const __m128i vi20 =
653 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i20), vi_shift);
654 const __m128i vxi20 =
655 _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
656 const __m128i vk20 =
657 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
658 const __m128i vxk20 =
659 _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
660 const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
661 const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
662 vacc_lo = _mm_add_epi32(
663 vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
664 vacc_hi = _mm_add_epi32(
665 vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));
666
667 const __m128i vi21 =
668 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i21), vi_shift);
669 const __m128i vxi21 =
670 _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
671 const __m128i vk21 =
672 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
673 const __m128i vxk21 =
674 _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
675 const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
676 const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
677 vacc_lo = _mm_add_epi32(
678 vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
679 vacc_hi = _mm_add_epi32(
680 vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));
681
682 const __m128i vi22 =
683 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i22), vi_shift);
684 const __m128i vxi22 =
685 _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
686 const __m128i vk22 =
687 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
688 const __m128i vxk22 =
689 _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
690 const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
691 const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
692 vacc_lo = _mm_add_epi32(
693 vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
694 vacc_hi = _mm_add_epi32(
695 vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));
696
697 const __m128i vi23 =
698 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i23), vi_shift);
699 const __m128i vxi23 =
700 _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
701 const __m128i vk23 =
702 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
703 const __m128i vxk23 =
704 _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
705 const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
706 const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
707 vacc_lo = _mm_add_epi32(
708 vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
709 vacc_hi = _mm_add_epi32(
710 vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));
711
712 w = (const void*)((uintptr_t)w + 80);
713 vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
714 vacc_hi =
715 _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
716 _mm_storeu_si128((__m128i*)outacc, vacc_lo);
717 outacc += 4;
718 _mm_storeu_si128((__m128i*)outacc, vacc_hi);
719 outacc += 4;
720 }
721 }
722 {
723 const uint8_t* i00 = input[20];
724 const uint8_t* i01 = input[21];
725 const uint8_t* i02 = input[22];
726 const uint8_t* i10 = input[23];
727 const uint8_t* i11 = input[24];
728 input = (const uint8_t**)((uintptr_t)input + input_stride);
729 outacc = outacc32;
730 size_t c = channels;
731 for (; c >= 8; c -= 8) {
732 const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
733 i00 += 8;
734 const __m128i vxi00 =
735 _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
736 const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
737 const __m128i vxk00 =
738 _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
739 const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
740 const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
741 __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
742 __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);
743
744 const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
745 i01 += 8;
746 const __m128i vxi01 =
747 _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
748 const __m128i vk01 =
749 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
750 const __m128i vxk01 =
751 _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
752 const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
753 const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
754 vacc_lo = _mm_add_epi32(
755 vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
756 vacc_hi = _mm_add_epi32(
757 vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));
758
759 const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
760 i02 += 8;
761 const __m128i vxi02 =
762 _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
763 const __m128i vk02 =
764 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
765 const __m128i vxk02 =
766 _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
767 const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
768 const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
769 vacc_lo = _mm_add_epi32(
770 vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
771 vacc_hi = _mm_add_epi32(
772 vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));
773
774 const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
775 i10 += 8;
776 const __m128i vxi10 =
777 _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
778 const __m128i vk10 =
779 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
780 const __m128i vxk10 =
781 _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
782 const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
783 const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
784 vacc_lo = _mm_add_epi32(
785 vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
786 vacc_hi = _mm_add_epi32(
787 vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));
788
789 const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
790 i11 += 8;
791 const __m128i vxi11 =
792 _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
793 const __m128i vk11 =
794 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
795 const __m128i vxk11 =
796 _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
797 const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
798 const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
799 vacc_lo = _mm_add_epi32(
800 vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
801 vacc_hi = _mm_add_epi32(
802 vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));
803
804 w = (const void*)((uintptr_t)w + 40);
805
806 vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
807 vacc_hi =
808 _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
809 outacc += 8;
810
811 const __m128 vmultiplier =
812 _mm_set1_ps(quantization_params->sse2.requantization_scales[0]);
813
814 vacc_lo = _mm_cvtps_epi32(
815 _mm_mul_ps(
816 _mm_cvtepi32_ps(vacc_lo),
817 vmultiplier
818 )
819 );
820 vacc_hi = _mm_cvtps_epi32(
821 _mm_mul_ps(
822 _mm_cvtepi32_ps(vacc_hi),
823 vmultiplier
824 )
825 );
826
827 const __m128i voutput_zero_point = _mm_load_si128(
828 (const __m128i*)quantization_params->sse2.output_zero_point);
829 __m128i vout = _mm_adds_epi16(
830 _mm_packs_epi32(vacc_lo, vacc_hi), voutput_zero_point);
831 vout = _mm_packus_epi16(vout, vout);
832 vout = _mm_max_epu8(
833 vout,
834 _mm_load_si128(
835 (const __m128i*)quantization_params->sse2.output_min));
836 vout = _mm_min_epu8(
837 vout,
838 _mm_load_si128(
839 (const __m128i*)quantization_params->sse2.output_max));
840
841 _mm_storel_epi64((__m128i*)output, vout);
842 output += 8;
843 }
844 if (c != 0) {
845 const size_t i_predecrement = 8 - c;
846 const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
847 i00 -= i_predecrement;
848 i01 -= i_predecrement;
849 i02 -= i_predecrement;
850 i10 -= i_predecrement;
851 i11 -= i_predecrement;
852
853 const __m128i vi00 =
854 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
855 const __m128i vxi00 =
856 _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
857 const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
858 const __m128i vxk00 =
859 _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
860 const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
861 const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
862 __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
863 __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);
864
865 const __m128i vi01 =
866 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
867 const __m128i vxi01 =
868 _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
869 const __m128i vk01 =
870 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
871 const __m128i vxk01 =
872 _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
873 const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
874 const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
875 vacc_lo = _mm_add_epi32(
876 vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
877 vacc_hi = _mm_add_epi32(
878 vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));
879
880 const __m128i vi02 =
881 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
882 const __m128i vxi02 =
883 _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
884 const __m128i vk02 =
885 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
886 const __m128i vxk02 =
887 _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
888 const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
889 const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
890 vacc_lo = _mm_add_epi32(
891 vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
892 vacc_hi = _mm_add_epi32(
893 vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));
894
895 const __m128i vi10 =
896 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
897 const __m128i vxi10 =
898 _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
899 const __m128i vk10 =
900 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
901 const __m128i vxk10 =
902 _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
903 const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
904 const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
905 vacc_lo = _mm_add_epi32(
906 vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
907 vacc_hi = _mm_add_epi32(
908 vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));
909
910 const __m128i vi11 =
911 _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
912 const __m128i vxi11 =
913 _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
914 const __m128i vk11 =
915 _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
916 const __m128i vxk11 =
917 _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
918 const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
919 const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
920 vacc_lo = _mm_add_epi32(
921 vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
922 vacc_hi = _mm_add_epi32(
923 vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));
924
925 vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
926 vacc_hi =
927 _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
928 outacc += 8;
929
930 const __m128 vmultiplier =
931 _mm_set1_ps(quantization_params->sse2.requantization_scales[0]);
932
933 vacc_lo = _mm_cvtps_epi32(
934 _mm_mul_ps(
935 _mm_cvtepi32_ps(vacc_lo),
936 vmultiplier
937 )
938 );
939 vacc_hi = _mm_cvtps_epi32(
940 _mm_mul_ps(
941 _mm_cvtepi32_ps(vacc_hi),
942 vmultiplier
943 )
944 );
945
946 const __m128i voutput_zero_point = _mm_load_si128(
947 (const __m128i*)quantization_params->sse2.output_zero_point);
948 __m128i vout = _mm_adds_epi16(
949 _mm_packs_epi32(vacc_lo, vacc_hi), voutput_zero_point);
950 vout = _mm_packus_epi16(vout, vout);
951 vout = _mm_max_epu8(
952 vout,
953 _mm_load_si128(
954 (const __m128i*)quantization_params->sse2.output_min));
955 vout = _mm_min_epu8(
956 vout,
957 _mm_load_si128(
958 (const __m128i*)quantization_params->sse2.output_max));
959
960 if (c & 4) {
961 *((uint32_t*)output) = (uint32_t)_mm_cvtsi128_si32(vout);
962 output += 4;
963 vout = _mm_srli_epi64(vout, 32);
964 }
965 if (c & 2) {
966 *((uint16_t*)output) = (uint16_t)_mm_extract_epi16(vout, 0);
967 output += 2;
968 vout = _mm_srli_epi32(vout, 16);
969 }
970 if (c & 1) {
971 *((uint8_t*)output) = (uint8_t)_mm_cvtsi128_si32(vout);
972 output += 1;
973 }
974 }
975 }
976 output = (uint8_t*)((uintptr_t)output + output_increment);
977 } while (--output_width != 0);
978 }
979