/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <immintrin.h>

#include <qnnpack/q8dwconv.h>

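/*
 * Depthwise-convolution micro-kernel, SSE2 variant.
 *
 * The "mp8x25" name presumably stands for multi-pass, 8 channels per
 * register pass, 25 kernel taps (e.g. a 5x5 window). For each output pixel
 * the 25 taps are consumed in three passes (10 + 10 + 5): the first two
 * passes spill 32-bit partial sums to the outacc32 scratch buffer, and the
 * last pass reloads them, requantizes, and writes uint8 output.
 *
 * A rough scalar sketch of what one output element computes, assuming the
 * packed-weights layout implied by the loads below (per-channel int32 bias
 * followed by uint8 weights for each tap):
 *
 *   acc = bias[c];
 *   for (k = 0; k < 25; k++)
 *     acc += (int32_t)(input[k][c] - input_zero_point) *
 *            (int32_t)(kernel[k][c] - kernel_zero_point);
 *   output[c] = clamp(round(acc * requantization_scale) + output_zero_point);
 */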
void pytorch_q8dwconv_ukernel_mp8x25__sse2(
    size_t channels,
    size_t output_width,
    const uint8_t** input,
    const void* weights,
    int32_t* outacc32,
    uint8_t* output,
    size_t input_stride,
    size_t output_increment,
    const union pytorch_qnnp_conv_quantization_params
        quantization_params[RESTRICT_STATIC 1]) {
  const __m128i vinput_zero_point = _mm_load_si128(
      (const __m128i*)quantization_params->sse2.input_zero_point);
  const __m128i vkernel_zero_point = _mm_set1_epi16(
      quantization_params->sse2.kernel_zero_points[0]);
  const __m128i vzero = _mm_setzero_si128();

  do {
    int32_t* outacc = outacc32;
    const void* w = weights;
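    /* Pass 1: taps 0-9. The accumulators start from the per-channel values at
     * the head of the packed weights (presumably the bias), and the partial
     * sums are stored to the outacc32 scratch buffer. */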
    {
      const uint8_t* i00 = input[0];
      const uint8_t* i01 = input[1];
      const uint8_t* i02 = input[2];
      const uint8_t* i10 = input[3];
      const uint8_t* i11 = input[4];
      const uint8_t* i12 = input[5];
      const uint8_t* i20 = input[6];
      const uint8_t* i21 = input[7];
      const uint8_t* i22 = input[8];
      const uint8_t* i23 = input[9];

      size_t c = channels;
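      /* Main loop: 8 channels at a time. Each tap loads 8 input bytes and 8
       * weight bytes, widens them to 16 bits, subtracts the zero points, and
       * builds 32-bit products by interleaving _mm_mullo_epi16 (low halves)
       * with _mm_mulhi_epi16 (high halves) before accumulating. */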
      for (; c >= 8; c -= 8) {
        __m128i vacc_lo = _mm_loadu_si128((const __m128i*)w);
        __m128i vacc_hi = _mm_loadu_si128((const __m128i*)((uintptr_t)w + 16));

        const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
        i00 += 8;
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod00_odd, vprod00_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod00_odd, vprod00_even));

        const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
        i01 += 8;
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
        i02 += 8;
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
        i10 += 8;
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
        i11 += 8;
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 = _mm_loadl_epi64((const __m128i*)i12);
        i12 += 8;
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 = _mm_loadl_epi64((const __m128i*)i20);
        i20 += 8;
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 80));
        const __m128i vxk20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 = _mm_loadl_epi64((const __m128i*)i21);
        i21 += 8;
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 88));
        const __m128i vxk21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 = _mm_loadl_epi64((const __m128i*)i22);
        i22 += 8;
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 96));
        const __m128i vxk22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 = _mm_loadl_epi64((const __m128i*)i23);
        i23 += 8;
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 104));
        const __m128i vxk23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 112);
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
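      /* Remainder: 1-7 channels left. The input pointers are pre-decremented
       * and the loaded bytes shifted right so the last c bytes land in the
       * low lanes; lanes past c carry don't-care values that never reach the
       * final output. */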
      if (c != 0) {
        const size_t i_predecrement = 8 - c;
        const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
        i00 -= i_predecrement;
        i01 -= i_predecrement;
        i02 -= i_predecrement;
        i10 -= i_predecrement;
        i11 -= i_predecrement;
        i12 -= i_predecrement;
        i20 -= i_predecrement;
        i21 -= i_predecrement;
        i22 -= i_predecrement;
        i23 -= i_predecrement;

        __m128i vacc_lo = _mm_loadu_si128((const __m128i*)w);
        __m128i vacc_hi = _mm_loadu_si128((const __m128i*)((uintptr_t)w + 16));

        const __m128i vi00 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod00_odd, vprod00_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod00_odd, vprod00_even));

        const __m128i vi01 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i12), vi_shift);
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i20), vi_shift);
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 80));
        const __m128i vxk20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i21), vi_shift);
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 88));
        const __m128i vxk21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i22), vi_shift);
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 96));
        const __m128i vxk22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i23), vi_shift);
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 104));
        const __m128i vxk23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 112);
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
    }
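    /* Pass 2: taps 10-19. From here on the packed weights hold 8 bytes per
     * tap (no bias); each tap's products are summed and then added to the
     * partial sums already in outacc32 before being stored back. */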
    {
      const uint8_t* i00 = input[10];
      const uint8_t* i01 = input[11];
      const uint8_t* i02 = input[12];
      const uint8_t* i10 = input[13];
      const uint8_t* i11 = input[14];
      const uint8_t* i12 = input[15];
      const uint8_t* i20 = input[16];
      const uint8_t* i21 = input[17];
      const uint8_t* i22 = input[18];
      const uint8_t* i23 = input[19];
      outacc = outacc32;

      size_t c = channels;
      for (; c >= 8; c -= 8) {
        const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
        i00 += 8;
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
        i01 += 8;
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
        i02 += 8;
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
        i10 += 8;
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
        i11 += 8;
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 = _mm_loadl_epi64((const __m128i*)i12);
        i12 += 8;
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 = _mm_loadl_epi64((const __m128i*)i20);
        i20 += 8;
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 = _mm_loadl_epi64((const __m128i*)i21);
        i21 += 8;
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 = _mm_loadl_epi64((const __m128i*)i22);
        i22 += 8;
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 = _mm_loadl_epi64((const __m128i*)i23);
        i23 += 8;
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 80);
        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
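      /* Remainder channels for pass 2, using the same pointer pre-decrement
       * and right-shift trick as in pass 1. */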
      if (c != 0) {
        const size_t i_predecrement = 8 - c;
        const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
        i00 -= i_predecrement;
        i01 -= i_predecrement;
        i02 -= i_predecrement;
        i10 -= i_predecrement;
        i11 -= i_predecrement;
        i12 -= i_predecrement;
        i20 -= i_predecrement;
        i21 -= i_predecrement;
        i22 -= i_predecrement;
        i23 -= i_predecrement;

        const __m128i vi00 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i12), vi_shift);
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk12, vzero), vkernel_zero_point);
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i20), vi_shift);
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk20, vzero), vkernel_zero_point);
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i21), vi_shift);
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk21, vzero), vkernel_zero_point);
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i22), vi_shift);
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk22, vzero), vkernel_zero_point);
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i23), vi_shift);
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk23, vzero), vkernel_zero_point);
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 80);
        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
    }
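    /* Pass 3: the final taps 20-24. After accumulating onto the partial sums
     * from outacc32, the results are requantized and stored as uint8. The
     * input pointer array is also advanced by input_stride to the window of
     * the next output pixel. */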
    {
      const uint8_t* i00 = input[20];
      const uint8_t* i01 = input[21];
      const uint8_t* i02 = input[22];
      const uint8_t* i10 = input[23];
      const uint8_t* i11 = input[24];
      input = (const uint8_t**)((uintptr_t)input + input_stride);
      outacc = outacc32;
      size_t c = channels;
      for (; c >= 8; c -= 8) {
        const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
        i00 += 8;
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
        i01 += 8;
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
        i02 += 8;
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
        i10 += 8;
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
        i11 += 8;
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        w = (const void*)((uintptr_t)w + 40);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        outacc += 8;

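        /* Requantize: scale the int32 accumulators by the requantization
         * scale (only element 0 of requantization_scales is used here) in
         * float, convert back to int32, add the output zero point with
         * saturation, pack to uint8, and clamp to [output_min, output_max]. */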
        const __m128 vmultiplier =
            _mm_set1_ps(quantization_params->sse2.requantization_scales[0]);

        vacc_lo = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_lo),
                        vmultiplier
                        )
                      );
        vacc_hi = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_hi),
                        vmultiplier
                        )
                      );

        const __m128i voutput_zero_point = _mm_load_si128(
            (const __m128i*)quantization_params->sse2.output_zero_point);
        __m128i vout = _mm_adds_epi16(
            _mm_packs_epi32(vacc_lo, vacc_hi), voutput_zero_point);
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_min));
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_max));

        _mm_storel_epi64((__m128i*)output, vout);
        output += 8;
      }
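      /* Remainder channels for the final pass: same shifted loads, followed
       * by requantization and a piecewise 4/2/1-byte store of the last
       * outputs. */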
      if (c != 0) {
        const size_t i_predecrement = 8 - c;
        const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
        i00 -= i_predecrement;
        i01 -= i_predecrement;
        i02 -= i_predecrement;
        i10 -= i_predecrement;
        i11 -= i_predecrement;

        const __m128i vi00 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk00, vzero), vkernel_zero_point);
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk01, vzero), vkernel_zero_point);
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk02, vzero), vkernel_zero_point);
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk10, vzero), vkernel_zero_point);
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vk11, vzero), vkernel_zero_point);
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        outacc += 8;

        const __m128 vmultiplier =
            _mm_set1_ps(quantization_params->sse2.requantization_scales[0]);

        vacc_lo = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_lo),
                        vmultiplier
                        )
                      );
        vacc_hi = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_hi),
                        vmultiplier
                        )
                      );

        const __m128i voutput_zero_point = _mm_load_si128(
            (const __m128i*)quantization_params->sse2.output_zero_point);
        __m128i vout = _mm_adds_epi16(
            _mm_packs_epi32(vacc_lo, vacc_hi), voutput_zero_point);
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_min));
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_max));

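        /* Store the remaining c (< 8) bytes: 4, then 2, then 1 at a time,
         * shifting the packed result down as each chunk is written. */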
        if (c & 4) {
          *((uint32_t*)output) = (uint32_t)_mm_cvtsi128_si32(vout);
          output += 4;
          vout = _mm_srli_epi64(vout, 32);
        }
        if (c & 2) {
          *((uint16_t*)output) = (uint16_t)_mm_extract_epi16(vout, 0);
          output += 2;
          vout = _mm_srli_epi32(vout, 16);
        }
        if (c & 1) {
          *((uint8_t*)output) = (uint8_t)_mm_cvtsi128_si32(vout);
          output += 1;
        }
      }
    }
    output = (uint8_t*)((uintptr_t)output + output_increment);
  } while (--output_width != 0);
}