/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <assert.h>

#include <emmintrin.h>

#include <qnnpack/q8avgpool.h>

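/*
 * Multi-pass 8-channel average pooling micro-kernel: a 9-element primary pass,
 * 8-element incremental passes, and a requantizing final pass (the
 * "mp8x9p8q" suffix appears to encode exactly this split). Partial 32-bit sums
 * are carried between passes in the caller-provided `buffer`.
 */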
void pytorch_q8avgpool_ukernel_mp8x9p8q__sse2(
    size_t n,
    size_t ks,
    size_t kc,
    const uint8_t** input,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union pytorch_qnnp_avgpool_quantization_params
        quantization_params[RESTRICT_STATIC 1]) {
  assert(n != 0);
  assert(ks > 9);
  assert(kc >= 8);

  const __m128i vbias =
      _mm_load_si128((const __m128i*)&quantization_params->sse2.bias);
  const __m128i vzero = _mm_setzero_si128();
  const __m128 vscale = _mm_loadu_ps(quantization_params->sse2.scale);

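  /*
   * Each iteration of the outer loop produces one output pixel of kc channels
   * from a pooling window of ks elements (ks > 9, so at least two passes over
   * `buffer` are required).
   */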
  do {
    {
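      /*
       * Primary pass: sum the first 9 pooling elements, 8 channels at a time.
       * The 16-bit row sums cannot overflow (9 * 255 < 65536); they are
       * widened to 32 bits, biased, and stored to `buffer`.
       */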
      const uint8_t* i0 = *input++;
      const uint8_t* i1 = *input++;
      const uint8_t* i2 = *input++;
      const uint8_t* i3 = *input++;
      const uint8_t* i4 = *input++;
      const uint8_t* i5 = *input++;
      const uint8_t* i6 = *input++;
      const uint8_t* i7 = *input++;
      const uint8_t* i8 = *input++;

      size_t k = kc;
      int32_t* acc = buffer;
      while (k >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0);
        i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1);
        i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2);
        i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3);
        i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4);
        i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5);
        i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6);
        i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*)i7);
        i7 += 8;
        const __m128i vi8 = _mm_loadl_epi64((const __m128i*)i8);
        i8 += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
        const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);

        const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
        const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);

        const __m128i vacc_lo =
            _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
        const __m128i vacc_hi =
            _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
        acc += 8;

        k -= 8;
      }
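      /*
       * Remaining 1-7 channels: back the pointers up by (8 - k) bytes so an
       * 8-byte load ends exactly at the channel boundary, then shift the
       * already-processed low bytes out so only the k tail channels remain.
       */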
      if (k != 0) {
        const size_t address_decrement = 8 - k;
        i0 = (const uint8_t*)((uintptr_t)i0 - address_decrement);
        i1 = (const uint8_t*)((uintptr_t)i1 - address_decrement);
        i2 = (const uint8_t*)((uintptr_t)i2 - address_decrement);
        i3 = (const uint8_t*)((uintptr_t)i3 - address_decrement);
        i4 = (const uint8_t*)((uintptr_t)i4 - address_decrement);
        i5 = (const uint8_t*)((uintptr_t)i5 - address_decrement);
        i6 = (const uint8_t*)((uintptr_t)i6 - address_decrement);
        i7 = (const uint8_t*)((uintptr_t)i7 - address_decrement);
        i8 = (const uint8_t*)((uintptr_t)i8 - address_decrement);
        const __m128i vshift = _mm_cvtsi32_si128(8 * address_decrement);

        const __m128i vi0 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i0), vshift);
        const __m128i vi1 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i1), vshift);
        const __m128i vi2 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i2), vshift);
        const __m128i vi3 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i3), vshift);
        const __m128i vi4 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i4), vshift);
        const __m128i vi5 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i5), vshift);
        const __m128i vi6 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i6), vshift);
        const __m128i vi7 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i7), vshift);
        const __m128i vi8 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i8), vshift);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
        const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);

        const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
        const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);

        const __m128i vacc_lo =
            _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
        const __m128i vacc_hi =
            _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
      }
    }

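    /*
     * Incremental passes: consume 8 pooling elements at a time and add their
     * sums into the 32-bit accumulators already held in `buffer`.
     */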
    size_t m = ks;
    for (m -= 9; m > 8; m -= 8) {
      const uint8_t* i0 = *input++;
      const uint8_t* i1 = *input++;
      const uint8_t* i2 = *input++;
      const uint8_t* i3 = *input++;
      const uint8_t* i4 = *input++;
      const uint8_t* i5 = *input++;
      const uint8_t* i6 = *input++;
      const uint8_t* i7 = *input++;

      size_t k = kc;
      int32_t* acc = buffer;
      while (k >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0);
        i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1);
        i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2);
        i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3);
        i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4);
        i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5);
        i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6);
        i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*)i7);
        i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
        acc += 8;

        k -= 8;
      }
      if (k != 0) {
        const size_t address_decrement = 8 - k;
        i0 = (const uint8_t*)((uintptr_t)i0 - address_decrement);
        i1 = (const uint8_t*)((uintptr_t)i1 - address_decrement);
        i2 = (const uint8_t*)((uintptr_t)i2 - address_decrement);
        i3 = (const uint8_t*)((uintptr_t)i3 - address_decrement);
        i4 = (const uint8_t*)((uintptr_t)i4 - address_decrement);
        i5 = (const uint8_t*)((uintptr_t)i5 - address_decrement);
        i6 = (const uint8_t*)((uintptr_t)i6 - address_decrement);
        i7 = (const uint8_t*)((uintptr_t)i7 - address_decrement);
        const __m128i vshift = _mm_cvtsi32_si128(8 * address_decrement);

        const __m128i vi0 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i0), vshift);
        const __m128i vi1 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i1), vshift);
        const __m128i vi2 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i2), vshift);
        const __m128i vi3 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i3), vshift);
        const __m128i vi4 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i4), vshift);
        const __m128i vi5 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i5), vshift);
        const __m128i vi6 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i6), vshift);
        const __m128i vi7 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i7), vshift);
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
      }
    }

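    /*
     * Final pass: accumulate the last 1-8 pooling elements, then requantize
     * the 32-bit sums and write the uint8 output pixel.
     */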
    {
      const uint8_t* i0 = input[0];
      const uint8_t* i1 = input[1];
      const uint8_t* i2 = input[2];
      const uint8_t* i3 = input[3];
      const uint8_t* i4 = input[4];
      const uint8_t* i5 = input[5];
      const uint8_t* i6 = input[6];
      const uint8_t* i7 = input[7];
      input = (const uint8_t**)((uintptr_t)input + input_increment);
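      /* When fewer than 8 pooling elements remain (m < 8), the unused row
       * pointers are redirected to the caller-provided `zero` vector so the
       * loads below stay valid. */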
      if (m < 2) {
        i1 = zero;
      }
      if (m <= 2) {
        i2 = zero;
      }
      if (m < 4) {
        i3 = zero;
      }
      if (m <= 4) {
        i4 = zero;
      }
      if (m < 6) {
        i5 = zero;
      }
      if (m <= 6) {
        i6 = zero;
      }
      if (m != 8) {
        i7 = zero;
      }

      size_t k = kc;
      int32_t* acc = buffer;
      while (k >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0);
        i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1);
        i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2);
        i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3);
        i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4);
        i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5);
        i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6);
        i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*)i7);
        i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);
        acc += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

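        /* Requantize: scale the 32-bit sums in float, round back to int32,
         * pack to int16 and add the output zero point (both saturating),
         * pack to uint8, and clamp to [output_min, output_max]. */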
        const __m128 vacc_lo_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_lo), vscale);
        const __m128 vacc_hi_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_hi), vscale);

        const __m128i vscaled_lo = _mm_cvtps_epi32(vacc_lo_f);
        const __m128i vscaled_hi = _mm_cvtps_epi32(vacc_hi_f);

        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_max));
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_min));

        _mm_storel_epi64((__m128i*)output, vout);
        output += 8;

        k -= 8;
      }
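      /* Last 1-7 channels: the same overlapping-load trick as in the earlier
       * passes, followed by requantization and a partial store. */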
      if (k != 0) {
        const size_t address_decrement = 8 - k;
        i0 = (const uint8_t*)((uintptr_t)i0 - address_decrement);
        i1 = (const uint8_t*)((uintptr_t)i1 - address_decrement);
        i2 = (const uint8_t*)((uintptr_t)i2 - address_decrement);
        i3 = (const uint8_t*)((uintptr_t)i3 - address_decrement);
        i4 = (const uint8_t*)((uintptr_t)i4 - address_decrement);
        i5 = (const uint8_t*)((uintptr_t)i5 - address_decrement);
        i6 = (const uint8_t*)((uintptr_t)i6 - address_decrement);
        i7 = (const uint8_t*)((uintptr_t)i7 - address_decrement);
        const __m128i vshift = _mm_cvtsi32_si128(8 * address_decrement);

        const __m128i vi0 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i0), vshift);
        const __m128i vi1 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i1), vshift);
        const __m128i vi2 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i2), vshift);
        const __m128i vi3 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i3), vshift);
        const __m128i vi4 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i4), vshift);
        const __m128i vi5 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i5), vshift);
        const __m128i vi6 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i6), vshift);
        const __m128i vi7 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i7), vshift);
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        const __m128 vacc_lo_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_lo), vscale);
        const __m128 vacc_hi_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_hi), vscale);

        const __m128i vscaled_lo = _mm_cvtps_epi32(vacc_lo_f);
        const __m128i vscaled_hi = _mm_cvtps_epi32(vacc_hi_f);

        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_max));
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_min));

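        /* Write the remaining k (1-7) output bytes with 4-, 2-, and 1-byte
         * stores, shifting the consumed lanes out of vout between stores. */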
        if (k & 4) {
          *((uint32_t*)output) = (uint32_t)_mm_cvtsi128_si32(vout);
          output += 4;
          vout = _mm_srli_epi64(vout, 32);
        }
        if (k & 2) {
          *((uint16_t*)output) = (uint16_t)_mm_extract_epi16(vout, 0);
          output += 2;
          vout = _mm_srli_epi32(vout, 16);
        }
        if (k & 1) {
          *((uint8_t*)output) = (uint8_t)_mm_cvtsi128_si32(vout);
          output += 1;
        }
      }
    }
    output = (uint8_t*)((uintptr_t)output + output_increment);
  } while (--n != 0);
}