// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <emmintrin.h>

#include <xnnpack/avgpool.h>
#include <xnnpack/unaligned.h>

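// 9p8x multi-pass average pooling microkernel: the first pass accumulates 9
// input rows into 32-bit partial sums in `buffer` (with the bias folded in),
// each intermediate pass adds 8 more rows, and the final pass adds the last
// 1-8 rows, requantizes, and stores uint8 outputs. The averaging scale is
// assumed to be pre-encoded into the multiplier/rounding/right-shift params
// by the params initializer.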
void xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const uint8_t** input,
    size_t input_offset,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
  const __m128i vzero = _mm_setzero_si128();
  const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
  const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
  const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);

  do {
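    // First pass: accumulate the first 9 rows of the pooling window.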
    {
      const uint8_t* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      const uint8_t* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      const uint8_t* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      const uint8_t* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      const uint8_t* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      const uint8_t* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      const uint8_t* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      const uint8_t* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }
      const uint8_t* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
      }

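      // Zero-extend 8 channels per row to 16 bits, sum the 9 rows in a
      // balanced tree, widen to 32 bits, add the bias, and store the partial
      // sums to the buffer.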
      int32_t* b = buffer;
      for (size_t c = 0; c < channels; c += 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
        const __m128i vi8 = _mm_loadl_epi64((const __m128i*) i8); i8 += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
        const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);

        const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
        const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);

        const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
        const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*) b, vacc_lo);
        _mm_store_si128((__m128i*) b + 1, vacc_hi);
        b += 8;
      }
    }

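    // Intermediate passes: add 8 more rows into the 32-bit buffer until at
    // most 8 kernel elements remain.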
    size_t k = kernel_elements;
    for (k -= 9; k > 8; k -= 8) {
      const uint8_t* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      const uint8_t* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      const uint8_t* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      const uint8_t* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      const uint8_t* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      const uint8_t* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      const uint8_t* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      const uint8_t* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }

      int32_t* b = buffer;
      for (size_t c = 0; c < channels; c += 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
        __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*) b, vacc_lo);
        _mm_store_si128((__m128i*) b + 1, vacc_hi);
        b += 8;
      }
    }

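    // Final pass: add the last 1-8 rows, then requantize and store the uint8
    // outputs.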
    {
      const uint8_t* i0 = input[0];
      assert(i0 != NULL);
      const uint8_t* i1 = input[1];
      const uint8_t* i2 = input[2];
      const uint8_t* i3 = input[3];
      const uint8_t* i4 = input[4];
      const uint8_t* i5 = input[5];
      const uint8_t* i6 = input[6];
      const uint8_t* i7 = input[7];
      input = (const uint8_t**) ((uintptr_t) input + input_increment);
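      // Only k (1-8) kernel elements remain: redirect the unused row pointers
      // to the zero vector so they contribute nothing to the sums.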
      if (k < 2) {
        i1 = zero;
      }
      assert(i1 != NULL);
      if (k <= 2) {
        i2 = zero;
      }
      assert(i2 != NULL);
      if (k < 4) {
        i3 = zero;
      }
      assert(i3 != NULL);
      if (k <= 4) {
        i4 = zero;
      }
      assert(i4 != NULL);
      if (k < 6) {
        i5 = zero;
      }
      assert(i5 != NULL);
      if (k <= 6) {
        i6 = zero;
      }
      assert(i6 != NULL);
      if (k < 8) {
        i7 = zero;
      }
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }

      size_t c = channels;
      int32_t* b = buffer;
      while (c >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
        __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);
        b += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

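        // Requantize: multiply |acc| by the 32-bit multiplier as 32x32->64-bit
        // unsigned products (even lanes, then odd lanes), add the rounding
        // term, shift right, re-interleave the lanes, and restore the sign.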
        const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
        const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);

        const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);

        const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
        const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));

        const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
        const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);

        const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
        const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);

        const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
        const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
        const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
        const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);

        const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
        const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));

        const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
        const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));

        const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);

        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
        vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));

        _mm_storel_epi64((__m128i*) output, vout);
        output += 8;

        c -= 8;
      }
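      // Remainder: up to 7 trailing channels, computed with the same
      // arithmetic as the main loop but stored piecewise. The 8-byte loads
      // may read past the last channel, which XNN_OOB_READS permits.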
      if (c != 0) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7);
        __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
        __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
        const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);

        const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);

        const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
        const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));

        const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
        const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);

        const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
        const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);

        const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
        const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
        const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
        const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);

        const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
        const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));

        const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
        const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));

        const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);

        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
        vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));

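        // Store the low 4, 2, and/or 1 result bytes, shifting consumed lanes
        // out of the vector after each partial store.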
423         if (c & 4) {
424           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout));
425           output += 4;
426           vout = _mm_srli_epi64(vout, 32);
427         }
428         if (c & 2) {
429           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout, 0));
430           output += 2;
431           vout = _mm_srli_epi32(vout, 16);
432         }
433         if (c & 1) {
434           *output = (uint8_t) _mm_cvtsi128_si32(vout);
435           output += 1;
436         }
437       }
438     }
439     output = (uint8_t*) ((uintptr_t) output + output_increment);
440   } while (--output_pixels != 0);
441 }
442