/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <assert.h>

#include <emmintrin.h>

#include <qnnpack/q8avgpool.h>

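/*
 * Multi-pass average-pooling micro-kernel (SSE2). Each output pixel averages
 * ks > 9 pooling elements over kc >= 8 channels: the first pass sums 9
 * elements into the int32 accumulator buffer (seeded with the bias), each
 * intermediate pass adds 8 more, and the final pass of up to 8 elements
 * scales, requantizes, and clamps the accumulators to uint8 output.
 */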
void pytorch_q8avgpool_ukernel_mp8x9p8q__sse2(
    size_t n,
    size_t ks,
    size_t kc,
    const uint8_t** input,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union pytorch_qnnp_avgpool_quantization_params
        quantization_params[RESTRICT_STATIC 1]) {
  assert(n != 0);
  assert(ks > 9);
  assert(kc >= 8);

  const __m128i vbias =
      _mm_load_si128((const __m128i*)&quantization_params->sse2.bias);
  const __m128i vzero = _mm_setzero_si128();
  const __m128 vscale = _mm_loadu_ps(quantization_params->sse2.scale);

  do {
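    /* First pass: sum pooling elements 0-8 into the accumulator buffer. */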
    {
      const uint8_t* i0 = *input++;
      const uint8_t* i1 = *input++;
      const uint8_t* i2 = *input++;
      const uint8_t* i3 = *input++;
      const uint8_t* i4 = *input++;
      const uint8_t* i5 = *input++;
      const uint8_t* i6 = *input++;
      const uint8_t* i7 = *input++;
      const uint8_t* i8 = *input++;

      size_t k = kc;
      int32_t* acc = buffer;
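      /* Process 8 channels per iteration: widen uint8 to uint16, sum the 9
       * rows in a balanced tree, then widen to int32 and add the bias. */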
      while (k >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0);
        i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1);
        i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2);
        i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3);
        i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4);
        i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5);
        i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6);
        i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*)i7);
        i7 += 8;
        const __m128i vi8 = _mm_loadl_epi64((const __m128i*)i8);
        i8 += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
        const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);

        const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
        const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);

        const __m128i vacc_lo =
            _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
        const __m128i vacc_hi =
            _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
        acc += 8;

        k -= 8;
      }
      if (k != 0) {
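        /* Remainder of 1-7 channels: back each pointer up by 8 - k bytes so
         * the 8-byte load ends at the row's last byte, then shift the vector
         * right to drop the low bytes that were already processed above. */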
        const size_t address_decrement = 8 - k;
        i0 = (const uint8_t*)((uintptr_t)i0 - address_decrement);
        i1 = (const uint8_t*)((uintptr_t)i1 - address_decrement);
        i2 = (const uint8_t*)((uintptr_t)i2 - address_decrement);
        i3 = (const uint8_t*)((uintptr_t)i3 - address_decrement);
        i4 = (const uint8_t*)((uintptr_t)i4 - address_decrement);
        i5 = (const uint8_t*)((uintptr_t)i5 - address_decrement);
        i6 = (const uint8_t*)((uintptr_t)i6 - address_decrement);
        i7 = (const uint8_t*)((uintptr_t)i7 - address_decrement);
        i8 = (const uint8_t*)((uintptr_t)i8 - address_decrement);
        const __m128i vshift = _mm_cvtsi32_si128(8 * address_decrement);

        const __m128i vi0 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i0), vshift);
        const __m128i vi1 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i1), vshift);
        const __m128i vi2 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i2), vshift);
        const __m128i vi3 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i3), vshift);
        const __m128i vi4 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i4), vshift);
        const __m128i vi5 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i5), vshift);
        const __m128i vi6 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i6), vshift);
        const __m128i vi7 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i7), vshift);
        const __m128i vi8 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i8), vshift);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
        const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);

        const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
        const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);

        const __m128i vacc_lo =
            _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
        const __m128i vacc_hi =
            _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
      }
    }

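    /* Intermediate passes: add 8 more pooling elements per pass into the
     * accumulator buffer until at most 8 elements remain. */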
    size_t m = ks;
    for (m -= 9; m > 8; m -= 8) {
      const uint8_t* i0 = *input++;
      const uint8_t* i1 = *input++;
      const uint8_t* i2 = *input++;
      const uint8_t* i3 = *input++;
      const uint8_t* i4 = *input++;
      const uint8_t* i5 = *input++;
      const uint8_t* i6 = *input++;
      const uint8_t* i7 = *input++;

      size_t k = kc;
      int32_t* acc = buffer;
      while (k >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0);
        i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1);
        i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2);
        i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3);
        i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4);
        i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5);
        i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6);
        i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*)i7);
        i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
        acc += 8;

        k -= 8;
      }
      if (k != 0) {
        const size_t address_decrement = 8 - k;
        i0 = (const uint8_t*)((uintptr_t)i0 - address_decrement);
        i1 = (const uint8_t*)((uintptr_t)i1 - address_decrement);
        i2 = (const uint8_t*)((uintptr_t)i2 - address_decrement);
        i3 = (const uint8_t*)((uintptr_t)i3 - address_decrement);
        i4 = (const uint8_t*)((uintptr_t)i4 - address_decrement);
        i5 = (const uint8_t*)((uintptr_t)i5 - address_decrement);
        i6 = (const uint8_t*)((uintptr_t)i6 - address_decrement);
        i7 = (const uint8_t*)((uintptr_t)i7 - address_decrement);
        const __m128i vshift = _mm_cvtsi32_si128(8 * address_decrement);

        const __m128i vi0 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i0), vshift);
        const __m128i vi1 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i1), vshift);
        const __m128i vi2 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i2), vshift);
        const __m128i vi3 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i3), vshift);
        const __m128i vi4 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i4), vshift);
        const __m128i vi5 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i5), vshift);
        const __m128i vi6 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i6), vshift);
        const __m128i vi7 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i7), vshift);
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*)acc, vacc_lo);
        _mm_store_si128((__m128i*)acc + 1, vacc_hi);
      }
    }

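    /* Final pass: consume the last 1-8 pooling elements, requantize, and
     * store the uint8 outputs. */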
    {
      const uint8_t* i0 = input[0];
      const uint8_t* i1 = input[1];
      const uint8_t* i2 = input[2];
      const uint8_t* i3 = input[3];
      const uint8_t* i4 = input[4];
      const uint8_t* i5 = input[5];
      const uint8_t* i6 = input[6];
      const uint8_t* i7 = input[7];
      input = (const uint8_t**)((uintptr_t)input + input_increment);
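      /* m is the number of elements left (1-8); point the unused rows at
       * the zero vector so they contribute nothing to the sums. */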
      if (m < 2) {
        i1 = zero;
      }
      if (m <= 2) {
        i2 = zero;
      }
      if (m < 4) {
        i3 = zero;
      }
      if (m <= 4) {
        i4 = zero;
      }
      if (m < 6) {
        i5 = zero;
      }
      if (m <= 6) {
        i6 = zero;
      }
      if (m != 8) {
        i7 = zero;
      }

      size_t k = kc;
      int32_t* acc = buffer;
      while (k >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0);
        i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1);
        i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2);
        i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3);
        i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4);
        i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5);
        i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6);
        i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*)i7);
        i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);
        acc += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

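        /* Requantize: scale in float, round-convert to int32, pack to int16
         * with saturation, add the output zero point, pack to uint8, and
         * clamp to [output_min, output_max]. */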
        const __m128 vacc_lo_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_lo), vscale);
        const __m128 vacc_hi_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_hi), vscale);

        const __m128i vscaled_lo = _mm_cvtps_epi32(vacc_lo_f);
        const __m128i vscaled_hi = _mm_cvtps_epi32(vacc_hi_f);

        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_max));
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_min));

        _mm_storel_epi64((__m128i*)output, vout);
        output += 8;

        k -= 8;
      }
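      /* Channel remainder: reuse the back-up-and-shift load from the first
       * pass, then requantize and store the final 1-7 bytes piecewise. */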
      if (k != 0) {
        const size_t address_decrement = 8 - k;
        i0 = (const uint8_t*)((uintptr_t)i0 - address_decrement);
        i1 = (const uint8_t*)((uintptr_t)i1 - address_decrement);
        i2 = (const uint8_t*)((uintptr_t)i2 - address_decrement);
        i3 = (const uint8_t*)((uintptr_t)i3 - address_decrement);
        i4 = (const uint8_t*)((uintptr_t)i4 - address_decrement);
        i5 = (const uint8_t*)((uintptr_t)i5 - address_decrement);
        i6 = (const uint8_t*)((uintptr_t)i6 - address_decrement);
        i7 = (const uint8_t*)((uintptr_t)i7 - address_decrement);
        const __m128i vshift = _mm_cvtsi32_si128(8 * address_decrement);

        const __m128i vi0 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i0), vshift);
        const __m128i vi1 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i1), vshift);
        const __m128i vi2 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i2), vshift);
        const __m128i vi3 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i3), vshift);
        const __m128i vi4 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i4), vshift);
        const __m128i vi5 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i5), vshift);
        const __m128i vi6 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i6), vshift);
        const __m128i vi7 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i7), vshift);
        __m128i vacc_lo = _mm_load_si128((const __m128i*)acc);
        __m128i vacc_hi = _mm_load_si128((const __m128i*)acc + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        const __m128 vacc_lo_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_lo), vscale);
        const __m128 vacc_hi_f = _mm_mul_ps(_mm_cvtepi32_ps(vacc_hi), vscale);

        const __m128i vscaled_lo = _mm_cvtps_epi32(vacc_lo_f);
        const __m128i vscaled_hi = _mm_cvtps_epi32(vacc_hi_f);

        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_max));
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)&quantization_params->sse2.output_min));

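        /* Store the low k bytes of vout: 4, then 2, then 1 at a time. */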
        if (k & 4) {
          *((uint32_t*)output) = (uint32_t)_mm_cvtsi128_si32(vout);
          output += 4;
          vout = _mm_srli_epi64(vout, 32);
        }
        if (k & 2) {
          *((uint16_t*)output) = (uint16_t)_mm_extract_epi16(vout, 0);
          output += 2;
          vout = _mm_srli_epi32(vout, 16);
        }
        if (k & 1) {
          *((uint8_t*)output) = (uint8_t)_mm_cvtsi128_si32(vout);
          output += 1;
        }
      }
    }
    output = (uint8_t*)((uintptr_t)output + output_increment);
  } while (--n != 0);
}