/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"

#if CONFIG_VP9_HIGHBITDEPTH
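// One pass of the 8x8 Hadamard transform on 32-bit coefficients: three
// butterfly stages across the eight input vectors. For the first pass
// (iter == 0) the block is also transposed in registers so the second pass
// (iter == 1) can reuse the same butterfly code for the other dimension.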
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}

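// 8x8 Hadamard transform of a high-bitdepth residual block: load eight rows
// of eight 16-bit differences, sign extend them to 32 bits, apply the two
// transform passes, and store the 64 coefficients to coeff.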
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                  tran_low_t *coeff) {
  __m128i src16[8];
  __m256i src32[8];

  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));

  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
  src32[7] = _mm256_cvtepi16_epi32(src16[7]);

  highbd_hadamard_col8_avx2(src32, 0);
  highbd_hadamard_col8_avx2(src32, 1);

  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
}

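// 16x16 Hadamard: transform the four 8x8 sub-blocks, then combine them with
// one more butterfly stage. The >> 1 halves the intermediate sums to match
// the normalization of vpx_highbd_hadamard_16x16_c.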
void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 1);
    b1 = _mm256_srai_epi32(b1, 1);
    b2 = _mm256_srai_epi32(b2, 1);
    b3 = _mm256_srai_epi32(b3, 1);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}

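// 32x32 Hadamard: transform the four 16x16 sub-blocks, then combine them with
// a final butterfly stage, with the >> 2 matching the normalization of
// vpx_highbd_hadamard_32x32_c.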
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

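// Widens a vector of sixteen 16-bit values to two vectors of 32-bit values by
// interleaving each element with its sign mask. The unpacks work per 128-bit
// lane; the matching _mm256_packs_epi32() in the caller restores the original
// element order.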
static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
                                                   __m256i *out_lo,
                                                   __m256i *out_hi) {
  const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
  *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
  *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
}

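// Same butterfly/transpose pass as highbd_hadamard_col8_avx2(), but on 16-bit
// coefficients. Each 128-bit lane holds its own 8x8 block, so one call
// transforms two blocks at a time.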
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi16(a0, a1);
  __m256i b1 = _mm256_sub_epi16(a0, a1);
  __m256i b2 = _mm256_add_epi16(a2, a3);
  __m256i b3 = _mm256_sub_epi16(a2, a3);
  __m256i b4 = _mm256_add_epi16(a4, a5);
  __m256i b5 = _mm256_sub_epi16(a4, a5);
  __m256i b6 = _mm256_add_epi16(a6, a7);
  __m256i b7 = _mm256_sub_epi16(a6, a7);

  a0 = _mm256_add_epi16(b0, b2);
  a1 = _mm256_add_epi16(b1, b3);
  a2 = _mm256_sub_epi16(b0, b2);
  a3 = _mm256_sub_epi16(b1, b3);
  a4 = _mm256_add_epi16(b4, b6);
  a5 = _mm256_add_epi16(b5, b7);
  a6 = _mm256_sub_epi16(b4, b6);
  a7 = _mm256_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi16(a0, a4);
    b7 = _mm256_add_epi16(a1, a5);
    b3 = _mm256_add_epi16(a2, a6);
    b4 = _mm256_add_epi16(a3, a7);
    b2 = _mm256_sub_epi16(a0, a4);
    b6 = _mm256_sub_epi16(a1, a5);
    b1 = _mm256_sub_epi16(a2, a6);
    b5 = _mm256_sub_epi16(a3, a7);

    a0 = _mm256_unpacklo_epi16(b0, b1);
    a1 = _mm256_unpacklo_epi16(b2, b3);
    a2 = _mm256_unpackhi_epi16(b0, b1);
    a3 = _mm256_unpackhi_epi16(b2, b3);
    a4 = _mm256_unpacklo_epi16(b4, b5);
    a5 = _mm256_unpacklo_epi16(b6, b7);
    a6 = _mm256_unpackhi_epi16(b4, b5);
    a7 = _mm256_unpackhi_epi16(b6, b7);

    b0 = _mm256_unpacklo_epi32(a0, a1);
    b1 = _mm256_unpacklo_epi32(a4, a5);
    b2 = _mm256_unpackhi_epi32(a0, a1);
    b3 = _mm256_unpackhi_epi32(a4, a5);
    b4 = _mm256_unpacklo_epi32(a2, a3);
    b5 = _mm256_unpacklo_epi32(a6, a7);
    b6 = _mm256_unpackhi_epi32(a2, a3);
    b7 = _mm256_unpackhi_epi32(a6, a7);

    in[0] = _mm256_unpacklo_epi64(b0, b1);
    in[1] = _mm256_unpackhi_epi64(b0, b1);
    in[2] = _mm256_unpacklo_epi64(b2, b3);
    in[3] = _mm256_unpackhi_epi64(b2, b3);
    in[4] = _mm256_unpacklo_epi64(b4, b5);
    in[5] = _mm256_unpackhi_epi64(b4, b5);
    in[6] = _mm256_unpacklo_epi64(b6, b7);
    in[7] = _mm256_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm256_add_epi16(a0, a4);
    in[7] = _mm256_add_epi16(a1, a5);
    in[3] = _mm256_add_epi16(a2, a6);
    in[4] = _mm256_add_epi16(a3, a7);
    in[2] = _mm256_sub_epi16(a0, a4);
    in[6] = _mm256_sub_epi16(a1, a5);
    in[1] = _mm256_sub_epi16(a2, a6);
    in[5] = _mm256_sub_epi16(a3, a7);
  }
}

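// 8x8 Hadamard of two horizontally adjacent blocks: each 256-bit load picks up
// one row from both blocks (left block in the low lane, right block in the
// high lane). The final lane permutes separate the two blocks so the 64
// coefficients of each are stored contiguously.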
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}

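// 16x16 Hadamard built from two 8x8x2 passes plus a combining butterfly with
// >> 1 normalization. With is_final == 0 the result stays in 16-bit form
// (written through coeff16) for reuse by vpx_hadamard_32x32_avx2(); with
// is_final == 1 it is written as tran_low_t via store_tran_low().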
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}

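// Standalone 16x16 Hadamard: write the coefficients directly as tran_low_t.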
void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
}

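// 32x32 Hadamard: four 16x16 transforms kept in 16-bit form, combined with a
// final butterfly stage. The cross-block sums are widened to 32 bits so the
// additions cannot overflow, shifted right by 2, and packed back to 16 bits
// with saturation before store_tran_low().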
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
      b3_lo;
  __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
      b3_hi;
  __m256i b0, b1, b2, b3;
  const __m256i zero = _mm256_setzero_si256();
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    // Sign extend 16 bit to 32 bit.
    sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
    sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
    sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
    sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);

    b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
    b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);

    b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
    b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);

    b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
    b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);

    b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
    b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);

    b0_lo = _mm256_srai_epi32(b0_lo, 2);
    b1_lo = _mm256_srai_epi32(b1_lo, 2);
    b2_lo = _mm256_srai_epi32(b2_lo, 2);
    b3_lo = _mm256_srai_epi32(b3_lo, 2);

    b0_hi = _mm256_srai_epi32(b0_hi, 2);
    b1_hi = _mm256_srai_epi32(b1_hi, 2);
    b2_hi = _mm256_srai_epi32(b2_hi, 2);
    b3_hi = _mm256_srai_epi32(b3_hi, 2);

    b0 = _mm256_packs_epi32(b0_lo, b0_hi);
    b1 = _mm256_packs_epi32(b1_lo, b1_hi);
    b2 = _mm256_packs_epi32(b2_lo, b2_hi);
    b3 = _mm256_packs_epi32(b3_lo, b3_hi);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}

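// Sum of absolute values of the transform coefficients. _mm256_madd_epi16()
// with a vector of ones widens the 16-bit absolute values into 32-bit partial
// sums, which are reduced with a horizontal add at the end.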
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i one = _mm256_set1_epi16(1);
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 16) {
    const __m256i src_line = load_tran_low(coeff);
    const __m256i abs = _mm256_abs_epi16(src_line);
    const __m256i sum = _mm256_madd_epi16(abs, one);
    accum = _mm256_add_epi32(accum, sum);
    coeff += 16;
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
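// High-bitdepth SATD: the coefficients are already 32-bit, so accumulate
// _mm256_abs_epi32() directly and reduce at the end.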
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i abs = _mm256_abs_epi32(src_line);
    accum = _mm256_add_epi32(accum, abs);
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH