1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <immintrin.h>
12 #include <string.h>
13
14 #include "config/av1_rtcd.h"
15
16 #include "aom_dsp/x86/convolve.h"
17 #include "aom_dsp/x86/convolve_avx2.h"
18 #include "aom_dsp/x86/synonyms.h"
19
20 // -----------------------------------------------------------------------------
21 // Copy and average
22
// Byte-shuffle patterns (identical in both 128-bit lanes) that duplicate
// overlapping 16-bit pixel pairs so a single _mm256_shuffle_epi8 produces
// the input windows for filter taps 2/3 (f2f3) and taps 4/5 (f4f5).  Used
// by the 4-tap horizontal odd-height tail paths below.
static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                             7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                             4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
                                             8, 9, 10, 11, 10, 11, 12, 13,
                                             4, 5, 6, 7, 6, 7, 8, 9,
                                             8, 9, 10, 11, 10, 11, 12, 13 };
30
31 void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
32 uint16_t *dst, int dst_stride, int w, int h,
33 const InterpFilterParams *filter_params_x,
34 const int subpel_x_qn,
35 ConvolveParams *conv_params, int bd);
36 void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
37 uint16_t *dst, int dst_stride, int w, int h,
38 const InterpFilterParams *filter_params_y,
39 const int subpel_y_qn, int bd);
40
// Vertical single-reference convolution for high bit-depth pixels (AVX2).
// Applies an up-to-8-tap vertical subpel filter to a w x h block, rounds by
// FILTER_BITS, and clamps results to the pixel range for bd (8/10/12 bit).
// Columns are processed 8 at a time and rows two at a time: consecutive row
// pairs are packed into the two 128-bit lanes of one register, and the six
// previously packed pairs are reused across iterations of the row loop.
void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
  // The 12-tap filter is not handled by this kernel; defer to SSSE3.
  if (filter_params_y->taps == 12) {
    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, bd);
    return;
  }
  int i, j;
  // Filter origin: back the source pointer up so the tap window is centered.
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;

  __m256i s[8], coeffs_y[4];

  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

  for (j = 0; j < w; j += 8) {
    const uint16_t *data = &src_ptr[j];
    /* Vertical filter */
    {
      // sXY holds row X in the low lane and row Y in the high lane, so each
      // register feeds two adjacent output rows at once.
      __m256i src6;
      __m256i s01 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          0x20);
      __m256i s12 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          0x20);
      __m256i s23 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          0x20);
      __m256i s34 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          0x20);
      __m256i s45 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          0x20);
      // Row 6 is kept in src6 so the inner loop can build s67 without
      // reloading it.
      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      __m256i s56 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          src6, 0x20);

      // Interleave row pairs: s[0..2] cover the low 4 pixels, s[4..6] the
      // high 4; s[3]/s[7] are filled per-iteration below.
      s[0] = _mm256_unpacklo_epi16(s01, s12);
      s[1] = _mm256_unpacklo_epi16(s23, s34);
      s[2] = _mm256_unpacklo_epi16(s45, s56);

      s[4] = _mm256_unpackhi_epi16(s01, s12);
      s[5] = _mm256_unpackhi_epi16(s23, s34);
      s[6] = _mm256_unpackhi_epi16(s45, s56);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        const __m256i s67 = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

        const __m256i s78 = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi16(s67, s78);
        s[7] = _mm256_unpackhi_epi16(s67, s78);

        // Filter the low 4 pixels of both rows, then round by FILTER_BITS.
        const __m256i res_a = convolve(s, coeffs_y);

        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          // Full 8-wide column: also filter the high 4 pixels, pack to
          // 16-bit, clamp, and store both rows.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          // 4-wide block: store 64 bits (4 pixels) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          // 2-wide block: store 32 bits (2 pixels) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32(&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

        // Slide the tap window down by two rows for the next iteration.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
185
// Horizontal single-reference convolution for high bit-depth pixels (AVX2).
// Applies an up-to-8-tap horizontal subpel filter to a w x h block using
// two-stage rounding (conv_params->round_0 first, then the remaining
// FILTER_BITS - round_0), then clamps to the pixel range for bd.
// Columns are processed 8 at a time and rows two at a time.
void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  // The 12-tap filter is not handled by this kernel; defer to SSSE3.
  if (filter_params_x->taps == 12) {
    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  int i, j;
  // Filter origin: back the source pointer up so the tap window is centered.
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[4], coeffs_x[4];

  // Stage 1 rounding constants (round_0).
  const __m256i round_const_x =
      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  // Stage 2 rounding constants (remaining filter precision).
  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    for (i = 0; i < h; i += 2) {
      // Two 16-pixel rows; r0/r1 pair the matching halves of both rows so
      // each lane filters one output row.
      const __m256i row0 =
          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      __m256i row1 =
          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

      // even pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 0);
      s[1] = _mm256_alignr_epi8(r1, r0, 4);
      s[2] = _mm256_alignr_epi8(r1, r0, 8);
      s[3] = _mm256_alignr_epi8(r1, r0, 12);

      __m256i res_even = convolve(s, coeffs_x);
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                  round_shift_x);

      // odd pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 2);
      s[1] = _mm256_alignr_epi8(r1, r0, 6);
      s[2] = _mm256_alignr_epi8(r1, r0, 10);
      s[3] = _mm256_alignr_epi8(r1, r0, 14);

      __m256i res_odd = convolve(s, coeffs_x);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                 round_shift_x);

      // Stage 2: shift down the remaining filter precision.
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
                                  round_shift_bits);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
                                 round_shift_bits);

      // Pack to 16-bit and re-interleave even/odd pixels into order.
      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);

      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
      res = _mm256_min_epi16(res, clip_pixel);
      res = _mm256_max_epi16(res, zero);

      if (w - j > 4) {
        // Full 8-wide column: one 128-bit store per output row.
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else if (w == 4) {
        // 4-wide block: 64-bit store per row.
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else {
        // 2-wide block: 32-bit store per row.
        xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
        xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                     _mm256_extracti128_si256(res, 1));
      }
    }
  }
}
285
286 #define CONV8_ROUNDING_BITS (7)
287
288 // -----------------------------------------------------------------------------
289 // Horizontal and vertical filtering
290
// Byte-shuffle patterns (identical in both 128-bit lanes) for the 8-tap
// horizontal filter: each duplicates overlapping 16-bit pixel pairs so one
// _mm256_shuffle_epi8 yields the windows for one pair of filter taps.
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6, 7, 6, 7, 8, 9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6, 7, 8, 9, 8, 9, 10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

// Permutation for _mm256_permutevar8x32_epi32: selects 32-bit elements 2..5
// of a row into both lanes, shifting the pixels the patterns above address.
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
306
307 // -----------------------------------------------------------------------------
308 // Horizontal Filtering
309
pack_pixels(const __m256i * s,__m256i * p)310 static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
311 const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
312 const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
313 const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
314 const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
315
316 p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
317 p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
318 p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
319 p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
320 }
321
322 // Note:
323 // Shared by 8x2 and 16x1 block
// Note:
// Shared by 8x2 and 16x1 block
// Builds the eight packed signal vectors for two 8-pixel groups by merging
// the per-group pack_pixels() outputs lane-wise.
static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i t[8];
  pack_pixels(s0, t);
  pack_pixels(s1, t + 4);
  x[0] = _mm256_permute2x128_si256(t[0], t[4], 0x20);
  x[1] = _mm256_permute2x128_si256(t[1], t[5], 0x20);
  x[2] = _mm256_permute2x128_si256(t[2], t[6], 0x20);
  x[3] = _mm256_permute2x128_si256(t[3], t[7], 0x20);
  x[6] = _mm256_permute2x128_si256(t[0], t[4], 0x31);
  x[7] = _mm256_permute2x128_si256(t[1], t[5], 0x31);
  // The middle pairings are shared between both halves.
  x[4] = x[2];
  x[5] = x[3];
}
338
// Pack a single 8-wide row for the 8-tap filter (odd-height tail path).
static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i packed[8];
  const __m256i row = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&row, packed);
  x[0] = _mm256_permute2x128_si256(packed[0], packed[2], 0x30);
  x[1] = _mm256_permute2x128_si256(packed[1], packed[3], 0x30);
  x[2] = _mm256_permute2x128_si256(packed[2], packed[0], 0x30);
  x[3] = _mm256_permute2x128_si256(packed[3], packed[1], 0x30);
}
349
// Pack two consecutive 8-wide rows for the 8-tap filter.
static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&row0, &row1, x);
}
357
// Pack one 16-wide row for the 8-tap filter as two 8-pixel groups.
static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  const __m256i lo_half = _mm256_loadu_si256((const __m256i *)src);
  const __m256i hi_half = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&lo_half, &hi_half, x);
}
364
365 // Note:
366 // Shared by horizontal and vertical filtering
// Note:
// Shared by horizontal and vertical filtering
// Broadcast each of the four 16-bit coefficient pairs of an 8-tap filter
// across a full 256-bit register.
static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i raw = _mm_loadu_si128((const __m128i *)filter);
  const __m256i dup =
      _mm256_insertf128_si256(_mm256_castsi128_si256(raw), raw, 1);
  f[0] = _mm256_shuffle_epi8(dup, _mm256_set1_epi32(0x03020100));  // taps 0,1
  f[1] = _mm256_shuffle_epi8(dup, _mm256_set1_epi32(0x07060504));  // taps 2,3
  f[2] = _mm256_shuffle_epi8(dup, _mm256_set1_epi32(0x0b0a0908));  // taps 4,5
  f[3] = _mm256_shuffle_epi8(dup, _mm256_set1_epi32(0x0f0e0d0c));  // taps 6,7
}
379
// Broadcast the two middle coefficient pairs of the filter (elements 2/3
// and 4/5) for the 4-tap filtering paths.
static inline void pack_filters_4tap(const int16_t *filter,
                                     __m256i *f /*f[4]*/) {
  const __m128i raw = _mm_loadu_si128((const __m128i *)filter);
  const __m256i dup = _mm256_broadcastsi128_si256(raw);

  f[0] = _mm256_shuffle_epi32(dup, 0x55);  // coeffs 2 3 2 3 2 3 2 3
  f[1] = _mm256_shuffle_epi32(dup, 0xaa);  // coeffs 4 5 4 5 4 5 4 5
}
390
// Apply the 8-tap filter (as four madd'ed tap pairs) to one packed 8-pixel
// signal and round the 32-bit sums down to CONV8_ROUNDING_BITS precision.
static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  // Outer tap pairs first...
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  // ...then the inner (largest-magnitude) pairs.
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  // min + max of the two products equals their sum; NOTE(review): this
  // split ordering looks deliberate (presumably to control intermediate
  // overflow) -- confirm before simplifying to two plain additions.
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  // Round-to-nearest, then shift down to intermediate precision.
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
417
// Pack 8 32-bit results to unsigned 16-bit, clamp to the bit-depth maximum
// (*mask), and store one 8-pixel row.
static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i lo = _mm256_castsi256_si128(*y);
  const __m128i hi = _mm256_extractf128_si256(*y, 1);
  const __m128i clamped =
      _mm_min_epi16(_mm_packus_epi32(lo, hi), _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, clamped);
}
426
// Pack, clamp, and store two 8-pixel rows (one per 128-bit lane).
static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  __m256i packed = _mm256_packus_epi32(*y0, *y1);
  packed = _mm256_min_epi16(packed, *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(packed));
  _mm_storeu_si128((__m128i *)(dst + pitch),
                   _mm256_extractf128_si256(packed, 1));
}
435
// Pack, clamp, and store one 16-pixel row from two 8-lane results.
static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  __m256i packed = _mm256_packus_epi32(*y0, *y1);
  packed = _mm256_min_epi16(packed, *mask);
  _mm256_storeu_si256((__m256i *)dst, packed);
}
442
// 8-tap horizontal filter over an 8-wide high bit-depth column.  Two rows
// are processed per iteration; a single-row tail handles odd heights.
static void aom_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[8], taps[4], out0, out1;

  pack_filters(filter, taps);

  src_ptr -= 3;  // rewind to the first filter tap
  do {
    pack_8x2_pixels(src_ptr, src_pitch, sig);
    filter_8x1_pixels(sig, taps, &out0);
    filter_8x1_pixels(&sig[4], taps, &out1);
    store_8x2_pixels(&out0, &out1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one remaining row.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, sig);
    filter_8x1_pixels(sig, taps, &out0);
    store_8x1_pixels(&out0, &max, dst_ptr);
  }
}
469
// 8-tap horizontal filter over a 16-wide high bit-depth column, one row
// per iteration.
static void aom_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[8], taps[4], out0, out1;

  pack_filters(filter, taps);

  src_ptr -= 3;  // rewind to the first filter tap
  do {
    pack_16x1_pixels(src_ptr, sig);
    filter_8x1_pixels(sig, taps, &out0);
    filter_8x1_pixels(&sig[4], taps, &out1);
    store_16x1_pixels(&out0, &out1, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
490
// 4-tap horizontal filter over a 4-wide high bit-depth column.  Two rows
// per iteration, with a shuffle-based single-row tail for odd heights.
static void aom_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i;
  // Clamp ceiling for 8/10/12-bit pixels.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Duplicates overlapping pixel pairs so one shuffle yields the 4-tap
  // input windows (same layout as ip_shuffle_f2f3).
  static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                            7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                            4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  // -3 then the +2 in each load gives a net -1, i.e. one pixel before the
  // output position, as required for the 4-tap window.
  src_ptr -= 3;
  for (i = 0; i <= (height - 2); i += 2) {
    // One row per 128-bit lane.
    __m256i row0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    __m256i row1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));

    s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    s[1] = _mm256_alignr_epi8(s[0], s[0], 4);  // shift by 2 pixels

    s[0] = _mm256_shuffle_epi8(s[0], mask);
    s[1] = _mm256_shuffle_epi8(s[1], mask);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    // Pack to 16-bit and clamp to [0, clip_pixel].
    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }
  // Odd height: filter the final row using the f2f3/f4f5 shuffles.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    const __m256i row0_1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
  }
}
561
// 4-tap horizontal filter over an 8-wide high bit-depth column.  Two rows
// per iteration (even/odd pixel split), with a single-row tail for odd
// heights.
static void aom_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i = 0;
  // Clamp ceiling for 8/10/12-bit pixels.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Re-interleaves the packed even/odd filter results back into pixel
  // order (identical in both lanes).
  static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15,
                                            0, 1, 8, 9, 2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  // -3 plus the +2 in each load gives the net -1 offset for the 4-tap
  // window.
  src_ptr -= 3;

  /* Horizontal filter */

  for (i = 0; i <= (height - 2); i += 2) {
    const __m256i row0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    __m256i row1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);

    // r0/r1 pair the matching halves of both rows, one row per lane.
    const __m256i r0 =
        _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

    // even pixels
    s[0] = r0;
    s[1] = _mm256_alignr_epi8(r1, r0, 4);

    __m256i res_even = convolve_4tap(s, ff);
    res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
                                 CONV8_ROUNDING_BITS);

    // odd pixels
    s[0] = _mm256_alignr_epi8(r1, r0, 2);
    s[1] = _mm256_alignr_epi8(r1, r0, 6);

    __m256i res_odd = convolve_4tap(s, ff);
    res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
                                CONV8_ROUNDING_BITS);

    // Pack, restore pixel order, and clamp to [0, clip_pixel].
    __m256i res = _mm256_packs_epi32(res_even, res_odd);
    res = _mm256_shuffle_epi8(res, mask);

    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }

  // Odd height: filter the final row; even pixels land in the low lane,
  // odd pixels in the high lane.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    const __m256i row0_1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    // Store the first 4 pixels, then the remaining 4 at offset +4.
    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
                     _mm256_extracti128_si256(res, 1));
  }
}
651
// 4-tap horizontal filter over a 16-wide column, processed as two
// independent 8-wide halves.
static void aom_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                     height, filter, bd);
  aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
                                     dst_pitch, height, filter, bd);
}
660
661 // -----------------------------------------------------------------------------
662 // 2-tap horizontal filtering
663
// Broadcast the middle coefficient pair (elements 3 and 4, selected by
// bytes 6..9) of the filter for the 2-tap paths.
static inline void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i raw = _mm_loadu_si128((const __m128i *)filter);
  const __m256i dup =
      _mm256_insertf128_si256(_mm256_castsi128_si256(raw), raw, 1);
  f[0] = _mm256_shuffle_epi8(dup, _mm256_set1_epi32(0x09080706));
}
670
671 // can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
672 // the difference is s0/s1 specifies first and second rows or,
673 // first 16 samples and 8-sample shifted 16 samples
pack_16_2t_pixels(const __m256i * s0,const __m256i * s1,__m256i * sig)674 static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
675 __m256i *sig) {
676 const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
677 const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
678 __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
679 __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
680 __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
681 __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
682 r0 = _mm256_shuffle_epi8(r0, sf2);
683 r1 = _mm256_shuffle_epi8(r1, sf2);
684 sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
685 sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
686 }
687
// Pack two consecutive 8-wide rows for the 2-tap filter.
static inline void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&row0, &row1, sig);
}
694
// Pack one 16-wide row for the 2-tap filter as two 8-pixel groups.
static inline void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i lo_half = _mm256_loadu_si256((const __m256i *)src);
  const __m256i hi_half = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&lo_half, &hi_half, sig);
}
701
pack_8x1_2t_pixels(const uint16_t * src,__m256i * sig)702 static inline void pack_8x1_2t_pixels(const uint16_t *src,
703 __m256i *sig /*sig[2]*/) {
704 const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
705 const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
706 __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
707 __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
708 r0 = _mm256_permutevar8x32_epi32(r0, idx);
709 r0 = _mm256_shuffle_epi8(r0, sf2);
710 sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
711 }
712
713 // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
filter_16_2t_pixels(const __m256i * sig,const __m256i * f,__m256i * y0,__m256i * y1)714 static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
715 __m256i *y0, __m256i *y1) {
716 const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
717 __m256i x0 = _mm256_madd_epi16(sig[0], *f);
718 __m256i x1 = _mm256_madd_epi16(sig[1], *f);
719 x0 = _mm256_add_epi32(x0, rounding);
720 x1 = _mm256_add_epi32(x1, rounding);
721 *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
722 *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
723 }
724
filter_8x1_2t_pixels(const __m256i * sig,const __m256i * f,__m256i * y0)725 static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
726 __m256i *y0) {
727 const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
728 __m256i x0 = _mm256_madd_epi16(sig[0], *f);
729 x0 = _mm256_add_epi32(x0, rounding);
730 *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
731 }
732
// 2-tap horizontal filter over an 8-wide high bit-depth column.  Two rows
// per iteration, single-row tail for odd heights.
static void aom_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[2], taps, out0, out1;

  pack_2t_filter(filter, &taps);

  src_ptr -= 3;  // rewind to the first filter tap
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, sig);
    filter_16_2t_pixels(sig, &taps, &out0, &out1);
    store_8x2_pixels(&out0, &out1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one remaining row.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, sig);
    filter_8x1_2t_pixels(sig, &taps, &out0);
    store_8x1_pixels(&out0, &max, dst_ptr);
  }
}
758
// 2-tap horizontal filter over a 16-wide high bit-depth column, one row
// per iteration.
static void aom_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[2], taps, out0, out1;

  pack_2t_filter(filter, &taps);

  src_ptr -= 3;  // rewind to the first filter tap
  do {
    pack_16x1_2t_pixels(src_ptr, sig);
    filter_16_2t_pixels(sig, &taps, &out0, &out1);
    store_16x1_pixels(&out0, &out1, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
778
779 // -----------------------------------------------------------------------------
780 // Vertical Filtering
781
// Prime the vertical 8-tap filter: load rows 0..6 of an 8-wide column and
// pre-interleave pairs (0,1), (2,3), (4,5) into sig[0..2]/sig[4..6].  Row 6
// is parked in sig[8] for pack_8x9_pixels() to consume each iteration.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i r0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i r1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i r2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i r3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i r4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i r5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  const __m256i r6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // Place each row's successor in its own upper 128-bit lane.
  r0 = _mm256_inserti128_si256(r0, _mm256_castsi256_si128(r1), 1);
  r1 = _mm256_inserti128_si256(r1, _mm256_castsi256_si128(r2), 1);
  r2 = _mm256_inserti128_si256(r2, _mm256_castsi256_si128(r3), 1);
  r3 = _mm256_inserti128_si256(r3, _mm256_castsi256_si128(r4), 1);
  r4 = _mm256_inserti128_si256(r4, _mm256_castsi256_si128(r5), 1);
  r5 = _mm256_inserti128_si256(r5, _mm256_castsi256_si128(r6), 1);

  sig[0] = _mm256_unpacklo_epi16(r0, r1);
  sig[4] = _mm256_unpackhi_epi16(r0, r1);
  sig[1] = _mm256_unpacklo_epi16(r2, r3);
  sig[5] = _mm256_unpackhi_epi16(r2, r3);
  sig[2] = _mm256_unpacklo_epi16(r4, r5);
  sig[6] = _mm256_unpackhi_epi16(r4, r5);
  sig[8] = r6;
}
812
// Slide two fresh rows (7 and 8 below src) into the 8-wide filter state.
static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  const __m256i row7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  const __m256i row8 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  // Pair (row6, row7) and (row7, row8); row6 was cached in sig[8].
  const __m256i p67 =
      _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(row7), 1);
  const __m256i p78 =
      _mm256_inserti128_si256(row7, _mm256_castsi256_si128(row8), 1);
  sig[3] = _mm256_unpacklo_epi16(p67, p78);
  sig[7] = _mm256_unpackhi_epi16(p67, p78);
  // Cache row 8 so the next call can pair it with row 9.
  sig[8] = row8;
}
827
// Run the 8-tap column pass on both halves of the interleaved state,
// producing two output rows (y0 and y1) with the same taps.
static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(&sig[0], f, y0);
  filter_8x1_pixels(&sig[4], f, y1);
}
833
// Advance the sliding window: shift both interleaved halves down by one
// row-pair so the next iteration only has to pack the two newest rows.
static inline void update_pixels(__m256i *sig) {
  for (int k = 0; k < 3; ++k) {
    sig[k] = sig[k + 1];
    sig[k + 4] = sig[k + 5];
  }
}
841
// 8-tap vertical filter for 8-wide columns, high bit-depth.
// Produces two output rows per iteration via a 9-row sliding window.
// NOTE(review): height is assumed even (callers pass even block heights);
// an odd height would wrap the unsigned counter.
static void aom_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[9];
  __m256i taps[4];

  pack_filters(filter, taps);
  // Prime the window with the first 7 source rows.
  pack_8x9_init(src_ptr, src_pitch, sig);

  do {
    __m256i lo, hi;
    pack_8x9_pixels(src_ptr, src_pitch, sig);
    filter_8x9_pixels(sig, taps, &lo, &hi);
    store_8x2_pixels(&lo, &hi, &clip, dst_ptr, dst_pitch);
    update_pixels(sig);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
865
// Prime the 16-wide vertical-filter state with source rows 0..6.
// For each pair of consecutive row pairs, the 256-bit rows are split into
// their low/high 128-bit halves and interleaved:
//   sig[k]      / sig[k + 4]  : low halves, lo/hi 16-bit interleave
//   sig[k + 8]  / sig[k + 12] : high halves, lo/hi 16-bit interleave
// sig[16] caches the last raw row for the rolling update.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i rows[7];
  int i;

  // Load rows 0..6 (16 pixels each).
  for (i = 0; i < 7; ++i) {
    rows[i] = _mm256_loadu_si256((const __m256i *)(src + i * pitch));
  }

  for (i = 0; i < 3; ++i) {
    const __m256i a = rows[2 * i];
    const __m256i b = rows[2 * i + 1];
    const __m256i c = rows[2 * i + 2];
    const __m256i u0 = _mm256_permute2x128_si256(a, b, 0x20);  // a, b low
    const __m256i u1 = _mm256_permute2x128_si256(a, b, 0x31);  // a, b high
    const __m256i u2 = _mm256_permute2x128_si256(b, c, 0x20);  // b, c low
    const __m256i u3 = _mm256_permute2x128_si256(b, c, 0x31);  // b, c high

    sig[i] = _mm256_unpacklo_epi16(u0, u2);
    sig[i + 4] = _mm256_unpackhi_epi16(u0, u2);
    sig[i + 8] = _mm256_unpacklo_epi16(u1, u3);
    sig[i + 12] = _mm256_unpackhi_epi16(u1, u3);
  }

  sig[16] = rows[6];
}
915
// Slide two fresh rows (7 and 8 below src) into the 16-wide filter state.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  const __m256i row7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  const __m256i row8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  // Row 6 was cached in sig[16] by the previous step.
  const __m256i u0 = _mm256_permute2x128_si256(sig[16], row7, 0x20);
  const __m256i u1 = _mm256_permute2x128_si256(sig[16], row7, 0x31);
  const __m256i u2 = _mm256_permute2x128_si256(row7, row8, 0x20);
  const __m256i u3 = _mm256_permute2x128_si256(row7, row8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);
  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  // Cache row 8 so the next call can pair it with row 9.
  sig[16] = row8;
}
938
// Filter one 16-wide row pair: four independent 8-pixel column passes over
// the interleaved state, then repack so y0/y1 are the two output rows.
static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i acc[4];
  for (int k = 0; k < 4; ++k) {
    filter_8x1_pixels(&sig[4 * k], f, &acc[k]);
  }

  // Narrow to unsigned 16 bits and restore row order across the lanes.
  const __m256i l0l1 = _mm256_packus_epi32(acc[0], acc[1]);
  const __m256i h0h1 = _mm256_packus_epi32(acc[2], acc[3]);
  *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
  *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
}
954
// Clamp two 16-wide output rows to the bit-depth maximum and store them.
static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  const __m256i row0 = _mm256_min_epi16(*y0, *mask);
  const __m256i row1 = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)dst, row0);
  _mm256_storeu_si256((__m256i *)(dst + pitch), row1);
}
963
// The 16-wide state is two 8-wide states back to back; slide both.
static void update_16x9_pixels(__m256i *sig) {
  update_pixels(sig);
  update_pixels(sig + 8);
}
968
// 8-tap vertical filter for 16-wide columns, high bit-depth.
// Two output rows per iteration via a 9-row sliding window.
// NOTE(review): height is assumed even, matching the other v8 kernels.
static void aom_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[17];
  __m256i taps[4];

  pack_filters(filter, taps);
  // Prime the window with the first 7 source rows.
  pack_16x9_init(src_ptr, src_pitch, sig);

  do {
    __m256i lo, hi;
    pack_16x9_pixels(src_ptr, src_pitch, sig);
    filter_16x9_pixels(sig, taps, &lo, &hi);
    store_16x2_pixels(&lo, &hi, &clip, dst_ptr, dst_pitch);
    update_16x9_pixels(sig);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
991
// 4-tap vertical filter for 4-wide columns, high bit-depth. Processes two
// output rows per iteration, one per 128-bit lane of the 256-bit registers.
static void aom_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Maximum legal pixel value for 8-, 10- or 12-bit content.
  const __m256i clip_pixel =
      _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  uint32_t i;
  __m256i s[2], ff[2];

  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // The 4 taps cover source rows 2..5 for the first output row (rows 0/1
    // and 6/7 of the full 8-tap window are unused by a 4-tap filter).
    __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));

    // s23 = (row2 | row3): one row per 128-bit lane.
    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    // s[0]: 16-bit interleave of the (row2,row3)/(row3,row4) pairs for madd.
    s[0] = _mm256_unpacklo_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      // Two fresh rows slide into the window each iteration.
      __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);

      const __m256i res_a = convolve_4tap(s, ff);

      // Round and shift down by FILTER_BITS.
      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // Clamp to [0, clip_pixel] and narrow to 16 bits.
      __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
      res_16bit = _mm256_max_epi32(res_16bit, zero);
      res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);

      // Low lane -> output row i, high lane -> output row i + 1.
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Roll the window: this iteration's s[1]/s6 become next iteration's
      // s[0]/s4 (s6 at offset 6 is next step's row at offset 4).
      s[0] = s[1];
      s4 = s6;
    }
  }
}
1051
// 4-tap vertical filter for 8-wide columns, high bit-depth. Two output rows
// per iteration; lo/hi 16-bit interleaves give two madd streams per row pair.
static void aom_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Maximum legal pixel value for 8-, 10- or 12-bit content.
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  __m256i s[4], ff[2];
  uint32_t i;
  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // The 4 taps cover source rows 2..5 for the first output row.
    __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));

    // One source row per 128-bit lane.
    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    // s[0]/s[2]: low/high 16-bit interleaves of (row2,row3)/(row3,row4).
    s[0] = _mm256_unpacklo_epi16(s23, s34);
    s[2] = _mm256_unpackhi_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      // Two fresh rows slide into the window each iteration.
      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);
      s[3] = _mm256_unpackhi_epi16(s45, s56);

      // Left 4 pixels of each output row.
      const __m256i res_a = convolve_4tap(s, ff);

      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // Right 4 pixels of each output row.
      const __m256i res_b = convolve_4tap(s + 2, ff);
      __m256i res_b_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

      // Narrow to 16 bits, then clamp to [0, clip_pixel].
      __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
      res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
      res_16bit = _mm256_max_epi16(res_16bit, zero);

      // Low lane -> output row i, high lane -> output row i + 1.
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Roll the window for the next iteration.
      s[0] = s[1];
      s[2] = s[3];
      s4 = s6;
    }
  }
}
1117
// 4-tap vertical filter for 16-wide columns: run the 8-wide kernel on the
// left and right halves independently.
static void aom_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                     height, filter, bd);

  aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
                                     dst_pitch, height, filter, bd);
}
1127
1128 // -----------------------------------------------------------------------------
1129 // 2-tap vertical filtering
1130
// Seed the 16-wide 2-tap vertical state: cache the first source row in
// sig[2]; each later step pairs it with the row below.
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  sig[2] = _mm256_loadu_si256((const __m256i *)src);
}
1134
// Interleave the cached row (sig[2]) with the next source row for the 2-tap
// madd, then roll the cache forward.
static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  const __m256i next = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], next);
  sig[1] = _mm256_unpackhi_epi16(sig[2], next);
  sig[2] = next;
}
1143
// Thin pass-through kept for naming symmetry with the other 2-tap vertical
// helpers; the horizontal and vertical paths share the same madd kernel.
static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}
1148
// Vertical 2-tap filter for 16-wide columns, high bit-depth.
// One output row per iteration; sig[2] carries the previous row across
// iterations so each row is loaded only once.
static void aom_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip = _mm256_set1_epi16((1 << bd) - 1);
  __m256i sig[3];
  __m256i taps;

  pack_2t_filter(filter, &taps);
  pack_16x2_init(src_ptr, sig);

  do {
    __m256i lo, hi;
    pack_16x2_2t_pixels(src_ptr, src_pitch, sig);
    filter_16x2_2t_pixels(sig, &taps, &lo, &hi);
    store_16x1_pixels(&lo, &hi, &clip, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    --height;
  } while (height > 0);
}
1169
// Broadcast the two middle taps (filter[3], filter[4] — bytes 6..9 of the
// 8-tap array) into every 32-bit lane so they can feed _mm_madd_epi16.
static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m128i pick_middle = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(taps, pick_middle);
}
1175
// Seed the 8-wide 2-tap vertical state: cache the first source row in sig[2].
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  sig[2] = _mm_loadu_si128((const __m128i *)src);
}
1179
// Interleave the cached row (sig[2]) with the row below it for the 2-tap
// madd, then roll the cache forward.
static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  const __m128i next = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], next);
  sig[1] = _mm_unpackhi_epi16(sig[2], next);
  sig[2] = next;
}
1188
filter_8_2t_pixels(const __m128i * sig,const __m128i * f,__m128i * y0,__m128i * y1)1189 static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
1190 __m128i *y0, __m128i *y1) {
1191 const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
1192 __m128i x0 = _mm_madd_epi16(sig[0], *f);
1193 __m128i x1 = _mm_madd_epi16(sig[1], *f);
1194 x0 = _mm_add_epi32(x0, rounding);
1195 x1 = _mm_add_epi32(x1, rounding);
1196 *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
1197 *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
1198 }
1199
// Pack the two 32-bit result halves down to unsigned 16 bits, clamp to the
// bit-depth maximum, and store 8 output pixels.
static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  const __m128i packed = _mm_packus_epi32(*y0, *y1);
  _mm_storeu_si128((__m128i *)dst, _mm_min_epi16(packed, *mask));
}
1206
// Vertical 2-tap filter for 8-wide columns, high bit-depth (SSE-width path).
// One output row per iteration; sig[2] carries the previous row forward.
static void aom_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i clip = _mm_set1_epi16((1 << bd) - 1);
  __m128i sig[3];
  __m128i taps;

  pack_8x1_2t_filter(filter, &taps);
  pack_8x2_init(src_ptr, sig);

  do {
    __m128i lo, hi;
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, sig);
    filter_8_2t_pixels(sig, &taps, &lo, &hi);
    store_8x1_2t_pixels_ver(&lo, &hi, &clip, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    --height;
  } while (height > 0);
}
1227
1228 void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
1229 ptrdiff_t, uint32_t, const int16_t *,
1230 int);
1231 void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
1232 ptrdiff_t, uint32_t, const int16_t *,
1233 int);
1234 void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
1235 ptrdiff_t, uint32_t, const int16_t *,
1236 int);
1237 void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
1238 ptrdiff_t, uint32_t, const int16_t *,
1239 int);
1240 #define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
1241 #define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
1242 #define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
1243 #define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
1244
1245 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
1246 HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
1247
1248 #undef HIGHBD_FUNC
1249