xref: /aosp_15_r20/external/libaom/aom_dsp/x86/highbd_convolve_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <immintrin.h>
12 #include <string.h>
13 
14 #include "config/av1_rtcd.h"
15 
16 #include "aom_dsp/x86/convolve.h"
17 #include "aom_dsp/x86/convolve_avx2.h"
18 #include "aom_dsp/x86/synonyms.h"
19 
20 // -----------------------------------------------------------------------------
21 // Copy and average
22 
// Byte-shuffle patterns for the 4-tap horizontal filters: each gathers
// overlapping pairs of 16-bit pixels so _mm256_madd_epi16 can multiply
// adjacent taps. f2f3 collects inputs for filter taps 2/3, f4f5 for taps
// 4/5; each 16-byte pattern is repeated in both 128-bit lanes.
static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                             7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                             4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13,
                                             4, 5, 6,  7,  6,  7,  8,  9,
                                             8, 9, 10, 11, 10, 11, 12, 13 };
30 
// SSSE3 fallbacks, declared here so the AVX2 entry points below can delegate
// 12-tap filtering (which the AVX2 paths do not implement) to them.
void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_x,
                                    const int subpel_x_qn,
                                    ConvolveParams *conv_params, int bd);
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
                                    uint16_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams *filter_params_y,
                                    const int subpel_y_qn, int bd);
40 
// Vertical single-reference high-bitdepth convolution, AVX2. The frame is
// processed in 8-pixel-wide column strips, two output rows per iteration.
// A sliding window of interleaved row pairs is kept in s[] so each source
// row is loaded only once per strip.
void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_y,
                                   const int subpel_y_qn, int bd) {
  // This path handles up-to-8-tap filters only; hand 12-tap off to SSSE3.
  if (filter_params_y->taps == 12) {
    av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_y, subpel_y_qn, bd);
    return;
  }
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  // Step back so the filter window is centered on the output row.
  const uint16_t *const src_ptr = src - fo_vert * src_stride;

  __m256i s[8], coeffs_y[4];

  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);

  for (j = 0; j < w; j += 8) {
    const uint16_t *data = &src_ptr[j];
    /* Vertical filter */
    {
      // sNM holds row N in the low 128-bit lane and row M in the high lane,
      // so one 256-bit madd filters two adjacent output rows at once.
      __m256i src6;
      __m256i s01 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          0x20);
      __m256i s12 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          0x20);
      __m256i s23 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          0x20);
      __m256i s34 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          0x20);
      __m256i s45 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          0x20);
      // Row 6 is carried across iterations so it is loaded only once.
      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      __m256i s56 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          src6, 0x20);

      // s[0..2]/s[4..6] hold the low/high halves of the interleaved pairs;
      // s[3]/s[7] are filled with the newest rows inside the loop.
      s[0] = _mm256_unpacklo_epi16(s01, s12);
      s[1] = _mm256_unpacklo_epi16(s23, s34);
      s[2] = _mm256_unpacklo_epi16(s45, s56);

      s[4] = _mm256_unpackhi_epi16(s01, s12);
      s[5] = _mm256_unpackhi_epi16(s23, s34);
      s[6] = _mm256_unpackhi_epi16(s45, s56);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        const __m256i s67 = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

        const __m256i s78 = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi16(s67, s78);
        s[7] = _mm256_unpackhi_epi16(s67, s78);

        // First four output pixels of both rows.
        const __m256i res_a = convolve(s, coeffs_y);

        __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

        if (w - j > 4) {
          // Remaining four pixels, then pack to 16 bits, clamp to [0, max]
          // and store one 128-bit row per destination line.
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
          res_16bit = _mm256_max_epi16(res_16bit, zero);

          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_16bit));
          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_16bit, 1));
        } else if (w == 4) {
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                           _mm256_castsi256_si128(res_a_round));
          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                           _mm256_extracti128_si256(res_a_round, 1));
        } else {
          // w == 2: store two pixels (32 bits) per row.
          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
          res_a_round = _mm256_max_epi16(res_a_round, zero);

          xx_storel_32(&dst[i * dst_stride + j],
                       _mm256_castsi256_si128(res_a_round));
          xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                       _mm256_extracti128_si256(res_a_round, 1));
        }

        // Slide the window down two rows.
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}
185 
// Horizontal single-reference high-bitdepth convolution, AVX2. Two rows of
// up to eight outputs are produced per iteration; even and odd output pixels
// are filtered separately and re-interleaved before the store.
void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
                                   const int subpel_x_qn,
                                   ConvolveParams *conv_params, int bd) {
  // Only up-to-8-tap filters are handled here; 12-tap goes to SSSE3.
  if (filter_params_x->taps == 12) {
    av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
                                   filter_params_x, subpel_x_qn, conv_params,
                                   bd);
    return;
  }
  int i, j;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  // Step back so the filter window is centered on the output pixel.
  const uint16_t *const src_ptr = src - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[4], coeffs_x[4];

  const __m256i round_const_x =
      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  const int bits = FILTER_BITS - conv_params->round_0;
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  assert(bits >= 0);
  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

  prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    for (i = 0; i < h; i += 2) {
      const __m256i row0 =
          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      __m256i row1 =
          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

      // r0 = low 128-bit halves of both rows, r1 = high halves; alignr on
      // this pair then produces the sliding sample windows for both rows.
      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

      // even pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 0);
      s[1] = _mm256_alignr_epi8(r1, r0, 4);
      s[2] = _mm256_alignr_epi8(r1, r0, 8);
      s[3] = _mm256_alignr_epi8(r1, r0, 12);

      __m256i res_even = convolve(s, coeffs_x);
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                  round_shift_x);

      // odd pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 2);
      s[1] = _mm256_alignr_epi8(r1, r0, 6);
      s[2] = _mm256_alignr_epi8(r1, r0, 10);
      s[3] = _mm256_alignr_epi8(r1, r0, 14);

      __m256i res_odd = convolve(s, coeffs_x);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                 round_shift_x);

      // Second rounding stage brings results back to pixel precision.
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
                                  round_shift_bits);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
                                 round_shift_bits);

      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);

      // Re-interleave even/odd results into pixel order, clamp to [0, max].
      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
      res = _mm256_min_epi16(res, clip_pixel);
      res = _mm256_max_epi16(res, zero);

      if (w - j > 4) {
        // Full 8-wide store: one 128-bit half per destination row.
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else if (w == 4) {
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
                         _mm256_castsi256_si128(res));
        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
                         _mm256_extracti128_si256(res, 1));
      } else {
        // w == 2: store two pixels (32 bits) per row.
        xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
        xx_storel_32(&dst[i * dst_stride + j + dst_stride],
                     _mm256_extracti128_si256(res, 1));
      }
    }
  }
}
285 
286 #define CONV8_ROUNDING_BITS (7)
287 
288 // -----------------------------------------------------------------------------
289 // Horizontal and vertical filtering
290 
// Shuffle tables for the 8-tap horizontal filters. signal_pattern_0/1/2 are
// byte indices (repeated in both 128-bit lanes) that gather overlapping
// 16-bit pixel pairs for _mm256_madd_epi16. signal_index rotates dwords
// 2..5 of a register into both lanes via _mm256_permutevar8x32_epi32 so the
// patterns can also reach the middle pixels of a 16-pixel load.
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
306 
307 // -----------------------------------------------------------------------------
308 // Horizontal Filtering
309 
// Shuffles one register of 16-bit pixels into the four overlapping-pair
// layouts p[0..3] consumed by filter_8x1_pixels. The xAxB comments name the
// first source pixel of the window held in each 128-bit lane.
static inline void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  // Rotate dwords 2..5 into both lanes so sf0/sf1 can reach pixels 4..11.
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
321 
322 // Note:
323 //  Shared by 8x2 and 16x1 block
// Note:
//  Shared by 8x2 and 16x1 block
// Packs two 16-pixel inputs (two rows of 8x2, or the two halves of a 16x1
// row) into the eight-register layout used by the 8-tap filter: x[0..3]
// feed one filter_8x1_pixels call, x[4..7] the other. Immediate 0x20
// combines the low lanes of its operands, 0x31 the high lanes.
static inline void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  // The x2x4/x3x5 windows serve both halves of the output.
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}
338 
// Packs a single 8-wide row for one filter_8x1_pixels call (used for the
// trailing row of an odd-height block). Immediate 0x30 combines the low
// lane of the first operand with the high lane of the second.
static inline void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}
349 
// Loads two consecutive rows and hands them to the shared 16-pixel packer.
static inline void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&row0, &row1, x);
}
357 
// Loads one 16-wide row as two overlapping halves eight pixels apart and
// hands them to the shared 16-pixel packer.
static inline void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  const __m256i left = _mm256_loadu_si256((const __m256i *)src);
  const __m256i right = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&left, &right, x);
}
364 
365 // Note:
366 //  Shared by horizontal and vertical filtering
// Note:
//  Shared by horizontal and vertical filtering
// Duplicates the eight 16-bit taps into both 128-bit lanes, then broadcasts
// each consecutive tap pair across a whole register: f[k] holds taps
// (2k, 2k+1) in every 32-bit element.
static inline void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m256i taps2 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(taps), taps, 1);
  f[0] = _mm256_shuffle_epi8(taps2, _mm256_set1_epi32(0x03020100));
  f[1] = _mm256_shuffle_epi8(taps2, _mm256_set1_epi32(0x07060504));
  f[2] = _mm256_shuffle_epi8(taps2, _mm256_set1_epi32(0x0b0a0908));
  f[3] = _mm256_shuffle_epi8(taps2, _mm256_set1_epi32(0x0f0e0d0c));
}
379 
// Broadcasts the middle taps of an 8-tap-aligned 4-tap kernel: f[0] gets
// taps 2/3 and f[1] taps 4/5, replicated across every 32-bit element.
static inline void pack_filters_4tap(const int16_t *filter,
                                     __m256i *f /*f[4]*/) {
  const __m256i all_taps =
      _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)filter));

  f[0] = _mm256_shuffle_epi32(all_taps, 0x55);  // coeffs 2 3 2 3 2 3 2 3
  f[1] = _mm256_shuffle_epi32(all_taps, 0xaa);  // coeffs 4 5 4 5 4 5 4 5
}
390 
// Applies the packed 8-tap filter `fil` to packed signal `sig`, writing
// eight rounded 32-bit results to *y.
static inline void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  // NOTE(review): the two middle products are accumulated as min then max
  // instead of a single add — the sum is identical, but the ordering of the
  // partial sums differs; presumably this avoids transient int32 overflow
  // with 12-bit input. Confirm against the scalar reference before
  // simplifying.
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  {
    // Round and shift down by the fixed convolution precision.
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
417 
// Packs eight 32-bit results to unsigned 16-bit, clamps against *mask and
// stores one 8-pixel row.
static inline void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i lo = _mm256_castsi256_si128(*y);
  const __m128i hi = _mm256_extractf128_si256(*y, 1);
  const __m128i clamped =
      _mm_min_epi16(_mm_packus_epi32(lo, hi), _mm256_castsi256_si128(*mask));
  _mm_storeu_si128((__m128i *)dst, clamped);
}
426 
// Packs, clamps and stores two 8-pixel rows; after the pack, row 0 sits in
// the low 128-bit lane and row 1 in the high lane.
static inline void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  const __m256i out = _mm256_min_epi16(_mm256_packus_epi32(*y0, *y1), *mask);
  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(out));
  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(out, 1));
}
435 
// Packs two 8x32-bit result registers to unsigned 16-bit, clamps against
// *mask and stores all 16 pixels with a single 256-bit write.
static inline void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  const __m256i clamped =
      _mm256_min_epi16(_mm256_packus_epi32(*y0, *y1), *mask);
  _mm256_storeu_si256((__m256i *)dst, clamped);
}
442 
// 8-tap horizontal filter over an 8-wide column strip. Rows are processed
// in pairs; a trailing row of an odd-height block is handled after the loop.
static void aom_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  // Clamp limit for the output bit depth.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  // Center the 8-tap window on the output pixel.
  src_ptr -= 3;
  do {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);
    filter_8x1_pixels(&signal[4], ff, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: filter and store the one remaining row.
  if (height > 0) {
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
469 
// 8-tap horizontal filter over a 16-wide column strip, one row per loop
// iteration (the 16 outputs are computed as two 8-pixel halves).
static void aom_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i sig[8], lo_res, hi_res;

  pack_filters(filter, taps);

  // Center the 8-tap window on the output pixel.
  src_ptr -= 3;
  do {
    pack_16x1_pixels(src_ptr, sig);
    filter_8x1_pixels(sig, taps, &lo_res);
    filter_8x1_pixels(&sig[4], taps, &hi_res);
    store_16x1_pixels(&lo_res, &hi_res, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
490 
// 4-tap horizontal filter over a 4-wide column strip, two rows per loop
// iteration with a scalar tail for odd heights. The 4-tap coefficients sit
// in taps 2..5 of the 8-tap-aligned kernel, hence the -3 base adjustment
// combined with the +2 load offsets below.
static void aom_highbd_filter_block1d4_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i;
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                            7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                            4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;
  // NOTE(review): `height` is unsigned, so `height - 2` wraps for height < 2;
  // this loop assumes callers always pass height >= 2 — confirm.
  for (i = 0; i <= (height - 2); i += 2) {
    // One 4-pixel row per 128-bit lane.
    __m256i row0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    __m256i row1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));

    s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    s[1] = _mm256_alignr_epi8(s[0], s[0], 4);

    s[0] = _mm256_shuffle_epi8(s[0], mask);
    s[1] = _mm256_shuffle_epi8(s[1], mask);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    // Pack to 16 bits and clamp to [0, max] for the bit depth.
    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }
  // Odd height: filter the one remaining row.
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
    const __m256i row0_1 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
  }
}
561 
// 4-tap horizontal filter over an 8-wide column strip, two rows per loop
// iteration with a tail for odd heights. Even and odd output pixels are
// filtered separately and re-interleaved via shuffle_mask. As in the 4-wide
// version, the 4-tap coefficients sit in taps 2..5 of the 8-tap-aligned
// kernel (src_ptr -= 3 combined with +2 load offsets).
static void aom_highbd_filter_block1d8_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
  __m256i ff[2], s[2];
  uint32_t i = 0;
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  // Interleaves the separately-computed even and odd results back into
  // pixel order.
  static const uint8_t shuffle_mask[32] = { 0, 1, 8,  9,  2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15,
                                            0, 1, 8,  9,  2, 3, 10, 11,
                                            4, 5, 12, 13, 6, 7, 14, 15 };

  __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
  __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
  __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);

  pack_filters_4tap(filter, ff);
  src_ptr -= 3;

  /* Horizontal filter */

  // NOTE(review): `height` is unsigned, so `height - 2` wraps for
  // height < 2; this loop assumes callers always pass height >= 2 — confirm.
  for (i = 0; i <= (height - 2); i += 2) {
    const __m256i row0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    __m256i row1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);

    // r0 = low halves of both rows, r1 = high halves.
    const __m256i r0 =
        _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
    const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

    // even pixels
    s[0] = r0;
    s[1] = _mm256_alignr_epi8(r1, r0, 4);

    __m256i res_even = convolve_4tap(s, ff);
    res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
                                 CONV8_ROUNDING_BITS);

    // odd pixels
    s[0] = _mm256_alignr_epi8(r1, r0, 2);
    s[1] = _mm256_alignr_epi8(r1, r0, 6);

    __m256i res_odd = convolve_4tap(s, ff);
    res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
                                CONV8_ROUNDING_BITS);

    // Pack to 16 bits, restore pixel order, clamp to [0, max].
    __m256i res = _mm256_packs_epi32(res_even, res_odd);
    res = _mm256_shuffle_epi8(res, mask);

    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                     _mm256_extracti128_si256(res, 1));
  }

  // Odd height: filter the one remaining row (stored as two 4-pixel halves).
  if (height % 2 != 0) {
    i = height - 1;
    const __m256i row0_0 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
    const __m256i row0_1 =
        _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);

    const __m256i r0 =
        _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);

    s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
    s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);

    __m256i res = convolve_4tap(s, ff);
    res =
        _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);

    res = _mm256_packs_epi32(res, res);
    res = _mm256_min_epi16(res, clip_pixel);
    res = _mm256_max_epi16(res, zero);

    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                     _mm256_castsi256_si128(res));
    _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
                     _mm256_extracti128_si256(res, 1));
  }
}
651 
// 4-tap horizontal filter for a 16-wide strip: the block is split into two
// independent 8-wide halves handled by the 8-wide kernel.
static void aom_highbd_filter_block1d16_h4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const uint16_t *const right_src = src_ptr + 8;
  uint16_t *const right_dst = dst_ptr + 8;
  aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                     height, filter, bd);
  aom_highbd_filter_block1d8_h4_avx2(right_src, src_pitch, right_dst,
                                     dst_pitch, height, filter, bd);
}
660 
661 // -----------------------------------------------------------------------------
662 // 2-tap horizontal filtering
663 
// Broadcasts the two center taps of the 8-tap-aligned 2-tap (bilinear)
// kernel — 16-bit taps 3 and 4, i.e. bytes 6..9 — into every 32-bit element
// of both lanes.
static inline void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  const __m256i taps2 =
      _mm256_insertf128_si256(_mm256_castsi128_si256(taps), taps, 1);
  f[0] = _mm256_shuffle_epi8(taps2, _mm256_set1_epi32(0x09080706));
}
670 
671 // can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
672 // the difference is s0/s1 specifies first and second rows or,
673 // first 16 samples and 8-sample shifted 16 samples
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is s0/s1 specifies first and second rows or,
// first 16 samples and 8-sample shifted 16 samples
// Gathers the adjacent-pixel pairs needed by the 2-tap filter: sig[0] holds
// the directly shuffled pairs, sig[1] the pairs from the dword-rotated
// registers (middle pixels).
static inline void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  // Rotate dwords 2..5 into both lanes so sf2 can reach the middle pixels.
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  // 0x20 keeps the low 128-bit lanes of both inputs.
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}
687 
// Pack two consecutive 8-wide rows for the horizontal 2-tap filter.
static inline void pack_8x2_2t_pixels(const uint16_t *src,
                                      const ptrdiff_t pitch, __m256i *sig) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  pack_16_2t_pixels(&row0, &row1, sig);
}
694 
// Pack one 16-wide row for the horizontal 2-tap filter: the first 16 samples
// and the same row shifted by 8 samples.
static inline void pack_16x1_2t_pixels(const uint16_t *src,
                                       __m256i *sig /*sig[2]*/) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)src);
  const __m256i row_shift8 = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_2t_pixels(&row, &row_shift8, sig);
}
701 
pack_8x1_2t_pixels(const uint16_t * src,__m256i * sig)702 static inline void pack_8x1_2t_pixels(const uint16_t *src,
703                                       __m256i *sig /*sig[2]*/) {
704   const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
705   const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
706   __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
707   __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
708   r0 = _mm256_permutevar8x32_epi32(r0, idx);
709   r0 = _mm256_shuffle_epi8(r0, sf2);
710   sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
711 }
712 
713 // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
filter_16_2t_pixels(const __m256i * sig,const __m256i * f,__m256i * y0,__m256i * y1)714 static inline void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
715                                        __m256i *y0, __m256i *y1) {
716   const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
717   __m256i x0 = _mm256_madd_epi16(sig[0], *f);
718   __m256i x1 = _mm256_madd_epi16(sig[1], *f);
719   x0 = _mm256_add_epi32(x0, rounding);
720   x1 = _mm256_add_epi32(x1, rounding);
721   *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
722   *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
723 }
724 
filter_8x1_2t_pixels(const __m256i * sig,const __m256i * f,__m256i * y0)725 static inline void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
726                                         __m256i *y0) {
727   const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
728   __m256i x0 = _mm256_madd_epi16(sig[0], *f);
729   x0 = _mm256_add_epi32(x0, rounding);
730   *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
731 }
732 
// Horizontal 2-tap filter over an 8-pixel-wide column.  Processes two rows
// per iteration, with a single-row tail for odd heights.
static void aom_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  // Clamp limit for the given bit depth: (1 << bd) - 1.
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff;
  pack_2t_filter(filter, &ff);

  // pack_2t_filter() selects taps 3 and 4 of the 8-tap filter array, so back
  // the source pointer up by 3 to keep the kernel centered on the output.
  src_ptr -= 3;
  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: one row remains after the paired loop.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
758 
// Horizontal 2-tap filter over a 16-pixel-wide column, one row per iteration.
static void aom_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i packed[2];
  __m256i sum0, sum1;
  __m256i ff;

  pack_2t_filter(filter, &ff);

  // The 2-tap kernel lives at indices 3/4 of the 8-tap array; compensate.
  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, packed);
    filter_16_2t_pixels(packed, &ff, &sum0, &sum1);
    store_16x1_pixels(&sum0, &sum1, &max, dst_ptr);
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
778 
779 // -----------------------------------------------------------------------------
780 // Vertical Filtering
781 
// Prime the 8-wide vertical 8-tap window: load rows 0..6 and build the
// interleaved row-pair registers.  On return, sig[k] (k = 0..2) holds
// unpacklo of rows (2k,2k+1) in its low lane and rows (2k+1,2k+2) in its
// high lane; sig[k+4] holds the matching unpackhi halves; sig[8] caches
// row 6 for the next pack_8x9_pixels() call.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // Place each row's successor in the upper 128-bit lane so the unpacks
  // below interleave two consecutive row pairs at once.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
812 
// Shift two new rows into the 8-wide vertical window: loads base + 7 and
// base + 8, completes the fourth interleaved pair (sig[3]/sig[7]), and
// caches the newest row in sig[8] for the next call.
static inline void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  // s2 = [row6 | row7], s3 = [row7 | row8] (sig[8] was row 6).
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}
827 
// Run the 8-tap filter on both halves of the interleaved 8-wide window.
static inline void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
                                     __m256i *y0, __m256i *y1) {
  filter_8x1_pixels(sig + 0, f, y0);
  filter_8x1_pixels(sig + 4, f, y1);
}
833 
// Slide both 4-entry interleaved windows down by one register:
// sig[0..2] <- sig[1..3] and sig[4..6] <- sig[5..7].
static inline void update_pixels(__m256i *sig) {
  memmove(&sig[0], &sig[1], 3 * sizeof(sig[0]));
  memmove(&sig[4], &sig[5], 3 * sizeof(sig[0]));
}
841 
// 8-tap vertical filter on an 8-pixel-wide column; two output rows per
// iteration using a sliding 9-row window.
// NOTE(review): height appears to be assumed even — an odd height would wrap
// the unsigned counter and over-run.  Confirm against the callers.
static void aom_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  // signal[0..7]: interleaved row-pair window; signal[8]: last loaded row.
  __m256i signal[9], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_8x9_init(src_ptr, src_pitch, signal);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, signal);

    filter_8x9_pixels(signal, ff, &res0, &res1);
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
865 
// Prime the 16-wide vertical 8-tap window: load rows 0..6 and build the
// interleaved row-pair registers.  sig[0..7] cover the low 8 pixels of each
// row, sig[8..15] the high 8 pixels; sig[16] caches row 6 for the next
// pack_16x9_pixels() call.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  // Regroup 128-bit lanes so u0/u2 hold the low 8 pixels of consecutive
  // rows and u1/u3 hold the high 8 pixels.
  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}
915 
// Shift two new rows (base + 7 and base + 8) into the 16-wide vertical
// window; sig[16] carries the previously loaded row (row 6) between calls.
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  // u0/u2: low 8 pixels of rows (6,7) and (7,8); u1/u3: the high 8 pixels.
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}
938 
// Apply the 8-tap filter to all four interleaved groups of the 16-wide
// window, then reassemble the 32-bit sums into two 16-pixel output rows.
static inline void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    // Pack with unsigned saturation, then regroup 128-bit lanes so *y0 and
    // *y1 each hold one complete output row.
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}
954 
// Clamp two 16-pixel rows to the bit-depth maximum and write them out.
static inline void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst,
                                     ptrdiff_t pitch) {
  const __m256i row0 = _mm256_min_epi16(*y0, *mask);
  const __m256i row1 = _mm256_min_epi16(*y1, *mask);
  _mm256_storeu_si256((__m256i *)dst, row0);
  _mm256_storeu_si256((__m256i *)(dst + pitch), row1);
}
963 
// Slide both 8-entry banks of the 16-wide window down by one register.
static void update_16x9_pixels(__m256i *sig) {
  update_pixels(sig);      // bank for the low 8 pixels
  update_pixels(sig + 8);  // bank for the high 8 pixels
}
968 
// 8-tap vertical filter on a 16-pixel-wide column; two output rows per
// iteration using a sliding 9-row window.
// NOTE(review): height appears to be assumed even — an odd height would wrap
// the unsigned counter and over-run.  Confirm against the callers.
static void aom_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  // signal[0..7]: window for the low 8 pixels, signal[8..15]: high 8 pixels,
  // signal[16]: last loaded row.
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height > 0);
}
991 
// 4-tap vertical filter on a 4-pixel-wide column, two output rows per
// iteration.  Loads start at data + 2 * src_pitch: the 4-tap kernel
// presumably occupies entries 2..5 of the 8-tap filter array (see
// pack_filters_4tap) — TODO confirm.
// NOTE(review): height appears to be assumed even.
static void aom_highbd_filter_block1d4_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Per-bit-depth clamp limit kept in 32-bit lanes.
  const __m256i clip_pixel =
      _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  uint32_t i;
  __m256i s[2], ff[2];

  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Prime the sliding window with rows 2..4.
    __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    // Interleaved pairs: rows (2,3) in the low lane, (3,4) in the high lane.
    s[0] = _mm256_unpacklo_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);

      const __m256i res_a = convolve_4tap(s, ff);

      // Round and shift down to pixel precision.
      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // Clamp to [0, clip_pixel] while still in 32-bit lanes, then pack.
      __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
      res_16bit = _mm256_max_epi32(res_16bit, zero);
      res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);

      // Low lane -> output row i, high lane -> output row i + 1.
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Slide the window down two rows.
      s[0] = s[1];
      s4 = s6;
    }
  }
}
1051 
// 4-tap vertical filter on an 8-pixel-wide column, two output rows per
// iteration.  Same sliding-window scheme as the 4-wide variant, but both
// the low (s[0]/s[1]) and high (s[2]/s[3]) halves of each row are filtered.
// NOTE(review): height appears to be assumed even.
static void aom_highbd_filter_block1d8_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const int bits = FILTER_BITS;

  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
  // Per-bit-depth clamp limit, in 16-bit lanes here (clamping happens after
  // the 32-bit results are packed down).
  const __m256i clip_pixel =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();
  __m256i s[4], ff[2];
  uint32_t i;
  pack_filters_4tap(filter, ff);

  const uint16_t *data = src_ptr;
  /* Vertical filter */
  {
    // Prime the sliding window with rows 2..4.
    __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
    __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));

    __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);

    __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));

    __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);

    s[0] = _mm256_unpacklo_epi16(s23, s34);
    s[2] = _mm256_unpackhi_epi16(s23, s34);

    for (i = 0; i < height; i += 2) {
      data = &src_ptr[i * src_pitch];

      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));

      __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
      __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);

      s[1] = _mm256_unpacklo_epi16(s45, s56);
      s[3] = _mm256_unpackhi_epi16(s45, s56);

      // Low 4 pixels of both output rows.
      const __m256i res_a = convolve_4tap(s, ff);

      __m256i res_a_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);

      // High 4 pixels of both output rows.
      const __m256i res_b = convolve_4tap(s + 2, ff);
      __m256i res_b_round = _mm256_sra_epi32(
          _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);

      // Pack to 16 bits, then clamp to [0, clip_pixel].
      __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
      res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
      res_16bit = _mm256_max_epi16(res_16bit, zero);

      // Low lane -> output row i, high lane -> output row i + 1.
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
                       _mm256_castsi256_si128(res_16bit));
      _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
                       _mm256_extracti128_si256(res_16bit, 1));

      // Slide the window down two rows.
      s[0] = s[1];
      s[2] = s[3];
      s4 = s6;
    }
  }
}
1117 
// 16-wide 4-tap vertical filter, realized as two independent 8-wide runs.
static void aom_highbd_filter_block1d16_v4_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  // The two halves write disjoint dst columns, so their order is immaterial.
  aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
                                     dst_pitch, height, filter, bd);
  aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
                                     height, filter, bd);
}
1127 
1128 // -----------------------------------------------------------------------------
1129 // 2-tap vertical filtering
1130 
// Prime the 16-wide vertical 2-tap pipeline with the first source row.
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  const __m256i first_row = _mm256_loadu_si256((const __m256i *)src);
  sig[2] = first_row;
}
1134 
// Interleave the cached previous row (sig[2]) with the next row, and carry
// the new row forward for the following iteration.
static inline void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
                                       __m256i *sig) {
  const __m256i next_row = _mm256_loadu_si256((const __m256i *)(src + pitch));
  sig[0] = _mm256_unpacklo_epi16(sig[2], next_row);
  sig[1] = _mm256_unpackhi_epi16(sig[2], next_row);
  sig[2] = next_row;
}
1143 
// Thin alias: the vertical 16x2 case uses the same multiply/round/shift as
// the horizontal 2-tap path.
static inline void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
                                         __m256i *y0, __m256i *y1) {
  filter_16_2t_pixels(sig, f, y0, y1);
}
1148 
// 2-tap vertical filter on a 16-pixel-wide column, one row per iteration.
static void aom_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i win[3];
  __m256i sum0, sum1;
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, win);

  do {
    // win[2] carries the previous row across iterations.
    pack_16x2_2t_pixels(src_ptr, src_pitch, win);
    filter_16x2_2t_pixels(win, &ff, &sum0, &sum1);
    store_16x1_pixels(&sum0, &sum1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height > 0);
}
1169 
// 128-bit variant of pack_2t_filter(): broadcast taps 3 and 4 of the 8-tap
// filter array into every 32-bit lane.
static inline void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  // Byte pattern 6,7,8,9 repeated: picks the 16-bit taps f3 and f4.
  const __m128i pick_f3f4 = _mm_set1_epi32(0x09080706);
  f[0] = _mm_shuffle_epi8(taps, pick_f3f4);
}
1175 
// Prime the 8-wide vertical 2-tap pipeline with the first source row.
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  const __m128i first_row = _mm_loadu_si128((const __m128i *)src);
  sig[2] = first_row;
}
1179 
// Interleave the cached previous row (sig[2]) with the next row, and carry
// the new row forward for the following iteration.
static inline void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
                                          __m128i *sig) {
  const __m128i next_row = _mm_loadu_si128((const __m128i *)(src + pitch));
  sig[0] = _mm_unpacklo_epi16(sig[2], next_row);
  sig[1] = _mm_unpackhi_epi16(sig[2], next_row);
  sig[2] = next_row;
}
1188 
filter_8_2t_pixels(const __m128i * sig,const __m128i * f,__m128i * y0,__m128i * y1)1189 static inline void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
1190                                       __m128i *y0, __m128i *y1) {
1191   const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
1192   __m128i x0 = _mm_madd_epi16(sig[0], *f);
1193   __m128i x1 = _mm_madd_epi16(sig[1], *f);
1194   x0 = _mm_add_epi32(x0, rounding);
1195   x1 = _mm_add_epi32(x1, rounding);
1196   *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
1197   *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
1198 }
1199 
// Pack the two 32-bit half-results with unsigned saturation, clamp to the
// bit-depth maximum, and store one 8-pixel row.
static inline void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
                                           const __m128i *mask, uint16_t *dst) {
  const __m128i row = _mm_min_epi16(_mm_packus_epi32(*y0, *y1), *mask);
  _mm_storeu_si128((__m128i *)dst, row);
}
1206 
// 2-tap vertical filter on an 8-pixel-wide column, one row per iteration.
static void aom_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  // signal[0..1]: interleaved row pair, signal[2]: cached previous row.
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;
  } while (height > 0);
}
1227 
// 4-wide kernels have no dedicated AVX2 implementation; reuse the SSE2
// versions via the aliases below.
void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2

// Instantiate the public highbd 1-D convolve entry points from the block1d
// kernels above (macro presumably defined in aom_dsp/x86/convolve.h, which
// is included at the top of this file — confirm there for the expansion).
HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)

#undef HIGHBD_FUNC
1249