// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/convolve.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop
18*09537850SAkhilesh Sanikop #if LIBGAV1_TARGETING_AVX2
19*09537850SAkhilesh Sanikop #include <immintrin.h>
20*09537850SAkhilesh Sanikop
21*09537850SAkhilesh Sanikop #include <algorithm>
22*09537850SAkhilesh Sanikop #include <cassert>
23*09537850SAkhilesh Sanikop #include <cstdint>
24*09537850SAkhilesh Sanikop #include <cstring>
25*09537850SAkhilesh Sanikop
26*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
27*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
28*09537850SAkhilesh Sanikop #include "src/dsp/x86/common_avx2.h"
29*09537850SAkhilesh Sanikop #include "src/utils/common.h"
30*09537850SAkhilesh Sanikop #include "src/utils/compiler_attributes.h"
31*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
32*09537850SAkhilesh Sanikop
33*09537850SAkhilesh Sanikop namespace libgav1 {
34*09537850SAkhilesh Sanikop namespace dsp {
35*09537850SAkhilesh Sanikop namespace low_bitdepth {
36*09537850SAkhilesh Sanikop namespace {
37*09537850SAkhilesh Sanikop
38*09537850SAkhilesh Sanikop #include "src/dsp/x86/convolve_sse4.inc"
39*09537850SAkhilesh Sanikop
// Multiplies each vector in |src[]| by the matching vector in |taps[]| with
// _mm256_maddubs_epi16() and adds the partial products as 16-bit lanes. The
// filters in |taps[]| are pre-shifted by 1, which prevents the final sum from
// outranging int16_t.
//
// |num_taps| selects how many vector pairs are consumed: 2 taps read one
// pair, 4 taps read two, 6 taps read three and 8 taps read four. Any other
// value is treated like 4 taps.
template <int num_taps>
__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
  // Every filter size consumes at least the first pair.
  const __m256i product0 = _mm256_maddubs_epi16(src[0], taps[0]);
  if (num_taps == 2) {
    // 2 taps: k4k3 only.
    return product0;
  }

  const __m256i product1 = _mm256_maddubs_epi16(src[1], taps[1]);
  if (num_taps == 6) {
    // 6 taps: (k2k1 + k4k3) + k6k5.
    const __m256i product2 = _mm256_maddubs_epi16(src[2], taps[2]);
    return _mm256_add_epi16(_mm256_add_epi16(product0, product1), product2);
  }
  if (num_taps == 8) {
    // 8 taps: (k5k4 + k7k6) + (k1k0 + k3k2).
    const __m256i product2 = _mm256_maddubs_epi16(src[2], taps[2]);
    const __m256i product3 = _mm256_maddubs_epi16(src[3], taps[3]);
    const __m256i sum_3210 = _mm256_add_epi16(product0, product1);
    const __m256i sum_7654 = _mm256_add_epi16(product2, product3);
    return _mm256_add_epi16(sum_7654, sum_3210);
  }

  // 4 taps: k3k2 + k5k4.
  return _mm256_add_epi16(product0, product1);
}
73*09537850SAkhilesh Sanikop
74*09537850SAkhilesh Sanikop template <int num_taps>
SumHorizontalTaps(const __m256i * const src,const __m256i * const v_tap)75*09537850SAkhilesh Sanikop __m256i SumHorizontalTaps(const __m256i* const src,
76*09537850SAkhilesh Sanikop const __m256i* const v_tap) {
77*09537850SAkhilesh Sanikop __m256i v_src[4];
78*09537850SAkhilesh Sanikop const __m256i src_long = *src;
79*09537850SAkhilesh Sanikop const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
80*09537850SAkhilesh Sanikop const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
81*09537850SAkhilesh Sanikop
82*09537850SAkhilesh Sanikop if (num_taps == 6) {
83*09537850SAkhilesh Sanikop // 6 taps.
84*09537850SAkhilesh Sanikop v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
85*09537850SAkhilesh Sanikop v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
86*09537850SAkhilesh Sanikop v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
87*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
88*09537850SAkhilesh Sanikop // 8 taps.
89*09537850SAkhilesh Sanikop v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
90*09537850SAkhilesh Sanikop v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
91*09537850SAkhilesh Sanikop v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
92*09537850SAkhilesh Sanikop v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
93*09537850SAkhilesh Sanikop } else if (num_taps == 2) {
94*09537850SAkhilesh Sanikop // 2 taps.
95*09537850SAkhilesh Sanikop v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
96*09537850SAkhilesh Sanikop } else {
97*09537850SAkhilesh Sanikop // 4 taps.
98*09537850SAkhilesh Sanikop v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
99*09537850SAkhilesh Sanikop v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
100*09537850SAkhilesh Sanikop }
101*09537850SAkhilesh Sanikop return SumOnePassTaps<num_taps>(v_src, v_tap);
102*09537850SAkhilesh Sanikop }
103*09537850SAkhilesh Sanikop
104*09537850SAkhilesh Sanikop template <int num_taps>
SimpleHorizontalTaps(const __m256i * const src,const __m256i * const v_tap)105*09537850SAkhilesh Sanikop __m256i SimpleHorizontalTaps(const __m256i* const src,
106*09537850SAkhilesh Sanikop const __m256i* const v_tap) {
107*09537850SAkhilesh Sanikop __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
108*09537850SAkhilesh Sanikop
109*09537850SAkhilesh Sanikop // Normally the Horizontal pass does the downshift in two passes:
110*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
111*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
112*09537850SAkhilesh Sanikop // requires adding the rounding offset from the skipped shift.
113*09537850SAkhilesh Sanikop constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
114*09537850SAkhilesh Sanikop
115*09537850SAkhilesh Sanikop sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
116*09537850SAkhilesh Sanikop sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
117*09537850SAkhilesh Sanikop return _mm256_packus_epi16(sum, sum);
118*09537850SAkhilesh Sanikop }
119*09537850SAkhilesh Sanikop
120*09537850SAkhilesh Sanikop template <int num_taps>
HorizontalTaps8To16(const __m256i * const src,const __m256i * const v_tap)121*09537850SAkhilesh Sanikop __m256i HorizontalTaps8To16(const __m256i* const src,
122*09537850SAkhilesh Sanikop const __m256i* const v_tap) {
123*09537850SAkhilesh Sanikop const __m256i sum = SumHorizontalTaps<num_taps>(src, v_tap);
124*09537850SAkhilesh Sanikop
125*09537850SAkhilesh Sanikop return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
126*09537850SAkhilesh Sanikop }
127*09537850SAkhilesh Sanikop
128*09537850SAkhilesh Sanikop // Filter 2xh sizes.
129*09537850SAkhilesh Sanikop template <int num_taps, bool is_2d = false, bool is_compound = false>
FilterHorizontal(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t pred_stride,const int,const int height,const __m128i * const v_tap)130*09537850SAkhilesh Sanikop void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
131*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
132*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
133*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride, const int /*width*/,
134*09537850SAkhilesh Sanikop const int height, const __m128i* const v_tap) {
135*09537850SAkhilesh Sanikop auto* dest8 = static_cast<uint8_t*>(dest);
136*09537850SAkhilesh Sanikop auto* dest16 = static_cast<uint16_t*>(dest);
137*09537850SAkhilesh Sanikop
138*09537850SAkhilesh Sanikop // Horizontal passes only need to account for |num_taps| 2 and 4 when
139*09537850SAkhilesh Sanikop // |width| <= 4.
140*09537850SAkhilesh Sanikop assert(num_taps <= 4);
141*09537850SAkhilesh Sanikop if (num_taps <= 4) {
142*09537850SAkhilesh Sanikop if (!is_compound) {
143*09537850SAkhilesh Sanikop int y = height;
144*09537850SAkhilesh Sanikop if (is_2d) y -= 1;
145*09537850SAkhilesh Sanikop do {
146*09537850SAkhilesh Sanikop if (is_2d) {
147*09537850SAkhilesh Sanikop const __m128i sum =
148*09537850SAkhilesh Sanikop HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
149*09537850SAkhilesh Sanikop Store4(&dest16[0], sum);
150*09537850SAkhilesh Sanikop dest16 += pred_stride;
151*09537850SAkhilesh Sanikop Store4(&dest16[0], _mm_srli_si128(sum, 8));
152*09537850SAkhilesh Sanikop dest16 += pred_stride;
153*09537850SAkhilesh Sanikop } else {
154*09537850SAkhilesh Sanikop const __m128i sum =
155*09537850SAkhilesh Sanikop SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
156*09537850SAkhilesh Sanikop Store2(dest8, sum);
157*09537850SAkhilesh Sanikop dest8 += pred_stride;
158*09537850SAkhilesh Sanikop Store2(dest8, _mm_srli_si128(sum, 4));
159*09537850SAkhilesh Sanikop dest8 += pred_stride;
160*09537850SAkhilesh Sanikop }
161*09537850SAkhilesh Sanikop
162*09537850SAkhilesh Sanikop src += src_stride << 1;
163*09537850SAkhilesh Sanikop y -= 2;
164*09537850SAkhilesh Sanikop } while (y != 0);
165*09537850SAkhilesh Sanikop
166*09537850SAkhilesh Sanikop // The 2d filters have an odd |height| because the horizontal pass
167*09537850SAkhilesh Sanikop // generates context for the vertical pass.
168*09537850SAkhilesh Sanikop if (is_2d) {
169*09537850SAkhilesh Sanikop assert(height % 2 == 1);
170*09537850SAkhilesh Sanikop __m128i sum;
171*09537850SAkhilesh Sanikop const __m128i input = LoadLo8(&src[2]);
172*09537850SAkhilesh Sanikop if (num_taps == 2) {
173*09537850SAkhilesh Sanikop // 03 04 04 05 05 06 06 07 ....
174*09537850SAkhilesh Sanikop const __m128i v_src_43 =
175*09537850SAkhilesh Sanikop _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
176*09537850SAkhilesh Sanikop sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
177*09537850SAkhilesh Sanikop } else {
178*09537850SAkhilesh Sanikop // 02 03 03 04 04 05 05 06 06 07 ....
179*09537850SAkhilesh Sanikop const __m128i v_src_32 =
180*09537850SAkhilesh Sanikop _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
181*09537850SAkhilesh Sanikop // 04 05 05 06 06 07 07 08 ...
182*09537850SAkhilesh Sanikop const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
183*09537850SAkhilesh Sanikop const __m128i v_madd_32 =
184*09537850SAkhilesh Sanikop _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
185*09537850SAkhilesh Sanikop const __m128i v_madd_54 =
186*09537850SAkhilesh Sanikop _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
187*09537850SAkhilesh Sanikop sum = _mm_add_epi16(v_madd_54, v_madd_32);
188*09537850SAkhilesh Sanikop }
189*09537850SAkhilesh Sanikop sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
190*09537850SAkhilesh Sanikop Store4(dest16, sum);
191*09537850SAkhilesh Sanikop }
192*09537850SAkhilesh Sanikop }
193*09537850SAkhilesh Sanikop }
194*09537850SAkhilesh Sanikop }
195*09537850SAkhilesh Sanikop
196*09537850SAkhilesh Sanikop // Filter widths >= 4.
197*09537850SAkhilesh Sanikop template <int num_taps, bool is_2d = false, bool is_compound = false>
FilterHorizontal(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t pred_stride,const int width,const int height,const __m256i * const v_tap)198*09537850SAkhilesh Sanikop void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
199*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
200*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
201*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride, const int width,
202*09537850SAkhilesh Sanikop const int height, const __m256i* const v_tap) {
203*09537850SAkhilesh Sanikop auto* dest8 = static_cast<uint8_t*>(dest);
204*09537850SAkhilesh Sanikop auto* dest16 = static_cast<uint16_t*>(dest);
205*09537850SAkhilesh Sanikop
206*09537850SAkhilesh Sanikop if (width >= 32) {
207*09537850SAkhilesh Sanikop int y = height;
208*09537850SAkhilesh Sanikop do {
209*09537850SAkhilesh Sanikop int x = 0;
210*09537850SAkhilesh Sanikop do {
211*09537850SAkhilesh Sanikop if (is_2d || is_compound) {
212*09537850SAkhilesh Sanikop // Load into 2 128 bit lanes.
213*09537850SAkhilesh Sanikop const __m256i src_long =
214*09537850SAkhilesh Sanikop SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
215*09537850SAkhilesh Sanikop const __m256i result =
216*09537850SAkhilesh Sanikop HorizontalTaps8To16<num_taps>(&src_long, v_tap);
217*09537850SAkhilesh Sanikop const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
218*09537850SAkhilesh Sanikop LoadUnaligned16(&src[x + 24]));
219*09537850SAkhilesh Sanikop const __m256i result2 =
220*09537850SAkhilesh Sanikop HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
221*09537850SAkhilesh Sanikop if (is_2d) {
222*09537850SAkhilesh Sanikop StoreAligned32(&dest16[x], result);
223*09537850SAkhilesh Sanikop StoreAligned32(&dest16[x + 16], result2);
224*09537850SAkhilesh Sanikop } else {
225*09537850SAkhilesh Sanikop StoreUnaligned32(&dest16[x], result);
226*09537850SAkhilesh Sanikop StoreUnaligned32(&dest16[x + 16], result2);
227*09537850SAkhilesh Sanikop }
228*09537850SAkhilesh Sanikop } else {
229*09537850SAkhilesh Sanikop // Load src used to calculate dest8[7:0] and dest8[23:16].
230*09537850SAkhilesh Sanikop const __m256i src_long = LoadUnaligned32(&src[x]);
231*09537850SAkhilesh Sanikop const __m256i result =
232*09537850SAkhilesh Sanikop SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
233*09537850SAkhilesh Sanikop // Load src used to calculate dest8[15:8] and dest8[31:24].
234*09537850SAkhilesh Sanikop const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
235*09537850SAkhilesh Sanikop const __m256i result2 =
236*09537850SAkhilesh Sanikop SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
237*09537850SAkhilesh Sanikop // Combine results and store.
238*09537850SAkhilesh Sanikop StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
239*09537850SAkhilesh Sanikop }
240*09537850SAkhilesh Sanikop x += 32;
241*09537850SAkhilesh Sanikop } while (x < width);
242*09537850SAkhilesh Sanikop src += src_stride;
243*09537850SAkhilesh Sanikop dest8 += pred_stride;
244*09537850SAkhilesh Sanikop dest16 += pred_stride;
245*09537850SAkhilesh Sanikop } while (--y != 0);
246*09537850SAkhilesh Sanikop } else if (width == 16) {
247*09537850SAkhilesh Sanikop int y = height;
248*09537850SAkhilesh Sanikop if (is_2d) y -= 1;
249*09537850SAkhilesh Sanikop do {
250*09537850SAkhilesh Sanikop if (is_2d || is_compound) {
251*09537850SAkhilesh Sanikop // Load into 2 128 bit lanes.
252*09537850SAkhilesh Sanikop const __m256i src_long =
253*09537850SAkhilesh Sanikop SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
254*09537850SAkhilesh Sanikop const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
255*09537850SAkhilesh Sanikop const __m256i src_long2 =
256*09537850SAkhilesh Sanikop SetrM128i(LoadUnaligned16(&src[src_stride]),
257*09537850SAkhilesh Sanikop LoadUnaligned16(&src[8 + src_stride]));
258*09537850SAkhilesh Sanikop const __m256i result2 =
259*09537850SAkhilesh Sanikop HorizontalTaps8To16<num_taps>(&src_long2, v_tap);
260*09537850SAkhilesh Sanikop if (is_2d) {
261*09537850SAkhilesh Sanikop StoreAligned32(&dest16[0], result);
262*09537850SAkhilesh Sanikop StoreAligned32(&dest16[pred_stride], result2);
263*09537850SAkhilesh Sanikop } else {
264*09537850SAkhilesh Sanikop StoreUnaligned32(&dest16[0], result);
265*09537850SAkhilesh Sanikop StoreUnaligned32(&dest16[pred_stride], result2);
266*09537850SAkhilesh Sanikop }
267*09537850SAkhilesh Sanikop } else {
268*09537850SAkhilesh Sanikop // Load into 2 128 bit lanes.
269*09537850SAkhilesh Sanikop const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
270*09537850SAkhilesh Sanikop LoadUnaligned16(&src[src_stride]));
271*09537850SAkhilesh Sanikop const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
272*09537850SAkhilesh Sanikop const __m256i src_long2 = SetrM128i(
273*09537850SAkhilesh Sanikop LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
274*09537850SAkhilesh Sanikop const __m256i result2 =
275*09537850SAkhilesh Sanikop SimpleHorizontalTaps<num_taps>(&src_long2, v_tap);
276*09537850SAkhilesh Sanikop const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
277*09537850SAkhilesh Sanikop StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
278*09537850SAkhilesh Sanikop StoreUnaligned16(&dest8[pred_stride],
279*09537850SAkhilesh Sanikop _mm256_extracti128_si256(packed_result, 1));
280*09537850SAkhilesh Sanikop }
281*09537850SAkhilesh Sanikop src += src_stride * 2;
282*09537850SAkhilesh Sanikop dest8 += pred_stride * 2;
283*09537850SAkhilesh Sanikop dest16 += pred_stride * 2;
284*09537850SAkhilesh Sanikop y -= 2;
285*09537850SAkhilesh Sanikop } while (y != 0);
286*09537850SAkhilesh Sanikop
287*09537850SAkhilesh Sanikop // The 2d filters have an odd |height| during the horizontal pass, so
288*09537850SAkhilesh Sanikop // filter the remaining row.
289*09537850SAkhilesh Sanikop if (is_2d) {
290*09537850SAkhilesh Sanikop const __m256i src_long =
291*09537850SAkhilesh Sanikop SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
292*09537850SAkhilesh Sanikop const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
293*09537850SAkhilesh Sanikop StoreAligned32(&dest16[0], result);
294*09537850SAkhilesh Sanikop }
295*09537850SAkhilesh Sanikop
296*09537850SAkhilesh Sanikop } else if (width == 8) {
297*09537850SAkhilesh Sanikop int y = height;
298*09537850SAkhilesh Sanikop if (is_2d) y -= 1;
299*09537850SAkhilesh Sanikop do {
300*09537850SAkhilesh Sanikop // Load into 2 128 bit lanes.
301*09537850SAkhilesh Sanikop const __m128i this_row = LoadUnaligned16(&src[0]);
302*09537850SAkhilesh Sanikop const __m128i next_row = LoadUnaligned16(&src[src_stride]);
303*09537850SAkhilesh Sanikop const __m256i src_long = SetrM128i(this_row, next_row);
304*09537850SAkhilesh Sanikop if (is_2d || is_compound) {
305*09537850SAkhilesh Sanikop const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
306*09537850SAkhilesh Sanikop if (is_2d) {
307*09537850SAkhilesh Sanikop StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
308*09537850SAkhilesh Sanikop StoreAligned16(&dest16[pred_stride],
309*09537850SAkhilesh Sanikop _mm256_extracti128_si256(result, 1));
310*09537850SAkhilesh Sanikop } else {
311*09537850SAkhilesh Sanikop StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result));
312*09537850SAkhilesh Sanikop StoreUnaligned16(&dest16[pred_stride],
313*09537850SAkhilesh Sanikop _mm256_extracti128_si256(result, 1));
314*09537850SAkhilesh Sanikop }
315*09537850SAkhilesh Sanikop } else {
316*09537850SAkhilesh Sanikop const __m128i this_row = LoadUnaligned16(&src[0]);
317*09537850SAkhilesh Sanikop const __m128i next_row = LoadUnaligned16(&src[src_stride]);
318*09537850SAkhilesh Sanikop // Load into 2 128 bit lanes.
319*09537850SAkhilesh Sanikop const __m256i src_long = SetrM128i(this_row, next_row);
320*09537850SAkhilesh Sanikop const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
321*09537850SAkhilesh Sanikop StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
322*09537850SAkhilesh Sanikop StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
323*09537850SAkhilesh Sanikop }
324*09537850SAkhilesh Sanikop src += src_stride * 2;
325*09537850SAkhilesh Sanikop dest8 += pred_stride * 2;
326*09537850SAkhilesh Sanikop dest16 += pred_stride * 2;
327*09537850SAkhilesh Sanikop y -= 2;
328*09537850SAkhilesh Sanikop } while (y != 0);
329*09537850SAkhilesh Sanikop
330*09537850SAkhilesh Sanikop // The 2d filters have an odd |height| during the horizontal pass, so
331*09537850SAkhilesh Sanikop // filter the remaining row.
332*09537850SAkhilesh Sanikop if (is_2d) {
333*09537850SAkhilesh Sanikop const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
334*09537850SAkhilesh Sanikop const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
335*09537850SAkhilesh Sanikop StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
336*09537850SAkhilesh Sanikop }
337*09537850SAkhilesh Sanikop
338*09537850SAkhilesh Sanikop } else { // width == 4
339*09537850SAkhilesh Sanikop int y = height;
340*09537850SAkhilesh Sanikop if (is_2d) y -= 1;
341*09537850SAkhilesh Sanikop do {
342*09537850SAkhilesh Sanikop // Load into 2 128 bit lanes.
343*09537850SAkhilesh Sanikop const __m128i this_row = LoadUnaligned16(&src[0]);
344*09537850SAkhilesh Sanikop const __m128i next_row = LoadUnaligned16(&src[src_stride]);
345*09537850SAkhilesh Sanikop const __m256i src_long = SetrM128i(this_row, next_row);
346*09537850SAkhilesh Sanikop if (is_2d || is_compound) {
347*09537850SAkhilesh Sanikop const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
348*09537850SAkhilesh Sanikop StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
349*09537850SAkhilesh Sanikop StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
350*09537850SAkhilesh Sanikop } else {
351*09537850SAkhilesh Sanikop const __m128i this_row = LoadUnaligned16(&src[0]);
352*09537850SAkhilesh Sanikop const __m128i next_row = LoadUnaligned16(&src[src_stride]);
353*09537850SAkhilesh Sanikop // Load into 2 128 bit lanes.
354*09537850SAkhilesh Sanikop const __m256i src_long = SetrM128i(this_row, next_row);
355*09537850SAkhilesh Sanikop const __m256i result = SimpleHorizontalTaps<num_taps>(&src_long, v_tap);
356*09537850SAkhilesh Sanikop Store4(&dest8[0], _mm256_castsi256_si128(result));
357*09537850SAkhilesh Sanikop Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
358*09537850SAkhilesh Sanikop }
359*09537850SAkhilesh Sanikop src += src_stride * 2;
360*09537850SAkhilesh Sanikop dest8 += pred_stride * 2;
361*09537850SAkhilesh Sanikop dest16 += pred_stride * 2;
362*09537850SAkhilesh Sanikop y -= 2;
363*09537850SAkhilesh Sanikop } while (y != 0);
364*09537850SAkhilesh Sanikop
365*09537850SAkhilesh Sanikop // The 2d filters have an odd |height| during the horizontal pass, so
366*09537850SAkhilesh Sanikop // filter the remaining row.
367*09537850SAkhilesh Sanikop if (is_2d) {
368*09537850SAkhilesh Sanikop const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
369*09537850SAkhilesh Sanikop const __m256i result = HorizontalTaps8To16<num_taps>(&src_long, v_tap);
370*09537850SAkhilesh Sanikop StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
371*09537850SAkhilesh Sanikop }
372*09537850SAkhilesh Sanikop }
373*09537850SAkhilesh Sanikop }
374*09537850SAkhilesh Sanikop
375*09537850SAkhilesh Sanikop template <int num_taps, bool is_2d_vertical = false>
SetupTaps(const __m128i * const filter,__m256i * v_tap)376*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
377*09537850SAkhilesh Sanikop __m256i* v_tap) {
378*09537850SAkhilesh Sanikop if (num_taps == 8) {
379*09537850SAkhilesh Sanikop if (is_2d_vertical) {
380*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastd_epi32(*filter); // k1k0
381*09537850SAkhilesh Sanikop v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
382*09537850SAkhilesh Sanikop v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
383*09537850SAkhilesh Sanikop v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12)); // k7k6
384*09537850SAkhilesh Sanikop } else {
385*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
386*09537850SAkhilesh Sanikop v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
387*09537850SAkhilesh Sanikop v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
388*09537850SAkhilesh Sanikop v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
389*09537850SAkhilesh Sanikop }
390*09537850SAkhilesh Sanikop } else if (num_taps == 6) {
391*09537850SAkhilesh Sanikop if (is_2d_vertical) {
392*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2)); // k2k1
393*09537850SAkhilesh Sanikop v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
394*09537850SAkhilesh Sanikop v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10)); // k6k5
395*09537850SAkhilesh Sanikop } else {
396*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
397*09537850SAkhilesh Sanikop v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
398*09537850SAkhilesh Sanikop v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
399*09537850SAkhilesh Sanikop }
400*09537850SAkhilesh Sanikop } else if (num_taps == 4) {
401*09537850SAkhilesh Sanikop if (is_2d_vertical) {
402*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
403*09537850SAkhilesh Sanikop v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
404*09537850SAkhilesh Sanikop } else {
405*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
406*09537850SAkhilesh Sanikop v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
407*09537850SAkhilesh Sanikop }
408*09537850SAkhilesh Sanikop } else { // num_taps == 2
409*09537850SAkhilesh Sanikop if (is_2d_vertical) {
410*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
411*09537850SAkhilesh Sanikop } else {
412*09537850SAkhilesh Sanikop v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
413*09537850SAkhilesh Sanikop }
414*09537850SAkhilesh Sanikop }
415*09537850SAkhilesh Sanikop }
416*09537850SAkhilesh Sanikop
417*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound>
SimpleSum2DVerticalTaps(const __m256i * const src,const __m256i * const taps)418*09537850SAkhilesh Sanikop __m256i SimpleSum2DVerticalTaps(const __m256i* const src,
419*09537850SAkhilesh Sanikop const __m256i* const taps) {
420*09537850SAkhilesh Sanikop __m256i sum_lo =
421*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]);
422*09537850SAkhilesh Sanikop __m256i sum_hi =
423*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]);
424*09537850SAkhilesh Sanikop if (num_taps >= 4) {
425*09537850SAkhilesh Sanikop __m256i madd_lo =
426*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]);
427*09537850SAkhilesh Sanikop __m256i madd_hi =
428*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]);
429*09537850SAkhilesh Sanikop sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
430*09537850SAkhilesh Sanikop sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
431*09537850SAkhilesh Sanikop if (num_taps >= 6) {
432*09537850SAkhilesh Sanikop madd_lo =
433*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]);
434*09537850SAkhilesh Sanikop madd_hi =
435*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]);
436*09537850SAkhilesh Sanikop sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
437*09537850SAkhilesh Sanikop sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
438*09537850SAkhilesh Sanikop if (num_taps == 8) {
439*09537850SAkhilesh Sanikop madd_lo =
440*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]);
441*09537850SAkhilesh Sanikop madd_hi =
442*09537850SAkhilesh Sanikop _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]);
443*09537850SAkhilesh Sanikop sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
444*09537850SAkhilesh Sanikop sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
445*09537850SAkhilesh Sanikop }
446*09537850SAkhilesh Sanikop }
447*09537850SAkhilesh Sanikop }
448*09537850SAkhilesh Sanikop
449*09537850SAkhilesh Sanikop if (is_compound) {
450*09537850SAkhilesh Sanikop return _mm256_packs_epi32(
451*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
452*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi,
453*09537850SAkhilesh Sanikop kInterRoundBitsCompoundVertical - 1));
454*09537850SAkhilesh Sanikop }
455*09537850SAkhilesh Sanikop
456*09537850SAkhilesh Sanikop return _mm256_packs_epi32(
457*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
458*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
459*09537850SAkhilesh Sanikop }
460*09537850SAkhilesh Sanikop
461*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
Filter2DVertical16xH(const uint16_t * LIBGAV1_RESTRICT src,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const __m256i * const taps)462*09537850SAkhilesh Sanikop void Filter2DVertical16xH(const uint16_t* LIBGAV1_RESTRICT src,
463*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
464*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int width,
465*09537850SAkhilesh Sanikop const int height, const __m256i* const taps) {
466*09537850SAkhilesh Sanikop assert(width >= 8);
467*09537850SAkhilesh Sanikop constexpr int next_row = num_taps - 1;
468*09537850SAkhilesh Sanikop // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
469*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = width;
470*09537850SAkhilesh Sanikop
471*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
472*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
473*09537850SAkhilesh Sanikop
474*09537850SAkhilesh Sanikop int x = 0;
475*09537850SAkhilesh Sanikop do {
476*09537850SAkhilesh Sanikop __m256i srcs[8];
477*09537850SAkhilesh Sanikop const uint16_t* src_x = src + x;
478*09537850SAkhilesh Sanikop srcs[0] = LoadAligned32(src_x);
479*09537850SAkhilesh Sanikop src_x += src_stride;
480*09537850SAkhilesh Sanikop if (num_taps >= 4) {
481*09537850SAkhilesh Sanikop srcs[1] = LoadAligned32(src_x);
482*09537850SAkhilesh Sanikop src_x += src_stride;
483*09537850SAkhilesh Sanikop srcs[2] = LoadAligned32(src_x);
484*09537850SAkhilesh Sanikop src_x += src_stride;
485*09537850SAkhilesh Sanikop if (num_taps >= 6) {
486*09537850SAkhilesh Sanikop srcs[3] = LoadAligned32(src_x);
487*09537850SAkhilesh Sanikop src_x += src_stride;
488*09537850SAkhilesh Sanikop srcs[4] = LoadAligned32(src_x);
489*09537850SAkhilesh Sanikop src_x += src_stride;
490*09537850SAkhilesh Sanikop if (num_taps == 8) {
491*09537850SAkhilesh Sanikop srcs[5] = LoadAligned32(src_x);
492*09537850SAkhilesh Sanikop src_x += src_stride;
493*09537850SAkhilesh Sanikop srcs[6] = LoadAligned32(src_x);
494*09537850SAkhilesh Sanikop src_x += src_stride;
495*09537850SAkhilesh Sanikop }
496*09537850SAkhilesh Sanikop }
497*09537850SAkhilesh Sanikop }
498*09537850SAkhilesh Sanikop
499*09537850SAkhilesh Sanikop auto* dst8_x = dst8 + x;
500*09537850SAkhilesh Sanikop auto* dst16_x = dst16 + x;
501*09537850SAkhilesh Sanikop int y = height;
502*09537850SAkhilesh Sanikop do {
503*09537850SAkhilesh Sanikop srcs[next_row] = LoadAligned32(src_x);
504*09537850SAkhilesh Sanikop src_x += src_stride;
505*09537850SAkhilesh Sanikop
506*09537850SAkhilesh Sanikop const __m256i sum =
507*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
508*09537850SAkhilesh Sanikop if (is_compound) {
509*09537850SAkhilesh Sanikop StoreUnaligned32(dst16_x, sum);
510*09537850SAkhilesh Sanikop dst16_x += dst_stride;
511*09537850SAkhilesh Sanikop } else {
512*09537850SAkhilesh Sanikop const __m128i packed_sum = _mm_packus_epi16(
513*09537850SAkhilesh Sanikop _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
514*09537850SAkhilesh Sanikop StoreUnaligned16(dst8_x, packed_sum);
515*09537850SAkhilesh Sanikop dst8_x += dst_stride;
516*09537850SAkhilesh Sanikop }
517*09537850SAkhilesh Sanikop
518*09537850SAkhilesh Sanikop srcs[0] = srcs[1];
519*09537850SAkhilesh Sanikop if (num_taps >= 4) {
520*09537850SAkhilesh Sanikop srcs[1] = srcs[2];
521*09537850SAkhilesh Sanikop srcs[2] = srcs[3];
522*09537850SAkhilesh Sanikop if (num_taps >= 6) {
523*09537850SAkhilesh Sanikop srcs[3] = srcs[4];
524*09537850SAkhilesh Sanikop srcs[4] = srcs[5];
525*09537850SAkhilesh Sanikop if (num_taps == 8) {
526*09537850SAkhilesh Sanikop srcs[5] = srcs[6];
527*09537850SAkhilesh Sanikop srcs[6] = srcs[7];
528*09537850SAkhilesh Sanikop }
529*09537850SAkhilesh Sanikop }
530*09537850SAkhilesh Sanikop }
531*09537850SAkhilesh Sanikop } while (--y != 0);
532*09537850SAkhilesh Sanikop x += 16;
533*09537850SAkhilesh Sanikop } while (x < width);
534*09537850SAkhilesh Sanikop }
535*09537850SAkhilesh Sanikop
536*09537850SAkhilesh Sanikop template <bool is_2d = false, bool is_compound = false>
DoHorizontalPass2xH(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const int filter_id,const int filter_index)537*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
538*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
539*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
540*09537850SAkhilesh Sanikop const int width, const int height, const int filter_id,
541*09537850SAkhilesh Sanikop const int filter_index) {
542*09537850SAkhilesh Sanikop assert(filter_id != 0);
543*09537850SAkhilesh Sanikop __m128i v_tap[4];
544*09537850SAkhilesh Sanikop const __m128i v_horizontal_filter =
545*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
546*09537850SAkhilesh Sanikop
547*09537850SAkhilesh Sanikop if ((filter_index & 0x4) != 0) { // 4 tap.
548*09537850SAkhilesh Sanikop // ((filter_index == 4) | (filter_index == 5))
549*09537850SAkhilesh Sanikop SetupTaps<4>(&v_horizontal_filter, v_tap);
550*09537850SAkhilesh Sanikop FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
551*09537850SAkhilesh Sanikop width, height, v_tap);
552*09537850SAkhilesh Sanikop } else { // 2 tap.
553*09537850SAkhilesh Sanikop SetupTaps<2>(&v_horizontal_filter, v_tap);
554*09537850SAkhilesh Sanikop FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
555*09537850SAkhilesh Sanikop width, height, v_tap);
556*09537850SAkhilesh Sanikop }
557*09537850SAkhilesh Sanikop }
558*09537850SAkhilesh Sanikop
559*09537850SAkhilesh Sanikop template <bool is_2d = false, bool is_compound = false>
DoHorizontalPass(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const int filter_id,const int filter_index)560*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
561*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
562*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
563*09537850SAkhilesh Sanikop const int width, const int height, const int filter_id,
564*09537850SAkhilesh Sanikop const int filter_index) {
565*09537850SAkhilesh Sanikop assert(filter_id != 0);
566*09537850SAkhilesh Sanikop __m256i v_tap[4];
567*09537850SAkhilesh Sanikop const __m128i v_horizontal_filter =
568*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
569*09537850SAkhilesh Sanikop
570*09537850SAkhilesh Sanikop if (filter_index == 2) { // 8 tap.
571*09537850SAkhilesh Sanikop SetupTaps<8>(&v_horizontal_filter, v_tap);
572*09537850SAkhilesh Sanikop FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
573*09537850SAkhilesh Sanikop width, height, v_tap);
574*09537850SAkhilesh Sanikop } else if (filter_index == 1) { // 6 tap.
575*09537850SAkhilesh Sanikop SetupTaps<6>(&v_horizontal_filter, v_tap);
576*09537850SAkhilesh Sanikop FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
577*09537850SAkhilesh Sanikop width, height, v_tap);
578*09537850SAkhilesh Sanikop } else if (filter_index == 0) { // 6 tap.
579*09537850SAkhilesh Sanikop SetupTaps<6>(&v_horizontal_filter, v_tap);
580*09537850SAkhilesh Sanikop FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
581*09537850SAkhilesh Sanikop width, height, v_tap);
582*09537850SAkhilesh Sanikop } else if ((filter_index & 0x4) != 0) { // 4 tap.
583*09537850SAkhilesh Sanikop // ((filter_index == 4) | (filter_index == 5))
584*09537850SAkhilesh Sanikop SetupTaps<4>(&v_horizontal_filter, v_tap);
585*09537850SAkhilesh Sanikop FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
586*09537850SAkhilesh Sanikop width, height, v_tap);
587*09537850SAkhilesh Sanikop } else { // 2 tap.
588*09537850SAkhilesh Sanikop SetupTaps<2>(&v_horizontal_filter, v_tap);
589*09537850SAkhilesh Sanikop FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
590*09537850SAkhilesh Sanikop width, height, v_tap);
591*09537850SAkhilesh Sanikop }
592*09537850SAkhilesh Sanikop }
593*09537850SAkhilesh Sanikop
Convolve2D_AVX2(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)594*09537850SAkhilesh Sanikop void Convolve2D_AVX2(const void* LIBGAV1_RESTRICT const reference,
595*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride,
596*09537850SAkhilesh Sanikop const int horizontal_filter_index,
597*09537850SAkhilesh Sanikop const int vertical_filter_index,
598*09537850SAkhilesh Sanikop const int horizontal_filter_id,
599*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width,
600*09537850SAkhilesh Sanikop const int height, void* LIBGAV1_RESTRICT prediction,
601*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride) {
602*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
603*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
604*09537850SAkhilesh Sanikop const int vertical_taps =
605*09537850SAkhilesh Sanikop GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
606*09537850SAkhilesh Sanikop
607*09537850SAkhilesh Sanikop // The output of the horizontal filter is guaranteed to fit in 16 bits.
608*09537850SAkhilesh Sanikop alignas(32) uint16_t
609*09537850SAkhilesh Sanikop intermediate_result[kMaxSuperBlockSizeInPixels *
610*09537850SAkhilesh Sanikop (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
611*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
612*09537850SAkhilesh Sanikop // Quiet msan warnings. Set with random non-zero value to aid in debugging.
613*09537850SAkhilesh Sanikop memset(intermediate_result, 0x33, sizeof(intermediate_result));
614*09537850SAkhilesh Sanikop #endif
615*09537850SAkhilesh Sanikop const int intermediate_height = height + vertical_taps - 1;
616*09537850SAkhilesh Sanikop
617*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
618*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
619*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
620*09537850SAkhilesh Sanikop if (width > 2) {
621*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result,
622*09537850SAkhilesh Sanikop width, width, intermediate_height,
623*09537850SAkhilesh Sanikop horizontal_filter_id, horiz_filter_index);
624*09537850SAkhilesh Sanikop } else {
625*09537850SAkhilesh Sanikop // Use non avx2 version for smaller widths.
626*09537850SAkhilesh Sanikop DoHorizontalPass2xH</*is_2d=*/true>(
627*09537850SAkhilesh Sanikop src, src_stride, intermediate_result, width, width, intermediate_height,
628*09537850SAkhilesh Sanikop horizontal_filter_id, horiz_filter_index);
629*09537850SAkhilesh Sanikop }
630*09537850SAkhilesh Sanikop
631*09537850SAkhilesh Sanikop // Vertical filter.
632*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
633*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = pred_stride;
634*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
635*09537850SAkhilesh Sanikop
636*09537850SAkhilesh Sanikop const __m128i v_filter =
637*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
638*09537850SAkhilesh Sanikop
639*09537850SAkhilesh Sanikop // Use 256 bits for width > 8.
640*09537850SAkhilesh Sanikop if (width > 8) {
641*09537850SAkhilesh Sanikop __m256i taps_256[4];
642*09537850SAkhilesh Sanikop const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
643*09537850SAkhilesh Sanikop
644*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
645*09537850SAkhilesh Sanikop SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
646*09537850SAkhilesh Sanikop Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width,
647*09537850SAkhilesh Sanikop height, taps_256);
648*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
649*09537850SAkhilesh Sanikop SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
650*09537850SAkhilesh Sanikop Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width,
651*09537850SAkhilesh Sanikop height, taps_256);
652*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
653*09537850SAkhilesh Sanikop SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
654*09537850SAkhilesh Sanikop Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width,
655*09537850SAkhilesh Sanikop height, taps_256);
656*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
657*09537850SAkhilesh Sanikop SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
658*09537850SAkhilesh Sanikop Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width,
659*09537850SAkhilesh Sanikop height, taps_256);
660*09537850SAkhilesh Sanikop }
661*09537850SAkhilesh Sanikop } else { // width <= 8
662*09537850SAkhilesh Sanikop __m128i taps[4];
663*09537850SAkhilesh Sanikop // Use 128 bit code.
664*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
665*09537850SAkhilesh Sanikop SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
666*09537850SAkhilesh Sanikop if (width == 2) {
667*09537850SAkhilesh Sanikop Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
668*09537850SAkhilesh Sanikop taps);
669*09537850SAkhilesh Sanikop } else if (width == 4) {
670*09537850SAkhilesh Sanikop Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
671*09537850SAkhilesh Sanikop taps);
672*09537850SAkhilesh Sanikop } else {
673*09537850SAkhilesh Sanikop Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
674*09537850SAkhilesh Sanikop height, taps);
675*09537850SAkhilesh Sanikop }
676*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
677*09537850SAkhilesh Sanikop SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
678*09537850SAkhilesh Sanikop if (width == 2) {
679*09537850SAkhilesh Sanikop Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
680*09537850SAkhilesh Sanikop taps);
681*09537850SAkhilesh Sanikop } else if (width == 4) {
682*09537850SAkhilesh Sanikop Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
683*09537850SAkhilesh Sanikop taps);
684*09537850SAkhilesh Sanikop } else {
685*09537850SAkhilesh Sanikop Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
686*09537850SAkhilesh Sanikop height, taps);
687*09537850SAkhilesh Sanikop }
688*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
689*09537850SAkhilesh Sanikop SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
690*09537850SAkhilesh Sanikop if (width == 2) {
691*09537850SAkhilesh Sanikop Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
692*09537850SAkhilesh Sanikop taps);
693*09537850SAkhilesh Sanikop } else if (width == 4) {
694*09537850SAkhilesh Sanikop Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
695*09537850SAkhilesh Sanikop taps);
696*09537850SAkhilesh Sanikop } else {
697*09537850SAkhilesh Sanikop Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
698*09537850SAkhilesh Sanikop height, taps);
699*09537850SAkhilesh Sanikop }
700*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
701*09537850SAkhilesh Sanikop SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
702*09537850SAkhilesh Sanikop if (width == 2) {
703*09537850SAkhilesh Sanikop Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
704*09537850SAkhilesh Sanikop taps);
705*09537850SAkhilesh Sanikop } else if (width == 4) {
706*09537850SAkhilesh Sanikop Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
707*09537850SAkhilesh Sanikop taps);
708*09537850SAkhilesh Sanikop } else {
709*09537850SAkhilesh Sanikop Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
710*09537850SAkhilesh Sanikop height, taps);
711*09537850SAkhilesh Sanikop }
712*09537850SAkhilesh Sanikop }
713*09537850SAkhilesh Sanikop }
714*09537850SAkhilesh Sanikop }
715*09537850SAkhilesh Sanikop
716*09537850SAkhilesh Sanikop // The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
717*09537850SAkhilesh Sanikop // Vertical calculations.
Compound1DShift(const __m256i sum)718*09537850SAkhilesh Sanikop __m256i Compound1DShift(const __m256i sum) {
719*09537850SAkhilesh Sanikop return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
720*09537850SAkhilesh Sanikop }
721*09537850SAkhilesh Sanikop
722*09537850SAkhilesh Sanikop template <int num_taps, bool unpack_high = false>
SumVerticalTaps(const __m256i * const srcs,const __m256i * const v_tap)723*09537850SAkhilesh Sanikop __m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
724*09537850SAkhilesh Sanikop __m256i v_src[4];
725*09537850SAkhilesh Sanikop
726*09537850SAkhilesh Sanikop if (!unpack_high) {
727*09537850SAkhilesh Sanikop if (num_taps == 6) {
728*09537850SAkhilesh Sanikop // 6 taps.
729*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
730*09537850SAkhilesh Sanikop v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
731*09537850SAkhilesh Sanikop v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
732*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
733*09537850SAkhilesh Sanikop // 8 taps.
734*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
735*09537850SAkhilesh Sanikop v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
736*09537850SAkhilesh Sanikop v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
737*09537850SAkhilesh Sanikop v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
738*09537850SAkhilesh Sanikop } else if (num_taps == 2) {
739*09537850SAkhilesh Sanikop // 2 taps.
740*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
741*09537850SAkhilesh Sanikop } else {
742*09537850SAkhilesh Sanikop // 4 taps.
743*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
744*09537850SAkhilesh Sanikop v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
745*09537850SAkhilesh Sanikop }
746*09537850SAkhilesh Sanikop } else {
747*09537850SAkhilesh Sanikop if (num_taps == 6) {
748*09537850SAkhilesh Sanikop // 6 taps.
749*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
750*09537850SAkhilesh Sanikop v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
751*09537850SAkhilesh Sanikop v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
752*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
753*09537850SAkhilesh Sanikop // 8 taps.
754*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
755*09537850SAkhilesh Sanikop v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
756*09537850SAkhilesh Sanikop v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
757*09537850SAkhilesh Sanikop v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
758*09537850SAkhilesh Sanikop } else if (num_taps == 2) {
759*09537850SAkhilesh Sanikop // 2 taps.
760*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
761*09537850SAkhilesh Sanikop } else {
762*09537850SAkhilesh Sanikop // 4 taps.
763*09537850SAkhilesh Sanikop v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
764*09537850SAkhilesh Sanikop v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
765*09537850SAkhilesh Sanikop }
766*09537850SAkhilesh Sanikop }
767*09537850SAkhilesh Sanikop return SumOnePassTaps<num_taps>(v_src, v_tap);
768*09537850SAkhilesh Sanikop }
769*09537850SAkhilesh Sanikop
770*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
FilterVertical32xH(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const __m256i * const v_tap)771*09537850SAkhilesh Sanikop void FilterVertical32xH(const uint8_t* LIBGAV1_RESTRICT src,
772*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
773*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
774*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int width,
775*09537850SAkhilesh Sanikop const int height, const __m256i* const v_tap) {
776*09537850SAkhilesh Sanikop const int next_row = num_taps - 1;
777*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
778*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
779*09537850SAkhilesh Sanikop assert(width >= 32);
780*09537850SAkhilesh Sanikop int x = 0;
781*09537850SAkhilesh Sanikop do {
782*09537850SAkhilesh Sanikop const uint8_t* src_x = src + x;
783*09537850SAkhilesh Sanikop __m256i srcs[8];
784*09537850SAkhilesh Sanikop srcs[0] = LoadUnaligned32(src_x);
785*09537850SAkhilesh Sanikop src_x += src_stride;
786*09537850SAkhilesh Sanikop if (num_taps >= 4) {
787*09537850SAkhilesh Sanikop srcs[1] = LoadUnaligned32(src_x);
788*09537850SAkhilesh Sanikop src_x += src_stride;
789*09537850SAkhilesh Sanikop srcs[2] = LoadUnaligned32(src_x);
790*09537850SAkhilesh Sanikop src_x += src_stride;
791*09537850SAkhilesh Sanikop if (num_taps >= 6) {
792*09537850SAkhilesh Sanikop srcs[3] = LoadUnaligned32(src_x);
793*09537850SAkhilesh Sanikop src_x += src_stride;
794*09537850SAkhilesh Sanikop srcs[4] = LoadUnaligned32(src_x);
795*09537850SAkhilesh Sanikop src_x += src_stride;
796*09537850SAkhilesh Sanikop if (num_taps == 8) {
797*09537850SAkhilesh Sanikop srcs[5] = LoadUnaligned32(src_x);
798*09537850SAkhilesh Sanikop src_x += src_stride;
799*09537850SAkhilesh Sanikop srcs[6] = LoadUnaligned32(src_x);
800*09537850SAkhilesh Sanikop src_x += src_stride;
801*09537850SAkhilesh Sanikop }
802*09537850SAkhilesh Sanikop }
803*09537850SAkhilesh Sanikop }
804*09537850SAkhilesh Sanikop
805*09537850SAkhilesh Sanikop auto* dst8_x = dst8 + x;
806*09537850SAkhilesh Sanikop auto* dst16_x = dst16 + x;
807*09537850SAkhilesh Sanikop int y = height;
808*09537850SAkhilesh Sanikop do {
809*09537850SAkhilesh Sanikop srcs[next_row] = LoadUnaligned32(src_x);
810*09537850SAkhilesh Sanikop src_x += src_stride;
811*09537850SAkhilesh Sanikop
812*09537850SAkhilesh Sanikop const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
813*09537850SAkhilesh Sanikop const __m256i sums_hi =
814*09537850SAkhilesh Sanikop SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
815*09537850SAkhilesh Sanikop if (is_compound) {
816*09537850SAkhilesh Sanikop const __m256i results =
817*09537850SAkhilesh Sanikop Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
818*09537850SAkhilesh Sanikop const __m256i results_hi =
819*09537850SAkhilesh Sanikop Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
820*09537850SAkhilesh Sanikop StoreUnaligned32(dst16_x, results);
821*09537850SAkhilesh Sanikop StoreUnaligned32(dst16_x + 16, results_hi);
822*09537850SAkhilesh Sanikop dst16_x += dst_stride;
823*09537850SAkhilesh Sanikop } else {
824*09537850SAkhilesh Sanikop const __m256i results =
825*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1);
826*09537850SAkhilesh Sanikop const __m256i results_hi =
827*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
828*09537850SAkhilesh Sanikop const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
829*09537850SAkhilesh Sanikop
830*09537850SAkhilesh Sanikop StoreUnaligned32(dst8_x, packed_results);
831*09537850SAkhilesh Sanikop dst8_x += dst_stride;
832*09537850SAkhilesh Sanikop }
833*09537850SAkhilesh Sanikop
834*09537850SAkhilesh Sanikop srcs[0] = srcs[1];
835*09537850SAkhilesh Sanikop if (num_taps >= 4) {
836*09537850SAkhilesh Sanikop srcs[1] = srcs[2];
837*09537850SAkhilesh Sanikop srcs[2] = srcs[3];
838*09537850SAkhilesh Sanikop if (num_taps >= 6) {
839*09537850SAkhilesh Sanikop srcs[3] = srcs[4];
840*09537850SAkhilesh Sanikop srcs[4] = srcs[5];
841*09537850SAkhilesh Sanikop if (num_taps == 8) {
842*09537850SAkhilesh Sanikop srcs[5] = srcs[6];
843*09537850SAkhilesh Sanikop srcs[6] = srcs[7];
844*09537850SAkhilesh Sanikop }
845*09537850SAkhilesh Sanikop }
846*09537850SAkhilesh Sanikop }
847*09537850SAkhilesh Sanikop } while (--y != 0);
848*09537850SAkhilesh Sanikop x += 32;
849*09537850SAkhilesh Sanikop } while (x < width);
850*09537850SAkhilesh Sanikop }
851*09537850SAkhilesh Sanikop
852*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
FilterVertical16xH(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int,const int height,const __m256i * const v_tap)853*09537850SAkhilesh Sanikop void FilterVertical16xH(const uint8_t* LIBGAV1_RESTRICT src,
854*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
855*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
856*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int /*width*/,
857*09537850SAkhilesh Sanikop const int height, const __m256i* const v_tap) {
858*09537850SAkhilesh Sanikop const int next_row = num_taps;
859*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
860*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
861*09537850SAkhilesh Sanikop
862*09537850SAkhilesh Sanikop const uint8_t* src_x = src;
863*09537850SAkhilesh Sanikop __m256i srcs[8 + 1];
864*09537850SAkhilesh Sanikop // The upper 128 bits hold the filter data for the next row.
865*09537850SAkhilesh Sanikop srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
866*09537850SAkhilesh Sanikop src_x += src_stride;
867*09537850SAkhilesh Sanikop if (num_taps >= 4) {
868*09537850SAkhilesh Sanikop srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
869*09537850SAkhilesh Sanikop src_x += src_stride;
870*09537850SAkhilesh Sanikop srcs[0] =
871*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
872*09537850SAkhilesh Sanikop srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
873*09537850SAkhilesh Sanikop src_x += src_stride;
874*09537850SAkhilesh Sanikop srcs[1] =
875*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
876*09537850SAkhilesh Sanikop if (num_taps >= 6) {
877*09537850SAkhilesh Sanikop srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
878*09537850SAkhilesh Sanikop src_x += src_stride;
879*09537850SAkhilesh Sanikop srcs[2] =
880*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
881*09537850SAkhilesh Sanikop srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
882*09537850SAkhilesh Sanikop src_x += src_stride;
883*09537850SAkhilesh Sanikop srcs[3] =
884*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
885*09537850SAkhilesh Sanikop if (num_taps == 8) {
886*09537850SAkhilesh Sanikop srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
887*09537850SAkhilesh Sanikop src_x += src_stride;
888*09537850SAkhilesh Sanikop srcs[4] = _mm256_inserti128_si256(srcs[4],
889*09537850SAkhilesh Sanikop _mm256_castsi256_si128(srcs[5]), 1);
890*09537850SAkhilesh Sanikop srcs[6] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
891*09537850SAkhilesh Sanikop src_x += src_stride;
892*09537850SAkhilesh Sanikop srcs[5] = _mm256_inserti128_si256(srcs[5],
893*09537850SAkhilesh Sanikop _mm256_castsi256_si128(srcs[6]), 1);
894*09537850SAkhilesh Sanikop }
895*09537850SAkhilesh Sanikop }
896*09537850SAkhilesh Sanikop }
897*09537850SAkhilesh Sanikop
898*09537850SAkhilesh Sanikop int y = height;
899*09537850SAkhilesh Sanikop do {
900*09537850SAkhilesh Sanikop srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
901*09537850SAkhilesh Sanikop src_x += src_stride;
902*09537850SAkhilesh Sanikop
903*09537850SAkhilesh Sanikop srcs[next_row - 2] = _mm256_inserti128_si256(
904*09537850SAkhilesh Sanikop srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
905*09537850SAkhilesh Sanikop
906*09537850SAkhilesh Sanikop srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
907*09537850SAkhilesh Sanikop src_x += src_stride;
908*09537850SAkhilesh Sanikop
909*09537850SAkhilesh Sanikop srcs[next_row - 1] = _mm256_inserti128_si256(
910*09537850SAkhilesh Sanikop srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
911*09537850SAkhilesh Sanikop
912*09537850SAkhilesh Sanikop const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
913*09537850SAkhilesh Sanikop const __m256i sums_hi =
914*09537850SAkhilesh Sanikop SumVerticalTaps<num_taps, /*unpack_high=*/true>(srcs, v_tap);
915*09537850SAkhilesh Sanikop if (is_compound) {
916*09537850SAkhilesh Sanikop const __m256i results =
917*09537850SAkhilesh Sanikop Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
918*09537850SAkhilesh Sanikop const __m256i results_hi =
919*09537850SAkhilesh Sanikop Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
920*09537850SAkhilesh Sanikop
921*09537850SAkhilesh Sanikop StoreUnaligned32(dst16, results);
922*09537850SAkhilesh Sanikop StoreUnaligned32(dst16 + dst_stride, results_hi);
923*09537850SAkhilesh Sanikop dst16 += dst_stride << 1;
924*09537850SAkhilesh Sanikop } else {
925*09537850SAkhilesh Sanikop const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
926*09537850SAkhilesh Sanikop const __m256i results_hi =
927*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
928*09537850SAkhilesh Sanikop const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
929*09537850SAkhilesh Sanikop const __m128i this_dst = _mm256_castsi256_si128(packed_results);
930*09537850SAkhilesh Sanikop const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
931*09537850SAkhilesh Sanikop
932*09537850SAkhilesh Sanikop StoreUnaligned16(dst8, this_dst);
933*09537850SAkhilesh Sanikop StoreUnaligned16(dst8 + dst_stride, next_dst);
934*09537850SAkhilesh Sanikop dst8 += dst_stride << 1;
935*09537850SAkhilesh Sanikop }
936*09537850SAkhilesh Sanikop
937*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
938*09537850SAkhilesh Sanikop if (num_taps >= 4) {
939*09537850SAkhilesh Sanikop srcs[1] = srcs[3];
940*09537850SAkhilesh Sanikop srcs[2] = srcs[4];
941*09537850SAkhilesh Sanikop if (num_taps >= 6) {
942*09537850SAkhilesh Sanikop srcs[3] = srcs[5];
943*09537850SAkhilesh Sanikop srcs[4] = srcs[6];
944*09537850SAkhilesh Sanikop if (num_taps == 8) {
945*09537850SAkhilesh Sanikop srcs[5] = srcs[7];
946*09537850SAkhilesh Sanikop srcs[6] = srcs[8];
947*09537850SAkhilesh Sanikop }
948*09537850SAkhilesh Sanikop }
949*09537850SAkhilesh Sanikop }
950*09537850SAkhilesh Sanikop y -= 2;
951*09537850SAkhilesh Sanikop } while (y != 0);
952*09537850SAkhilesh Sanikop }
953*09537850SAkhilesh Sanikop
954*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
FilterVertical8xH(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int,const int height,const __m256i * const v_tap)955*09537850SAkhilesh Sanikop void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
956*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
957*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
958*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int /*width*/,
959*09537850SAkhilesh Sanikop const int height, const __m256i* const v_tap) {
960*09537850SAkhilesh Sanikop const int next_row = num_taps;
961*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
962*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
963*09537850SAkhilesh Sanikop
964*09537850SAkhilesh Sanikop const uint8_t* src_x = src;
965*09537850SAkhilesh Sanikop __m256i srcs[8 + 1];
966*09537850SAkhilesh Sanikop // The upper 128 bits hold the filter data for the next row.
967*09537850SAkhilesh Sanikop srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x));
968*09537850SAkhilesh Sanikop src_x += src_stride;
969*09537850SAkhilesh Sanikop if (num_taps >= 4) {
970*09537850SAkhilesh Sanikop srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x));
971*09537850SAkhilesh Sanikop src_x += src_stride;
972*09537850SAkhilesh Sanikop srcs[0] =
973*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
974*09537850SAkhilesh Sanikop srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x));
975*09537850SAkhilesh Sanikop src_x += src_stride;
976*09537850SAkhilesh Sanikop srcs[1] =
977*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
978*09537850SAkhilesh Sanikop if (num_taps >= 6) {
979*09537850SAkhilesh Sanikop srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x));
980*09537850SAkhilesh Sanikop src_x += src_stride;
981*09537850SAkhilesh Sanikop srcs[2] =
982*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
983*09537850SAkhilesh Sanikop srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x));
984*09537850SAkhilesh Sanikop src_x += src_stride;
985*09537850SAkhilesh Sanikop srcs[3] =
986*09537850SAkhilesh Sanikop _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
987*09537850SAkhilesh Sanikop if (num_taps == 8) {
988*09537850SAkhilesh Sanikop srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x));
989*09537850SAkhilesh Sanikop src_x += src_stride;
990*09537850SAkhilesh Sanikop srcs[4] = _mm256_inserti128_si256(srcs[4],
991*09537850SAkhilesh Sanikop _mm256_castsi256_si128(srcs[5]), 1);
992*09537850SAkhilesh Sanikop srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x));
993*09537850SAkhilesh Sanikop src_x += src_stride;
994*09537850SAkhilesh Sanikop srcs[5] = _mm256_inserti128_si256(srcs[5],
995*09537850SAkhilesh Sanikop _mm256_castsi256_si128(srcs[6]), 1);
996*09537850SAkhilesh Sanikop }
997*09537850SAkhilesh Sanikop }
998*09537850SAkhilesh Sanikop }
999*09537850SAkhilesh Sanikop
1000*09537850SAkhilesh Sanikop int y = height;
1001*09537850SAkhilesh Sanikop do {
1002*09537850SAkhilesh Sanikop srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x));
1003*09537850SAkhilesh Sanikop src_x += src_stride;
1004*09537850SAkhilesh Sanikop
1005*09537850SAkhilesh Sanikop srcs[next_row - 2] = _mm256_inserti128_si256(
1006*09537850SAkhilesh Sanikop srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
1007*09537850SAkhilesh Sanikop
1008*09537850SAkhilesh Sanikop srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x));
1009*09537850SAkhilesh Sanikop src_x += src_stride;
1010*09537850SAkhilesh Sanikop
1011*09537850SAkhilesh Sanikop srcs[next_row - 1] = _mm256_inserti128_si256(
1012*09537850SAkhilesh Sanikop srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
1013*09537850SAkhilesh Sanikop
1014*09537850SAkhilesh Sanikop const __m256i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
1015*09537850SAkhilesh Sanikop if (is_compound) {
1016*09537850SAkhilesh Sanikop const __m256i results = Compound1DShift(sums);
1017*09537850SAkhilesh Sanikop const __m128i this_dst = _mm256_castsi256_si128(results);
1018*09537850SAkhilesh Sanikop const auto next_dst = _mm256_extracti128_si256(results, 1);
1019*09537850SAkhilesh Sanikop
1020*09537850SAkhilesh Sanikop StoreUnaligned16(dst16, this_dst);
1021*09537850SAkhilesh Sanikop StoreUnaligned16(dst16 + dst_stride, next_dst);
1022*09537850SAkhilesh Sanikop dst16 += dst_stride << 1;
1023*09537850SAkhilesh Sanikop } else {
1024*09537850SAkhilesh Sanikop const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
1025*09537850SAkhilesh Sanikop const __m256i packed_results = _mm256_packus_epi16(results, results);
1026*09537850SAkhilesh Sanikop const __m128i this_dst = _mm256_castsi256_si128(packed_results);
1027*09537850SAkhilesh Sanikop const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
1028*09537850SAkhilesh Sanikop
1029*09537850SAkhilesh Sanikop StoreLo8(dst8, this_dst);
1030*09537850SAkhilesh Sanikop StoreLo8(dst8 + dst_stride, next_dst);
1031*09537850SAkhilesh Sanikop dst8 += dst_stride << 1;
1032*09537850SAkhilesh Sanikop }
1033*09537850SAkhilesh Sanikop
1034*09537850SAkhilesh Sanikop srcs[0] = srcs[2];
1035*09537850SAkhilesh Sanikop if (num_taps >= 4) {
1036*09537850SAkhilesh Sanikop srcs[1] = srcs[3];
1037*09537850SAkhilesh Sanikop srcs[2] = srcs[4];
1038*09537850SAkhilesh Sanikop if (num_taps >= 6) {
1039*09537850SAkhilesh Sanikop srcs[3] = srcs[5];
1040*09537850SAkhilesh Sanikop srcs[4] = srcs[6];
1041*09537850SAkhilesh Sanikop if (num_taps == 8) {
1042*09537850SAkhilesh Sanikop srcs[5] = srcs[7];
1043*09537850SAkhilesh Sanikop srcs[6] = srcs[8];
1044*09537850SAkhilesh Sanikop }
1045*09537850SAkhilesh Sanikop }
1046*09537850SAkhilesh Sanikop }
1047*09537850SAkhilesh Sanikop y -= 2;
1048*09537850SAkhilesh Sanikop } while (y != 0);
1049*09537850SAkhilesh Sanikop }
1050*09537850SAkhilesh Sanikop
1051*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
FilterVertical8xH(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int,const int height,const __m128i * const v_tap)1052*09537850SAkhilesh Sanikop void FilterVertical8xH(const uint8_t* LIBGAV1_RESTRICT src,
1053*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
1054*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
1055*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int /*width*/,
1056*09537850SAkhilesh Sanikop const int height, const __m128i* const v_tap) {
1057*09537850SAkhilesh Sanikop const int next_row = num_taps - 1;
1058*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
1059*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
1060*09537850SAkhilesh Sanikop
1061*09537850SAkhilesh Sanikop const uint8_t* src_x = src;
1062*09537850SAkhilesh Sanikop __m128i srcs[8];
1063*09537850SAkhilesh Sanikop srcs[0] = LoadLo8(src_x);
1064*09537850SAkhilesh Sanikop src_x += src_stride;
1065*09537850SAkhilesh Sanikop if (num_taps >= 4) {
1066*09537850SAkhilesh Sanikop srcs[1] = LoadLo8(src_x);
1067*09537850SAkhilesh Sanikop src_x += src_stride;
1068*09537850SAkhilesh Sanikop srcs[2] = LoadLo8(src_x);
1069*09537850SAkhilesh Sanikop src_x += src_stride;
1070*09537850SAkhilesh Sanikop if (num_taps >= 6) {
1071*09537850SAkhilesh Sanikop srcs[3] = LoadLo8(src_x);
1072*09537850SAkhilesh Sanikop src_x += src_stride;
1073*09537850SAkhilesh Sanikop srcs[4] = LoadLo8(src_x);
1074*09537850SAkhilesh Sanikop src_x += src_stride;
1075*09537850SAkhilesh Sanikop if (num_taps == 8) {
1076*09537850SAkhilesh Sanikop srcs[5] = LoadLo8(src_x);
1077*09537850SAkhilesh Sanikop src_x += src_stride;
1078*09537850SAkhilesh Sanikop srcs[6] = LoadLo8(src_x);
1079*09537850SAkhilesh Sanikop src_x += src_stride;
1080*09537850SAkhilesh Sanikop }
1081*09537850SAkhilesh Sanikop }
1082*09537850SAkhilesh Sanikop }
1083*09537850SAkhilesh Sanikop
1084*09537850SAkhilesh Sanikop int y = height;
1085*09537850SAkhilesh Sanikop do {
1086*09537850SAkhilesh Sanikop srcs[next_row] = LoadLo8(src_x);
1087*09537850SAkhilesh Sanikop src_x += src_stride;
1088*09537850SAkhilesh Sanikop
1089*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
1090*09537850SAkhilesh Sanikop if (is_compound) {
1091*09537850SAkhilesh Sanikop const __m128i results = Compound1DShift(sums);
1092*09537850SAkhilesh Sanikop StoreUnaligned16(dst16, results);
1093*09537850SAkhilesh Sanikop dst16 += dst_stride;
1094*09537850SAkhilesh Sanikop } else {
1095*09537850SAkhilesh Sanikop const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
1096*09537850SAkhilesh Sanikop StoreLo8(dst8, _mm_packus_epi16(results, results));
1097*09537850SAkhilesh Sanikop dst8 += dst_stride;
1098*09537850SAkhilesh Sanikop }
1099*09537850SAkhilesh Sanikop
1100*09537850SAkhilesh Sanikop srcs[0] = srcs[1];
1101*09537850SAkhilesh Sanikop if (num_taps >= 4) {
1102*09537850SAkhilesh Sanikop srcs[1] = srcs[2];
1103*09537850SAkhilesh Sanikop srcs[2] = srcs[3];
1104*09537850SAkhilesh Sanikop if (num_taps >= 6) {
1105*09537850SAkhilesh Sanikop srcs[3] = srcs[4];
1106*09537850SAkhilesh Sanikop srcs[4] = srcs[5];
1107*09537850SAkhilesh Sanikop if (num_taps == 8) {
1108*09537850SAkhilesh Sanikop srcs[5] = srcs[6];
1109*09537850SAkhilesh Sanikop srcs[6] = srcs[7];
1110*09537850SAkhilesh Sanikop }
1111*09537850SAkhilesh Sanikop }
1112*09537850SAkhilesh Sanikop }
1113*09537850SAkhilesh Sanikop } while (--y != 0);
1114*09537850SAkhilesh Sanikop }
1115*09537850SAkhilesh Sanikop
ConvolveVertical_AVX2(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)1116*09537850SAkhilesh Sanikop void ConvolveVertical_AVX2(const void* LIBGAV1_RESTRICT const reference,
1117*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride,
1118*09537850SAkhilesh Sanikop const int /*horizontal_filter_index*/,
1119*09537850SAkhilesh Sanikop const int vertical_filter_index,
1120*09537850SAkhilesh Sanikop const int /*horizontal_filter_id*/,
1121*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width,
1122*09537850SAkhilesh Sanikop const int height, void* LIBGAV1_RESTRICT prediction,
1123*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride) {
1124*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(vertical_filter_index, height);
1125*09537850SAkhilesh Sanikop const int vertical_taps =
1126*09537850SAkhilesh Sanikop GetNumTapsInFilter(filter_index, vertical_filter_id);
1127*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
1128*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
1129*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride;
1130*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1131*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = pred_stride;
1132*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
1133*09537850SAkhilesh Sanikop
1134*09537850SAkhilesh Sanikop const __m128i v_filter =
1135*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
1136*09537850SAkhilesh Sanikop
1137*09537850SAkhilesh Sanikop // Use 256 bits for width > 4.
1138*09537850SAkhilesh Sanikop if (width > 4) {
1139*09537850SAkhilesh Sanikop __m256i taps_256[4];
1140*09537850SAkhilesh Sanikop if (vertical_taps == 6) { // 6 tap.
1141*09537850SAkhilesh Sanikop SetupTaps<6>(&v_filter, taps_256);
1142*09537850SAkhilesh Sanikop if (width == 8) {
1143*09537850SAkhilesh Sanikop FilterVertical8xH<6>(src, src_stride, dest, dest_stride, width, height,
1144*09537850SAkhilesh Sanikop taps_256);
1145*09537850SAkhilesh Sanikop } else if (width == 16) {
1146*09537850SAkhilesh Sanikop FilterVertical16xH<6>(src, src_stride, dest, dest_stride, width, height,
1147*09537850SAkhilesh Sanikop taps_256);
1148*09537850SAkhilesh Sanikop } else {
1149*09537850SAkhilesh Sanikop FilterVertical32xH<6>(src, src_stride, dest, dest_stride, width, height,
1150*09537850SAkhilesh Sanikop taps_256);
1151*09537850SAkhilesh Sanikop }
1152*09537850SAkhilesh Sanikop } else if (vertical_taps == 8) { // 8 tap.
1153*09537850SAkhilesh Sanikop SetupTaps<8>(&v_filter, taps_256);
1154*09537850SAkhilesh Sanikop if (width == 8) {
1155*09537850SAkhilesh Sanikop FilterVertical8xH<8>(src, src_stride, dest, dest_stride, width, height,
1156*09537850SAkhilesh Sanikop taps_256);
1157*09537850SAkhilesh Sanikop } else if (width == 16) {
1158*09537850SAkhilesh Sanikop FilterVertical16xH<8>(src, src_stride, dest, dest_stride, width, height,
1159*09537850SAkhilesh Sanikop taps_256);
1160*09537850SAkhilesh Sanikop } else {
1161*09537850SAkhilesh Sanikop FilterVertical32xH<8>(src, src_stride, dest, dest_stride, width, height,
1162*09537850SAkhilesh Sanikop taps_256);
1163*09537850SAkhilesh Sanikop }
1164*09537850SAkhilesh Sanikop } else if (vertical_taps == 2) { // 2 tap.
1165*09537850SAkhilesh Sanikop SetupTaps<2>(&v_filter, taps_256);
1166*09537850SAkhilesh Sanikop if (width == 8) {
1167*09537850SAkhilesh Sanikop FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
1168*09537850SAkhilesh Sanikop taps_256);
1169*09537850SAkhilesh Sanikop } else if (width == 16) {
1170*09537850SAkhilesh Sanikop FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
1171*09537850SAkhilesh Sanikop taps_256);
1172*09537850SAkhilesh Sanikop } else {
1173*09537850SAkhilesh Sanikop FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
1174*09537850SAkhilesh Sanikop taps_256);
1175*09537850SAkhilesh Sanikop }
1176*09537850SAkhilesh Sanikop } else { // 4 tap.
1177*09537850SAkhilesh Sanikop SetupTaps<4>(&v_filter, taps_256);
1178*09537850SAkhilesh Sanikop if (width == 8) {
1179*09537850SAkhilesh Sanikop FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
1180*09537850SAkhilesh Sanikop taps_256);
1181*09537850SAkhilesh Sanikop } else if (width == 16) {
1182*09537850SAkhilesh Sanikop FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height,
1183*09537850SAkhilesh Sanikop taps_256);
1184*09537850SAkhilesh Sanikop } else {
1185*09537850SAkhilesh Sanikop FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
1186*09537850SAkhilesh Sanikop taps_256);
1187*09537850SAkhilesh Sanikop }
1188*09537850SAkhilesh Sanikop }
1189*09537850SAkhilesh Sanikop } else { // width <= 8
1190*09537850SAkhilesh Sanikop // Use 128 bit code.
1191*09537850SAkhilesh Sanikop __m128i taps[4];
1192*09537850SAkhilesh Sanikop
1193*09537850SAkhilesh Sanikop if (vertical_taps == 6) { // 6 tap.
1194*09537850SAkhilesh Sanikop SetupTaps<6>(&v_filter, taps);
1195*09537850SAkhilesh Sanikop if (width == 2) {
1196*09537850SAkhilesh Sanikop FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
1197*09537850SAkhilesh Sanikop } else {
1198*09537850SAkhilesh Sanikop FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
1199*09537850SAkhilesh Sanikop }
1200*09537850SAkhilesh Sanikop } else if (vertical_taps == 8) { // 8 tap.
1201*09537850SAkhilesh Sanikop SetupTaps<8>(&v_filter, taps);
1202*09537850SAkhilesh Sanikop if (width == 2) {
1203*09537850SAkhilesh Sanikop FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
1204*09537850SAkhilesh Sanikop } else {
1205*09537850SAkhilesh Sanikop FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
1206*09537850SAkhilesh Sanikop }
1207*09537850SAkhilesh Sanikop } else if (vertical_taps == 2) { // 2 tap.
1208*09537850SAkhilesh Sanikop SetupTaps<2>(&v_filter, taps);
1209*09537850SAkhilesh Sanikop if (width == 2) {
1210*09537850SAkhilesh Sanikop FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
1211*09537850SAkhilesh Sanikop } else {
1212*09537850SAkhilesh Sanikop FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
1213*09537850SAkhilesh Sanikop }
1214*09537850SAkhilesh Sanikop } else { // 4 tap.
1215*09537850SAkhilesh Sanikop SetupTaps<4>(&v_filter, taps);
1216*09537850SAkhilesh Sanikop if (width == 2) {
1217*09537850SAkhilesh Sanikop FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
1218*09537850SAkhilesh Sanikop } else {
1219*09537850SAkhilesh Sanikop FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
1220*09537850SAkhilesh Sanikop }
1221*09537850SAkhilesh Sanikop }
1222*09537850SAkhilesh Sanikop }
1223*09537850SAkhilesh Sanikop }
1224*09537850SAkhilesh Sanikop
ConvolveCompoundVertical_AVX2(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t)1225*09537850SAkhilesh Sanikop void ConvolveCompoundVertical_AVX2(
1226*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1227*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
1228*09537850SAkhilesh Sanikop const int vertical_filter_index, const int /*horizontal_filter_id*/,
1229*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
1230*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
1231*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(vertical_filter_index, height);
1232*09537850SAkhilesh Sanikop const int vertical_taps =
1233*09537850SAkhilesh Sanikop GetNumTapsInFilter(filter_index, vertical_filter_id);
1234*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
1235*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
1236*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride;
1237*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1238*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = width;
1239*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
1240*09537850SAkhilesh Sanikop
1241*09537850SAkhilesh Sanikop const __m128i v_filter =
1242*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
1243*09537850SAkhilesh Sanikop
1244*09537850SAkhilesh Sanikop // Use 256 bits for width > 4.
1245*09537850SAkhilesh Sanikop if (width > 4) {
1246*09537850SAkhilesh Sanikop __m256i taps_256[4];
1247*09537850SAkhilesh Sanikop if (vertical_taps == 6) { // 6 tap.
1248*09537850SAkhilesh Sanikop SetupTaps<6>(&v_filter, taps_256);
1249*09537850SAkhilesh Sanikop if (width == 8) {
1250*09537850SAkhilesh Sanikop FilterVertical8xH<6, /*is_compound=*/true>(
1251*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1252*09537850SAkhilesh Sanikop } else if (width == 16) {
1253*09537850SAkhilesh Sanikop FilterVertical16xH<6, /*is_compound=*/true>(
1254*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1255*09537850SAkhilesh Sanikop } else {
1256*09537850SAkhilesh Sanikop FilterVertical32xH<6, /*is_compound=*/true>(
1257*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1258*09537850SAkhilesh Sanikop }
1259*09537850SAkhilesh Sanikop } else if (vertical_taps == 8) { // 8 tap.
1260*09537850SAkhilesh Sanikop SetupTaps<8>(&v_filter, taps_256);
1261*09537850SAkhilesh Sanikop if (width == 8) {
1262*09537850SAkhilesh Sanikop FilterVertical8xH<8, /*is_compound=*/true>(
1263*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1264*09537850SAkhilesh Sanikop } else if (width == 16) {
1265*09537850SAkhilesh Sanikop FilterVertical16xH<8, /*is_compound=*/true>(
1266*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1267*09537850SAkhilesh Sanikop } else {
1268*09537850SAkhilesh Sanikop FilterVertical32xH<8, /*is_compound=*/true>(
1269*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1270*09537850SAkhilesh Sanikop }
1271*09537850SAkhilesh Sanikop } else if (vertical_taps == 2) { // 2 tap.
1272*09537850SAkhilesh Sanikop SetupTaps<2>(&v_filter, taps_256);
1273*09537850SAkhilesh Sanikop if (width == 8) {
1274*09537850SAkhilesh Sanikop FilterVertical8xH<2, /*is_compound=*/true>(
1275*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1276*09537850SAkhilesh Sanikop } else if (width == 16) {
1277*09537850SAkhilesh Sanikop FilterVertical16xH<2, /*is_compound=*/true>(
1278*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1279*09537850SAkhilesh Sanikop } else {
1280*09537850SAkhilesh Sanikop FilterVertical32xH<2, /*is_compound=*/true>(
1281*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1282*09537850SAkhilesh Sanikop }
1283*09537850SAkhilesh Sanikop } else { // 4 tap.
1284*09537850SAkhilesh Sanikop SetupTaps<4>(&v_filter, taps_256);
1285*09537850SAkhilesh Sanikop if (width == 8) {
1286*09537850SAkhilesh Sanikop FilterVertical8xH<4, /*is_compound=*/true>(
1287*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1288*09537850SAkhilesh Sanikop } else if (width == 16) {
1289*09537850SAkhilesh Sanikop FilterVertical16xH<4, /*is_compound=*/true>(
1290*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1291*09537850SAkhilesh Sanikop } else {
1292*09537850SAkhilesh Sanikop FilterVertical32xH<4, /*is_compound=*/true>(
1293*09537850SAkhilesh Sanikop src, src_stride, dest, dest_stride, width, height, taps_256);
1294*09537850SAkhilesh Sanikop }
1295*09537850SAkhilesh Sanikop }
1296*09537850SAkhilesh Sanikop } else { // width <= 4
1297*09537850SAkhilesh Sanikop // Use 128 bit code.
1298*09537850SAkhilesh Sanikop __m128i taps[4];
1299*09537850SAkhilesh Sanikop
1300*09537850SAkhilesh Sanikop if (vertical_taps == 6) { // 6 tap.
1301*09537850SAkhilesh Sanikop SetupTaps<6>(&v_filter, taps);
1302*09537850SAkhilesh Sanikop FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest,
1303*09537850SAkhilesh Sanikop dest_stride, height, taps);
1304*09537850SAkhilesh Sanikop } else if (vertical_taps == 8) { // 8 tap.
1305*09537850SAkhilesh Sanikop SetupTaps<8>(&v_filter, taps);
1306*09537850SAkhilesh Sanikop FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest,
1307*09537850SAkhilesh Sanikop dest_stride, height, taps);
1308*09537850SAkhilesh Sanikop } else if (vertical_taps == 2) { // 2 tap.
1309*09537850SAkhilesh Sanikop SetupTaps<2>(&v_filter, taps);
1310*09537850SAkhilesh Sanikop FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest,
1311*09537850SAkhilesh Sanikop dest_stride, height, taps);
1312*09537850SAkhilesh Sanikop } else { // 4 tap.
1313*09537850SAkhilesh Sanikop SetupTaps<4>(&v_filter, taps);
1314*09537850SAkhilesh Sanikop FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest,
1315*09537850SAkhilesh Sanikop dest_stride, height, taps);
1316*09537850SAkhilesh Sanikop }
1317*09537850SAkhilesh Sanikop }
1318*09537850SAkhilesh Sanikop }
1319*09537850SAkhilesh Sanikop
ConvolveHorizontal_AVX2(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)1320*09537850SAkhilesh Sanikop void ConvolveHorizontal_AVX2(
1321*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1322*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
1323*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int horizontal_filter_id,
1324*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
1325*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
1326*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(horizontal_filter_index, width);
1327*09537850SAkhilesh Sanikop // Set |src| to the outermost tap.
1328*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
1329*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1330*09537850SAkhilesh Sanikop
1331*09537850SAkhilesh Sanikop if (width > 2) {
1332*09537850SAkhilesh Sanikop DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
1333*09537850SAkhilesh Sanikop horizontal_filter_id, filter_index);
1334*09537850SAkhilesh Sanikop } else {
1335*09537850SAkhilesh Sanikop // Use non avx2 version for smaller widths.
1336*09537850SAkhilesh Sanikop DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
1337*09537850SAkhilesh Sanikop horizontal_filter_id, filter_index);
1338*09537850SAkhilesh Sanikop }
1339*09537850SAkhilesh Sanikop }
1340*09537850SAkhilesh Sanikop
ConvolveCompoundHorizontal_AVX2(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)1341*09537850SAkhilesh Sanikop void ConvolveCompoundHorizontal_AVX2(
1342*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1343*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
1344*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int horizontal_filter_id,
1345*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
1346*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
1347*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(horizontal_filter_index, width);
1348*09537850SAkhilesh Sanikop // Set |src| to the outermost tap.
1349*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
1350*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1351*09537850SAkhilesh Sanikop // All compound functions output to the predictor buffer with |pred_stride|
1352*09537850SAkhilesh Sanikop // equal to |width|.
1353*09537850SAkhilesh Sanikop assert(pred_stride == width);
1354*09537850SAkhilesh Sanikop // Compound functions start at 4x4.
1355*09537850SAkhilesh Sanikop assert(width >= 4 && height >= 4);
1356*09537850SAkhilesh Sanikop
1357*09537850SAkhilesh Sanikop #ifdef NDEBUG
1358*09537850SAkhilesh Sanikop // Quiet compiler error.
1359*09537850SAkhilesh Sanikop (void)pred_stride;
1360*09537850SAkhilesh Sanikop #endif
1361*09537850SAkhilesh Sanikop
1362*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
1363*09537850SAkhilesh Sanikop src, reference_stride, dest, width, width, height, horizontal_filter_id,
1364*09537850SAkhilesh Sanikop filter_index);
1365*09537850SAkhilesh Sanikop }
1366*09537850SAkhilesh Sanikop
ConvolveCompound2D_AVX2(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)1367*09537850SAkhilesh Sanikop void ConvolveCompound2D_AVX2(
1368*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1369*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
1370*09537850SAkhilesh Sanikop const int vertical_filter_index, const int horizontal_filter_id,
1371*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
1372*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
1373*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
1374*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
1375*09537850SAkhilesh Sanikop const int vertical_taps =
1376*09537850SAkhilesh Sanikop GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
1377*09537850SAkhilesh Sanikop
1378*09537850SAkhilesh Sanikop // The output of the horizontal filter is guaranteed to fit in 16 bits.
1379*09537850SAkhilesh Sanikop alignas(32) uint16_t
1380*09537850SAkhilesh Sanikop intermediate_result[kMaxSuperBlockSizeInPixels *
1381*09537850SAkhilesh Sanikop (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
1382*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
1383*09537850SAkhilesh Sanikop // Quiet msan warnings. Set with random non-zero value to aid in debugging.
1384*09537850SAkhilesh Sanikop memset(intermediate_result, 0x33, sizeof(intermediate_result));
1385*09537850SAkhilesh Sanikop #endif
1386*09537850SAkhilesh Sanikop const int intermediate_height = height + vertical_taps - 1;
1387*09537850SAkhilesh Sanikop
1388*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
1389*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
1390*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
1391*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
1392*09537850SAkhilesh Sanikop src, src_stride, intermediate_result, width, width, intermediate_height,
1393*09537850SAkhilesh Sanikop horizontal_filter_id, horiz_filter_index);
1394*09537850SAkhilesh Sanikop
1395*09537850SAkhilesh Sanikop // Vertical filter.
1396*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1397*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = pred_stride;
1398*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
1399*09537850SAkhilesh Sanikop
1400*09537850SAkhilesh Sanikop const __m128i v_filter =
1401*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
1402*09537850SAkhilesh Sanikop
1403*09537850SAkhilesh Sanikop // Use 256 bits for width > 8.
1404*09537850SAkhilesh Sanikop if (width > 8) {
1405*09537850SAkhilesh Sanikop __m256i taps_256[4];
1406*09537850SAkhilesh Sanikop const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
1407*09537850SAkhilesh Sanikop
1408*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
1409*09537850SAkhilesh Sanikop SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
1410*09537850SAkhilesh Sanikop Filter2DVertical16xH<8, /*is_compound=*/true>(
1411*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps_256);
1412*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
1413*09537850SAkhilesh Sanikop SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
1414*09537850SAkhilesh Sanikop Filter2DVertical16xH<6, /*is_compound=*/true>(
1415*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps_256);
1416*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
1417*09537850SAkhilesh Sanikop SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
1418*09537850SAkhilesh Sanikop Filter2DVertical16xH<4, /*is_compound=*/true>(
1419*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps_256);
1420*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
1421*09537850SAkhilesh Sanikop SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
1422*09537850SAkhilesh Sanikop Filter2DVertical16xH<2, /*is_compound=*/true>(
1423*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps_256);
1424*09537850SAkhilesh Sanikop }
1425*09537850SAkhilesh Sanikop } else { // width <= 8
1426*09537850SAkhilesh Sanikop __m128i taps[4];
1427*09537850SAkhilesh Sanikop // Use 128 bit code.
1428*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
1429*09537850SAkhilesh Sanikop SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
1430*09537850SAkhilesh Sanikop if (width == 4) {
1431*09537850SAkhilesh Sanikop Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
1432*09537850SAkhilesh Sanikop dest_stride, height, taps);
1433*09537850SAkhilesh Sanikop } else {
1434*09537850SAkhilesh Sanikop Filter2DVertical<8, /*is_compound=*/true>(
1435*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
1436*09537850SAkhilesh Sanikop }
1437*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
1438*09537850SAkhilesh Sanikop SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
1439*09537850SAkhilesh Sanikop if (width == 4) {
1440*09537850SAkhilesh Sanikop Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
1441*09537850SAkhilesh Sanikop dest_stride, height, taps);
1442*09537850SAkhilesh Sanikop } else {
1443*09537850SAkhilesh Sanikop Filter2DVertical<6, /*is_compound=*/true>(
1444*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
1445*09537850SAkhilesh Sanikop }
1446*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
1447*09537850SAkhilesh Sanikop SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
1448*09537850SAkhilesh Sanikop if (width == 4) {
1449*09537850SAkhilesh Sanikop Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
1450*09537850SAkhilesh Sanikop dest_stride, height, taps);
1451*09537850SAkhilesh Sanikop } else {
1452*09537850SAkhilesh Sanikop Filter2DVertical<4, /*is_compound=*/true>(
1453*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
1454*09537850SAkhilesh Sanikop }
1455*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
1456*09537850SAkhilesh Sanikop SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
1457*09537850SAkhilesh Sanikop if (width == 4) {
1458*09537850SAkhilesh Sanikop Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
1459*09537850SAkhilesh Sanikop dest_stride, height, taps);
1460*09537850SAkhilesh Sanikop } else {
1461*09537850SAkhilesh Sanikop Filter2DVertical<2, /*is_compound=*/true>(
1462*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
1463*09537850SAkhilesh Sanikop }
1464*09537850SAkhilesh Sanikop }
1465*09537850SAkhilesh Sanikop }
1466*09537850SAkhilesh Sanikop }
1467*09537850SAkhilesh Sanikop
Init8bpp()1468*09537850SAkhilesh Sanikop void Init8bpp() {
1469*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1470*09537850SAkhilesh Sanikop assert(dsp != nullptr);
1471*09537850SAkhilesh Sanikop dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
1472*09537850SAkhilesh Sanikop dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2;
1473*09537850SAkhilesh Sanikop dsp->convolve[0][0][1][1] = Convolve2D_AVX2;
1474*09537850SAkhilesh Sanikop
1475*09537850SAkhilesh Sanikop dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2;
1476*09537850SAkhilesh Sanikop dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2;
1477*09537850SAkhilesh Sanikop dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2;
1478*09537850SAkhilesh Sanikop }
1479*09537850SAkhilesh Sanikop
1480*09537850SAkhilesh Sanikop } // namespace
1481*09537850SAkhilesh Sanikop } // namespace low_bitdepth
1482*09537850SAkhilesh Sanikop
ConvolveInit_AVX2()1483*09537850SAkhilesh Sanikop void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
1484*09537850SAkhilesh Sanikop
1485*09537850SAkhilesh Sanikop } // namespace dsp
1486*09537850SAkhilesh Sanikop } // namespace libgav1
1487*09537850SAkhilesh Sanikop
1488*09537850SAkhilesh Sanikop #else // !LIBGAV1_TARGETING_AVX2
1489*09537850SAkhilesh Sanikop namespace libgav1 {
1490*09537850SAkhilesh Sanikop namespace dsp {
1491*09537850SAkhilesh Sanikop
// Fallback for the !LIBGAV1_TARGETING_AVX2 branch: keeps ConvolveInit_AVX2()
// defined but intentionally does nothing.
void ConvolveInit_AVX2() {}
1493*09537850SAkhilesh Sanikop
1494*09537850SAkhilesh Sanikop } // namespace dsp
1495*09537850SAkhilesh Sanikop } // namespace libgav1
1496*09537850SAkhilesh Sanikop #endif // LIBGAV1_TARGETING_AVX2
1497