xref: /aosp_15_r20/external/libaom/third_party/SVT-AV1/convolve_avx2.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include "EbMemory_AVX2.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "EbMemory_SSE4_1.h"
17*77c1e3ccSAndroid Build Coastguard Worker #include "synonyms.h"
18*77c1e3ccSAndroid Build Coastguard Worker 
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/aom_filter.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/convolve_avx2.h"
21*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/mem_sse2.h"
22*77c1e3ccSAndroid Build Coastguard Worker 
// Expands a 128-bit vector of filter taps into the two coefficient vectors of
// a 4-tap filter. The input is replicated into both 128-bit lanes, then each
// output repeats one pair of source bytes across the whole register. When the
// input holds eight 16-bit taps, those bytes are the low bytes of two
// neighbouring taps:
//   coeffs[0]: bytes 4 and 6  -> taps 2 3 2 3 2 3 2 3
//   coeffs[1]: bytes 8 and 10 -> taps 4 5 4 5 4 5 4 5
static inline void populate_coeffs_4tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[2]) {
  const __m256i taps = _mm256_broadcastsi128_si256(coeffs_128);
  const __m256i pick_taps_2_3 = _mm256_set1_epi16(0x0604);
  const __m256i pick_taps_4_5 = _mm256_set1_epi16(0x0a08);

  coeffs[0] = _mm256_shuffle_epi8(taps, pick_taps_2_3);
  coeffs[1] = _mm256_shuffle_epi8(taps, pick_taps_4_5);
}
32*77c1e3ccSAndroid Build Coastguard Worker 
// Expands a 128-bit vector of filter taps into the three coefficient vectors
// of a 6-tap filter. The input is replicated into both 128-bit lanes, then
// each output repeats one pair of source bytes across the whole register.
// When the input holds eight 16-bit taps, those bytes are the low bytes of
// two neighbouring taps:
//   coeffs[0]: bytes 2 and 4   -> taps 1 2 1 2 1 2 1 2
//   coeffs[1]: bytes 6 and 8   -> taps 3 4 3 4 3 4 3 4
//   coeffs[2]: bytes 10 and 12 -> taps 5 6 5 6 5 6 5 6
static inline void populate_coeffs_6tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[3]) {
  const __m256i taps = _mm256_broadcastsi128_si256(coeffs_128);
  const __m256i pick_taps_1_2 = _mm256_set1_epi16(0x0402);
  const __m256i pick_taps_3_4 = _mm256_set1_epi16(0x0806);
  const __m256i pick_taps_5_6 = _mm256_set1_epi16(0x0C0A);

  coeffs[0] = _mm256_shuffle_epi8(taps, pick_taps_1_2);
  coeffs[1] = _mm256_shuffle_epi8(taps, pick_taps_3_4);
  coeffs[2] = _mm256_shuffle_epi8(taps, pick_taps_5_6);
}
44*77c1e3ccSAndroid Build Coastguard Worker 
// Expands a 128-bit vector of filter taps into the four coefficient vectors
// of an 8-tap filter. The input is replicated into both 128-bit lanes, then
// each output repeats one pair of source bytes across the whole register.
// When the input holds eight 16-bit taps, those bytes are the low bytes of
// two neighbouring taps:
//   coeffs[0]: bytes 0 and 2   -> taps 0 1 0 1 0 1 0 1
//   coeffs[1]: bytes 4 and 6   -> taps 2 3 2 3 2 3 2 3
//   coeffs[2]: bytes 8 and 10  -> taps 4 5 4 5 4 5 4 5
//   coeffs[3]: bytes 12 and 14 -> taps 6 7 6 7 6 7 6 7
static inline void populate_coeffs_8tap_avx2(const __m128i coeffs_128,
                                             __m256i coeffs[4]) {
  const __m256i taps = _mm256_broadcastsi128_si256(coeffs_128);
  const __m256i pick_taps_0_1 = _mm256_set1_epi16(0x0200);
  const __m256i pick_taps_2_3 = _mm256_set1_epi16(0x0604);
  const __m256i pick_taps_4_5 = _mm256_set1_epi16(0x0a08);
  const __m256i pick_taps_6_7 = _mm256_set1_epi16(0x0e0c);

  coeffs[0] = _mm256_shuffle_epi8(taps, pick_taps_0_1);
  coeffs[1] = _mm256_shuffle_epi8(taps, pick_taps_2_3);
  coeffs[2] = _mm256_shuffle_epi8(taps, pick_taps_4_5);
  coeffs[3] = _mm256_shuffle_epi8(taps, pick_taps_6_7);
}
58*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vector of a 2-tap filter in a
// 128-bit register.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives one vector whose bytes alternate between the low bytes of
//     the halved taps 3 and 4.
static inline void prepare_half_coeffs_2tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // A single 32-bit load starting at kernel[3] brings in taps 3 and 4; the
  // rest of the register is zero.
  const __m128i taps_16 = _mm_cvtsi32_si128(loadu_int32(kernel + 3));

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m128i halved = _mm_srai_epi16(taps_16, 1);
  const __m128i pick_taps_3_4 = _mm_set1_epi16(0x0200);

  // coeffs 3 4 3 4 3 4 3 4
  *coeffs = _mm_shuffle_epi8(halved, pick_taps_3_4);
}
79*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vectors of a 4-tap filter in
// 128-bit registers.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives two vectors; each alternates between the low bytes of one
//     pair of halved taps (2/3, then 4/5).
static inline void prepare_half_coeffs_4tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_16 = _mm_loadu_si128((const __m128i *)kernel);

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m128i halved = _mm_srai_epi16(taps_16, 1);
  const __m128i pick_taps_2_3 = _mm_set1_epi16(0x0604);
  const __m128i pick_taps_4_5 = _mm_set1_epi16(0x0a08);

  // coeffs 2 3 2 3 2 3 2 3
  coeffs[0] = _mm_shuffle_epi8(halved, pick_taps_2_3);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[1] = _mm_shuffle_epi8(halved, pick_taps_4_5);
}
102*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vectors of a 6-tap filter in
// 128-bit registers.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives three vectors; each alternates between the low bytes of
//     one pair of halved taps (1/2, 3/4, then 5/6).
static inline void prepare_half_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_16 = _mm_loadu_si128((const __m128i *)kernel);

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m128i halved = _mm_srai_epi16(taps_16, 1);
  const __m128i pick_taps_1_2 = _mm_set1_epi16(0x0402);
  const __m128i pick_taps_3_4 = _mm_set1_epi16(0x0806);
  const __m128i pick_taps_5_6 = _mm_set1_epi16(0x0C0A);

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm_shuffle_epi8(halved, pick_taps_1_2);
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm_shuffle_epi8(halved, pick_taps_3_4);
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm_shuffle_epi8(halved, pick_taps_5_6);
}
127*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vectors of an 8-tap filter in
// 128-bit registers.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives four vectors; each alternates between the low bytes of one
//     pair of halved taps (0/1, 2/3, 4/5, then 6/7).
static inline void prepare_half_coeffs_8tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_16 = _mm_loadu_si128((const __m128i *)kernel);

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m128i halved = _mm_srai_epi16(taps_16, 1);
  const __m128i pick_taps_0_1 = _mm_set1_epi16(0x0200);
  const __m128i pick_taps_2_3 = _mm_set1_epi16(0x0604);
  const __m128i pick_taps_4_5 = _mm_set1_epi16(0x0a08);
  const __m128i pick_taps_6_7 = _mm_set1_epi16(0x0e0c);

  // coeffs 0 1 0 1 0 1 0 1
  coeffs[0] = _mm_shuffle_epi8(halved, pick_taps_0_1);
  // coeffs 2 3 2 3 2 3 2 3
  coeffs[1] = _mm_shuffle_epi8(halved, pick_taps_2_3);
  // coeffs 4 5 4 5 4 5 4 5
  coeffs[2] = _mm_shuffle_epi8(halved, pick_taps_4_5);
  // coeffs 6 7 6 7 6 7 6 7
  coeffs[3] = _mm_shuffle_epi8(halved, pick_taps_6_7);
}
154*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vector of a 2-tap filter in a
// 256-bit register.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives one vector whose bytes alternate between the low bytes of
//     the halved taps 3 and 4.
static inline void prepare_half_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // A single 32-bit load starting at kernel[3] brings in taps 3 and 4; the
  // rest of the register is zero.
  const __m128i taps_16 = _mm_cvtsi32_si128(loadu_int32(kernel + 3));
  // Same taps in both 128-bit lanes.
  const __m256i taps_both_lanes = _mm256_broadcastsi128_si256(taps_16);

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m256i halved = _mm256_srai_epi16(taps_both_lanes, 1);
  const __m256i pick_taps_3_4 = _mm256_set1_epi16(0x0200);

  // coeffs 3 4 3 4 3 4 3 4
  *coeffs = _mm256_shuffle_epi8(halved, pick_taps_3_4);
}
176*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vectors of a 4-tap filter in
// 256-bit registers.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives the two vectors produced by populate_coeffs_4tap_avx2()
//     from the halved taps.
static inline void prepare_half_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_16 = _mm_loadu_si128((const __m128i *)kernel);

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m128i halved = _mm_srai_epi16(taps_16, 1);
  populate_coeffs_4tap_avx2(halved, coeffs);
}
194*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vectors of a 6-tap filter in
// 256-bit registers.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives the three vectors produced by populate_coeffs_6tap_avx2()
//     from the halved taps.
static inline void prepare_half_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_16 = _mm_loadu_si128((const __m128i *)kernel);

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m128i halved = _mm_srai_epi16(taps_16, 1);
  populate_coeffs_6tap_avx2(halved, coeffs);
}
212*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the halved, byte-packed coefficient vectors of an 8-tap filter in
// 256-bit registers.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives the four vectors produced by populate_coeffs_8tap_avx2()
//     from the halved taps.
static inline void prepare_half_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_16 = _mm_loadu_si128((const __m128i *)kernel);

  // The taps are halved below to reduce the number of bits they need; the
  // missing factor of two is made up for when the result is rounded. This is
  // lossless only because every tap is even, which is what is checked here.
  assert(_mm_test_all_zeros(taps_16, _mm_set1_epi16(1)));

  const __m128i halved = _mm_srai_epi16(taps_16, 1);
  populate_coeffs_8tap_avx2(halved, coeffs);
}
230*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vector of a 2-tap filter in a 128-bit
// register. Unlike the prepare_half_coeffs_* helpers, the taps are neither
// halved nor narrowed to bytes.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives one vector holding the 16-bit pair (tap 3, tap 4) in every
//     32-bit element.
static inline void prepare_coeffs_2tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [1] */) {
  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  // A single 32-bit load starting at kernel[3] brings in taps 3 and 4.
  const __m128i tap_pair = _mm_cvtsi32_si128(loadu_int32(kernel + 3));

  // coeffs 3 4 3 4 3 4 3 4 (32-bit element 0 copied to every element)
  coeffs[0] = _mm_shuffle_epi32(tap_pair, 0x00);
}
242*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vectors of a 4-tap filter in 128-bit
// registers. Each output repeats one 32-bit element of the loaded kernel,
// i.e. one pair of adjacent 16-bit taps.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives two vectors (tap pairs 2/3 and 4/5).
static inline void prepare_coeffs_4tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [2] */) {
  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps = _mm_loadu_si128((const __m128i *)kernel);

  // coeffs 2 3 2 3 2 3 2 3 (32-bit element 1)
  coeffs[0] = _mm_shuffle_epi32(taps, 0x55);
  // coeffs 4 5 4 5 4 5 4 5 (32-bit element 2)
  coeffs[1] = _mm_shuffle_epi32(taps, 0xaa);
}
256*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vectors of a 6-tap filter in 128-bit
// registers. The wanted tap pairs (1/2, 3/4, 5/6) straddle 32-bit element
// boundaries, so a byte shuffle is used: each mask names the four source
// bytes of one pair and repeats them across the register.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives three vectors (tap pairs 1/2, 3/4 and 5/6).
static inline void prepare_coeffs_6tap_ssse3(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [3] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps = _mm_loadu_si128((const __m128i *)kernel);

  const __m128i pick_taps_1_2 = _mm_set1_epi32(0x05040302);  // bytes 2..5
  const __m128i pick_taps_3_4 = _mm_set1_epi32(0x09080706);  // bytes 6..9
  const __m128i pick_taps_5_6 = _mm_set1_epi32(0x0D0C0B0A);  // bytes 10..13

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm_shuffle_epi8(taps, pick_taps_1_2);
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm_shuffle_epi8(taps, pick_taps_3_4);
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm_shuffle_epi8(taps, pick_taps_5_6);
}
271*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vectors of an 8-tap filter in 128-bit
// registers. Each output repeats one 32-bit element of the loaded kernel,
// i.e. one pair of adjacent 16-bit taps.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives four vectors (tap pairs 0/1, 2/3, 4/5 and 6/7).
static inline void prepare_coeffs_8tap_sse2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m128i *const coeffs /* [4] */) {
  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps = _mm_loadu_si128((const __m128i *)kernel);

  // coeffs 0 1 0 1 0 1 0 1 (32-bit element 0)
  coeffs[0] = _mm_shuffle_epi32(taps, 0x00);
  // coeffs 2 3 2 3 2 3 2 3 (32-bit element 1)
  coeffs[1] = _mm_shuffle_epi32(taps, 0x55);
  // coeffs 4 5 4 5 4 5 4 5 (32-bit element 2)
  coeffs[2] = _mm_shuffle_epi32(taps, 0xaa);
  // coeffs 6 7 6 7 6 7 6 7 (32-bit element 3)
  coeffs[3] = _mm_shuffle_epi32(taps, 0xff);
}
289*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vector of a 2-tap filter in a 256-bit
// register. Unlike the prepare_half_coeffs_* helpers, the taps are neither
// halved nor narrowed to bytes.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives one vector holding the 16-bit pair (tap 3, tap 4) in every
//     32-bit element.
static inline void prepare_coeffs_2tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [1] */) {
  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  // A single 32-bit load starting at kernel[3] brings in taps 3 and 4, which
  // are then copied into both 128-bit lanes.
  const __m128i tap_pair = _mm_cvtsi32_si128(loadu_int32(kernel + 3));
  const __m256i tap_pair_both_lanes = _mm256_broadcastsi128_si256(tap_pair);

  // coeffs 3 4 3 4 3 4 3 4 (32-bit element 0 of each lane copied across it)
  coeffs[0] = _mm256_shuffle_epi32(tap_pair_both_lanes, 0x00);
}
302*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vectors of a 4-tap filter in 256-bit
// registers. The loaded kernel is copied into both 128-bit lanes; each output
// then repeats one 32-bit element, i.e. one pair of adjacent 16-bit taps.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives two vectors (tap pairs 2/3 and 4/5).
static inline void prepare_coeffs_4tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [2] */) {
  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_128 = _mm_loadu_si128((const __m128i *)kernel);
  const __m256i taps = _mm256_broadcastsi128_si256(taps_128);

  // coeffs 2 3 2 3 2 3 2 3 (32-bit element 1)
  coeffs[0] = _mm256_shuffle_epi32(taps, 0x55);
  // coeffs 4 5 4 5 4 5 4 5 (32-bit element 2)
  coeffs[1] = _mm256_shuffle_epi32(taps, 0xaa);
}
317*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vectors of a 6-tap filter in 256-bit
// registers. The loaded kernel is copied into both 128-bit lanes. The wanted
// tap pairs (1/2, 3/4, 5/6) straddle 32-bit element boundaries, so a byte
// shuffle is used: each mask names the four source bytes of one pair and
// repeats them across the register.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives three vectors (tap pairs 1/2, 3/4 and 5/6).
static inline void prepare_coeffs_6tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [3] */) {
  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);
  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_128 = _mm_loadu_si128((const __m128i *)kernel);
  const __m256i taps = _mm256_broadcastsi128_si256(taps_128);

  const __m256i pick_taps_1_2 = _mm256_set1_epi32(0x05040302);  // bytes 2..5
  const __m256i pick_taps_3_4 = _mm256_set1_epi32(0x09080706);  // bytes 6..9
  const __m256i pick_taps_5_6 = _mm256_set1_epi32(0x0D0C0B0A);  // bytes 10..13

  // coeffs 1 2 1 2 1 2 1 2
  coeffs[0] = _mm256_shuffle_epi8(taps, pick_taps_1_2);
  // coeffs 3 4 3 4 3 4 3 4
  coeffs[1] = _mm256_shuffle_epi8(taps, pick_taps_3_4);
  // coeffs 5 6 5 6 5 6 5 6
  coeffs[2] = _mm256_shuffle_epi8(taps, pick_taps_5_6);
}
333*77c1e3ccSAndroid Build Coastguard Worker 
// Prepares the 16-bit coefficient vectors of an 8-tap filter in 256-bit
// registers. The loaded kernel is copied into both 128-bit lanes; each output
// then repeats one 32-bit element, i.e. one pair of adjacent 16-bit taps.
//
// filter_params, subpel_q4: select the kernel through
//     av1_get_interp_filter_subpel_kernel(); only the SUBPEL_MASK bits of
//     subpel_q4 are used.
// coeffs: receives four vectors (tap pairs 0/1, 2/3, 4/5 and 6/7).
static inline void prepare_coeffs_8tap_avx2(
    const InterpFilterParams *const filter_params, const int32_t subpel_q4,
    __m256i *const coeffs /* [4] */) {
  const int16_t *kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_q4 & SUBPEL_MASK);

  // Unaligned 128-bit load: eight 16-bit taps starting at kernel[0].
  const __m128i taps_128 = _mm_loadu_si128((const __m128i *)kernel);
  const __m256i taps = _mm256_broadcastsi128_si256(taps_128);

  // coeffs 0 1 0 1 0 1 0 1 (32-bit element 0)
  coeffs[0] = _mm256_shuffle_epi32(taps, 0x00);
  // coeffs 2 3 2 3 2 3 2 3 (32-bit element 1)
  coeffs[1] = _mm256_shuffle_epi32(taps, 0x55);
  // coeffs 4 5 4 5 4 5 4 5 (32-bit element 2)
  coeffs[2] = _mm256_shuffle_epi32(taps, 0xaa);
  // coeffs 6 7 6 7 6 7 6 7 (32-bit element 3)
  coeffs[3] = _mm256_shuffle_epi32(taps, 0xff);
}
352*77c1e3ccSAndroid Build Coastguard Worker 
// Loads five rows of 16-bit data with one unaligned 256-bit load
// (16 x int16_t) per row.
//   src:    pointer to the first row.
//   stride: distance between consecutive rows, in int16_t elements.
//   dst:    receives rows 0..4.
static inline void load_16bit_5rows_avx2(const int16_t *const src,
                                         const ptrdiff_t stride,
                                         __m256i dst[5]) {
  for (int row = 0; row < 5; ++row) {
    dst[row] = _mm256_loadu_si256((const __m256i *)(src + row * stride));
  }
}
362*77c1e3ccSAndroid Build Coastguard Worker 
// Loads seven rows of 16-bit data with one unaligned 256-bit load
// (16 x int16_t) per row.
//   src:    pointer to the first row.
//   stride: distance between consecutive rows, in int16_t elements.
//   dst:    receives rows 0..6.
static inline void load_16bit_7rows_avx2(const int16_t *const src,
                                         const ptrdiff_t stride,
                                         __m256i dst[7]) {
  for (int row = 0; row < 7; ++row) {
    dst[row] = _mm256_loadu_si256((const __m256i *)(src + row * stride));
  }
}
374*77c1e3ccSAndroid Build Coastguard Worker 
load_16bit_8rows_avx2(const int16_t * const src,const ptrdiff_t stride,__m256i dst[8])375*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void load_16bit_8rows_avx2(const int16_t *const src,
376*77c1e3ccSAndroid Build Coastguard Worker                                                    const ptrdiff_t stride,
377*77c1e3ccSAndroid Build Coastguard Worker                                                    __m256i dst[8]) {
378*77c1e3ccSAndroid Build Coastguard Worker   dst[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
379*77c1e3ccSAndroid Build Coastguard Worker   dst[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
380*77c1e3ccSAndroid Build Coastguard Worker   dst[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
381*77c1e3ccSAndroid Build Coastguard Worker   dst[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
382*77c1e3ccSAndroid Build Coastguard Worker   dst[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
383*77c1e3ccSAndroid Build Coastguard Worker   dst[5] = _mm256_loadu_si256((__m256i *)(src + 5 * stride));
384*77c1e3ccSAndroid Build Coastguard Worker   dst[6] = _mm256_loadu_si256((__m256i *)(src + 6 * stride));
385*77c1e3ccSAndroid Build Coastguard Worker   dst[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
386*77c1e3ccSAndroid Build Coastguard Worker }
387*77c1e3ccSAndroid Build Coastguard Worker 
loadu_unpack_16bit_5rows_avx2(const int16_t * const src,const ptrdiff_t stride,__m256i s_256[5],__m256i ss_256[5],__m256i tt_256[5])388*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void loadu_unpack_16bit_5rows_avx2(
389*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const src, const ptrdiff_t stride, __m256i s_256[5],
390*77c1e3ccSAndroid Build Coastguard Worker     __m256i ss_256[5], __m256i tt_256[5]) {
391*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
392*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
393*77c1e3ccSAndroid Build Coastguard Worker   s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
394*77c1e3ccSAndroid Build Coastguard Worker   s_256[3] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
395*77c1e3ccSAndroid Build Coastguard Worker   s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
396*77c1e3ccSAndroid Build Coastguard Worker 
397*77c1e3ccSAndroid Build Coastguard Worker   ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
398*77c1e3ccSAndroid Build Coastguard Worker   ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
399*77c1e3ccSAndroid Build Coastguard Worker   ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
400*77c1e3ccSAndroid Build Coastguard Worker   ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
401*77c1e3ccSAndroid Build Coastguard Worker 
402*77c1e3ccSAndroid Build Coastguard Worker   tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
403*77c1e3ccSAndroid Build Coastguard Worker   tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[4]);
404*77c1e3ccSAndroid Build Coastguard Worker   tt_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
405*77c1e3ccSAndroid Build Coastguard Worker   tt_256[4] = _mm256_unpackhi_epi16(s_256[3], s_256[4]);
406*77c1e3ccSAndroid Build Coastguard Worker }
407*77c1e3ccSAndroid Build Coastguard Worker 
loadu_unpack_16bit_3rows_avx2(const int16_t * const src,const ptrdiff_t stride,__m256i s_256[3],__m256i ss_256[3],__m256i tt_256[3])408*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void loadu_unpack_16bit_3rows_avx2(
409*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const src, const ptrdiff_t stride, __m256i s_256[3],
410*77c1e3ccSAndroid Build Coastguard Worker     __m256i ss_256[3], __m256i tt_256[3]) {
411*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_loadu_si256((__m256i *)(src + 0 * stride));
412*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
413*77c1e3ccSAndroid Build Coastguard Worker   s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
414*77c1e3ccSAndroid Build Coastguard Worker 
415*77c1e3ccSAndroid Build Coastguard Worker   ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
416*77c1e3ccSAndroid Build Coastguard Worker   ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
417*77c1e3ccSAndroid Build Coastguard Worker 
418*77c1e3ccSAndroid Build Coastguard Worker   tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
419*77c1e3ccSAndroid Build Coastguard Worker   tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
420*77c1e3ccSAndroid Build Coastguard Worker }
421*77c1e3ccSAndroid Build Coastguard Worker 
// Interleaves the 16-bit elements of row pairs (0,1), (2,3) and (4,5) of |s|.
// Low-half interleaves are written to ss[0..2], high-half interleaves to
// ss[4..6]. ss[3] is not written by this function.
static inline void convolve_8tap_unpack_avx2(const __m256i s[6],
                                             __m256i ss[7]) {
  for (int k = 0; k < 3; ++k) {
    ss[k] = _mm256_unpacklo_epi16(s[2 * k], s[2 * k + 1]);
  }
  for (int k = 0; k < 3; ++k) {
    ss[k + 4] = _mm256_unpackhi_epi16(s[2 * k], s[2 * k + 1]);
  }
}
431*77c1e3ccSAndroid Build Coastguard Worker 
// 2-tap filter step on 8-bit data: _mm_maddubs_epi16 multiplies the unsigned
// bytes of ss[0] by the signed bytes of coeffs[0] and sums each adjacent pair
// of products into a saturated signed 16-bit result.
static inline __m128i convolve_2tap_ssse3(const __m128i ss[1],
                                          const __m128i coeffs[1]) {
  const __m128i sum = _mm_maddubs_epi16(ss[0], coeffs[0]);
  return sum;
}
436*77c1e3ccSAndroid Build Coastguard Worker 
// 4-tap filter step on 8-bit data. Each ss[i] (unsigned bytes) is multiplied
// by coeffs[i] (signed bytes) with adjacent products summed to 16 bits; the
// two partial sums are then combined with a wrapping 16-bit add.
static inline __m128i convolve_4tap_ssse3(const __m128i ss[2],
                                          const __m128i coeffs[2]) {
  const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  return _mm_add_epi16(pair0, pair1);
}
443*77c1e3ccSAndroid Build Coastguard Worker 
// 6-tap filter step on 8-bit data. Each ss[i] (unsigned bytes) is multiplied
// by coeffs[i] (signed bytes) with adjacent products summed to 16 bits; the
// three partial sums are combined as (pair0 + pair2) + pair1 using wrapping
// 16-bit adds.
static inline __m128i convolve_6tap_ssse3(const __m128i ss[3],
                                          const __m128i coeffs[3]) {
  const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  return _mm_add_epi16(_mm_add_epi16(pair0, pair2), pair1);
}
452*77c1e3ccSAndroid Build Coastguard Worker 
// 8-tap filter step on 8-bit data. Each ss[i] (unsigned bytes) is multiplied
// by coeffs[i] (signed bytes) with adjacent products summed to 16 bits; the
// four partial sums are combined as (pair0 + pair2) + (pair1 + pair3) using
// wrapping 16-bit adds.
static inline __m128i convolve_8tap_ssse3(const __m128i ss[4],
                                          const __m128i coeffs[4]) {
  const __m128i pair0 = _mm_maddubs_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_maddubs_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_maddubs_epi16(ss[2], coeffs[2]);
  const __m128i pair3 = _mm_maddubs_epi16(ss[3], coeffs[3]);
  const __m128i even = _mm_add_epi16(pair0, pair2);
  const __m128i odd = _mm_add_epi16(pair1, pair3);
  return _mm_add_epi16(even, odd);
}
463*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 2-tap filter step on 8-bit data: _mm256_maddubs_epi16 multiplies the
// unsigned bytes of ss[0] by the signed bytes of coeffs[0] and sums each
// adjacent pair of products into a saturated signed 16-bit result.
static inline __m256i convolve_2tap_avx2(const __m256i ss[1],
                                         const __m256i coeffs[1]) {
  const __m256i sum = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  return sum;
}
468*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 4-tap filter step on 8-bit data. Each ss[i] (unsigned bytes) is
// multiplied by coeffs[i] (signed bytes) with adjacent products summed to
// 16 bits; the two partial sums are combined with a wrapping 16-bit add.
static inline __m256i convolve_4tap_avx2(const __m256i ss[2],
                                         const __m256i coeffs[2]) {
  const __m256i pair0 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i pair1 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  return _mm256_add_epi16(pair0, pair1);
}
475*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 6-tap filter step on 8-bit data. Each ss[i] (unsigned bytes) is
// multiplied by coeffs[i] (signed bytes) with adjacent products summed to
// 16 bits; the three partial sums are combined as (pair0 + pair2) + pair1
// using wrapping 16-bit adds.
static inline __m256i convolve_6tap_avx2(const __m256i ss[3],
                                         const __m256i coeffs[3]) {
  const __m256i pair0 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i pair1 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  const __m256i pair2 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  return _mm256_add_epi16(_mm256_add_epi16(pair0, pair2), pair1);
}
484*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 8-tap filter step on 8-bit data. Each ss[i] (unsigned bytes) is
// multiplied by coeffs[i] (signed bytes) with adjacent products summed to
// 16 bits; the four partial sums are combined as
// (pair0 + pair2) + (pair1 + pair3) using wrapping 16-bit adds.
static inline __m256i convolve_8tap_avx2(const __m256i ss[4],
                                         const __m256i coeffs[4]) {
  const __m256i pair0 = _mm256_maddubs_epi16(ss[0], coeffs[0]);
  const __m256i pair1 = _mm256_maddubs_epi16(ss[1], coeffs[1]);
  const __m256i pair2 = _mm256_maddubs_epi16(ss[2], coeffs[2]);
  const __m256i pair3 = _mm256_maddubs_epi16(ss[3], coeffs[3]);
  const __m256i even = _mm256_add_epi16(pair0, pair2);
  const __m256i odd = _mm256_add_epi16(pair1, pair3);
  return _mm256_add_epi16(even, odd);
}
495*77c1e3ccSAndroid Build Coastguard Worker 
// 2-tap filter step on 16-bit data: _mm_madd_epi16 multiplies the signed
// 16-bit elements of ss[0] and coeffs[0] and sums each adjacent pair of
// products into a 32-bit result.
static inline __m128i convolve16_2tap_sse2(const __m128i ss[1],
                                           const __m128i coeffs[1]) {
  const __m128i sum = _mm_madd_epi16(ss[0], coeffs[0]);
  return sum;
}
500*77c1e3ccSAndroid Build Coastguard Worker 
// 4-tap filter step on 16-bit data. Each ss[i] is multiplied element-wise by
// coeffs[i] (signed 16-bit) with adjacent products summed to 32 bits; the two
// partial sums are then added as 32-bit integers.
static inline __m128i convolve16_4tap_sse2(const __m128i ss[2],
                                           const __m128i coeffs[2]) {
  const __m128i pair0 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_madd_epi16(ss[1], coeffs[1]);
  return _mm_add_epi32(pair0, pair1);
}
507*77c1e3ccSAndroid Build Coastguard Worker 
// 6-tap filter step on 16-bit data. Each ss[i] is multiplied element-wise by
// coeffs[i] (signed 16-bit) with adjacent products summed to 32 bits; the
// three partial sums are combined as (pair0 + pair1) + pair2 in 32 bits.
static inline __m128i convolve16_6tap_sse2(const __m128i ss[3],
                                           const __m128i coeffs[3]) {
  const __m128i pair0 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_madd_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_madd_epi16(ss[2], coeffs[2]);
  return _mm_add_epi32(_mm_add_epi32(pair0, pair1), pair2);
}
516*77c1e3ccSAndroid Build Coastguard Worker 
// 8-tap filter step on 16-bit data. Each ss[i] is multiplied element-wise by
// coeffs[i] (signed 16-bit) with adjacent products summed to 32 bits; the four
// partial sums are combined as (pair0 + pair1) + (pair2 + pair3) in 32 bits.
static inline __m128i convolve16_8tap_sse2(const __m128i ss[4],
                                           const __m128i coeffs[4]) {
  const __m128i pair0 = _mm_madd_epi16(ss[0], coeffs[0]);
  const __m128i pair1 = _mm_madd_epi16(ss[1], coeffs[1]);
  const __m128i pair2 = _mm_madd_epi16(ss[2], coeffs[2]);
  const __m128i pair3 = _mm_madd_epi16(ss[3], coeffs[3]);
  const __m128i first_half = _mm_add_epi32(pair0, pair1);
  const __m128i second_half = _mm_add_epi32(pair2, pair3);
  return _mm_add_epi32(first_half, second_half);
}
527*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 2-tap filter step on 16-bit data: _mm256_madd_epi16 multiplies the
// signed 16-bit elements of ss[0] and coeffs[0] and sums each adjacent pair of
// products into a 32-bit result.
static inline __m256i convolve16_2tap_avx2(const __m256i ss[1],
                                           const __m256i coeffs[1]) {
  const __m256i sum = _mm256_madd_epi16(ss[0], coeffs[0]);
  return sum;
}
532*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 4-tap filter step on 16-bit data. Each ss[i] is multiplied
// element-wise by coeffs[i] (signed 16-bit) with adjacent products summed to
// 32 bits; the two partial sums are then added as 32-bit integers.
static inline __m256i convolve16_4tap_avx2(const __m256i ss[2],
                                           const __m256i coeffs[2]) {
  const __m256i pair0 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i pair1 = _mm256_madd_epi16(ss[1], coeffs[1]);
  return _mm256_add_epi32(pair0, pair1);
}
539*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 6-tap filter step on 16-bit data. Each ss[i] is multiplied
// element-wise by coeffs[i] (signed 16-bit) with adjacent products summed to
// 32 bits; the three partial sums are combined as (pair0 + pair1) + pair2.
static inline __m256i convolve16_6tap_avx2(const __m256i ss[3],
                                           const __m256i coeffs[3]) {
  const __m256i pair0 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i pair1 = _mm256_madd_epi16(ss[1], coeffs[1]);
  const __m256i pair2 = _mm256_madd_epi16(ss[2], coeffs[2]);
  return _mm256_add_epi32(_mm256_add_epi32(pair0, pair1), pair2);
}
548*77c1e3ccSAndroid Build Coastguard Worker 
// 256-bit 8-tap filter step on 16-bit data. Each ss[i] is multiplied
// element-wise by coeffs[i] (signed 16-bit) with adjacent products summed to
// 32 bits; the four partial sums are combined as
// (pair0 + pair1) + (pair2 + pair3).
static inline __m256i convolve16_8tap_avx2(const __m256i ss[4],
                                           const __m256i coeffs[4]) {
  const __m256i pair0 = _mm256_madd_epi16(ss[0], coeffs[0]);
  const __m256i pair1 = _mm256_madd_epi16(ss[1], coeffs[1]);
  const __m256i pair2 = _mm256_madd_epi16(ss[2], coeffs[2]);
  const __m256i pair3 = _mm256_madd_epi16(ss[3], coeffs[3]);
  const __m256i first_half = _mm256_add_epi32(pair0, pair1);
  const __m256i second_half = _mm256_add_epi32(pair2, pair3);
  return _mm256_add_epi32(first_half, second_half);
}
559*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_4tap_avx2(const __m256i data,const __m256i coeffs[2],const __m256i filt[2])560*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i x_convolve_4tap_avx2(const __m256i data,
561*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i coeffs[2],
562*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i filt[2]) {
563*77c1e3ccSAndroid Build Coastguard Worker   __m256i ss[2];
564*77c1e3ccSAndroid Build Coastguard Worker 
565*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm256_shuffle_epi8(data, filt[0]);
566*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm256_shuffle_epi8(data, filt[1]);
567*77c1e3ccSAndroid Build Coastguard Worker 
568*77c1e3ccSAndroid Build Coastguard Worker   return convolve_4tap_avx2(ss, coeffs);
569*77c1e3ccSAndroid Build Coastguard Worker }
570*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_6tap_avx2(const __m256i data,const __m256i coeffs[3],const __m256i filt[3])571*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i x_convolve_6tap_avx2(const __m256i data,
572*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i coeffs[3],
573*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i filt[3]) {
574*77c1e3ccSAndroid Build Coastguard Worker   __m256i ss[3];
575*77c1e3ccSAndroid Build Coastguard Worker 
576*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm256_shuffle_epi8(data, filt[0]);
577*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm256_shuffle_epi8(data, filt[1]);
578*77c1e3ccSAndroid Build Coastguard Worker   ss[2] = _mm256_shuffle_epi8(data, filt[2]);
579*77c1e3ccSAndroid Build Coastguard Worker 
580*77c1e3ccSAndroid Build Coastguard Worker   return convolve_6tap_avx2(ss, coeffs);
581*77c1e3ccSAndroid Build Coastguard Worker }
582*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_8tap_avx2(const __m256i data,const __m256i coeffs[4],const __m256i filt[4])583*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i x_convolve_8tap_avx2(const __m256i data,
584*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i coeffs[4],
585*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i filt[4]) {
586*77c1e3ccSAndroid Build Coastguard Worker   __m256i ss[4];
587*77c1e3ccSAndroid Build Coastguard Worker 
588*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm256_shuffle_epi8(data, filt[0]);
589*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm256_shuffle_epi8(data, filt[1]);
590*77c1e3ccSAndroid Build Coastguard Worker   ss[2] = _mm256_shuffle_epi8(data, filt[2]);
591*77c1e3ccSAndroid Build Coastguard Worker   ss[3] = _mm256_shuffle_epi8(data, filt[3]);
592*77c1e3ccSAndroid Build Coastguard Worker 
593*77c1e3ccSAndroid Build Coastguard Worker   return convolve_8tap_avx2(ss, coeffs);
594*77c1e3ccSAndroid Build Coastguard Worker }
595*77c1e3ccSAndroid Build Coastguard Worker 
sr_y_round_avx2(const __m256i src)596*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i sr_y_round_avx2(const __m256i src) {
597*77c1e3ccSAndroid Build Coastguard Worker   const __m256i round = _mm256_set1_epi16(32);
598*77c1e3ccSAndroid Build Coastguard Worker   const __m256i dst = _mm256_add_epi16(src, round);
599*77c1e3ccSAndroid Build Coastguard Worker   return _mm256_srai_epi16(dst, FILTER_BITS - 1);
600*77c1e3ccSAndroid Build Coastguard Worker }
601*77c1e3ccSAndroid Build Coastguard Worker 
// Rounds eight signed 16-bit values to (x + 2) >> 2, using an arithmetic
// shift.
static inline __m128i xy_x_round_sse2(const __m128i src) {
  const __m128i biased = _mm_add_epi16(src, _mm_set1_epi16(2));
  return _mm_srai_epi16(biased, 2);
}
607*77c1e3ccSAndroid Build Coastguard Worker 
// Rounds sixteen signed 16-bit values to (x + 2) >> 2, using an arithmetic
// shift.
static inline __m256i xy_x_round_avx2(const __m256i src) {
  const __m256i biased = _mm256_add_epi16(src, _mm256_set1_epi16(2));
  return _mm256_srai_epi16(biased, 2);
}
613*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_round_store_2x2_sse2(const __m128i res,int16_t * const dst)614*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_round_store_2x2_sse2(const __m128i res,
615*77c1e3ccSAndroid Build Coastguard Worker                                              int16_t *const dst) {
616*77c1e3ccSAndroid Build Coastguard Worker   const __m128i d = xy_x_round_sse2(res);
617*77c1e3ccSAndroid Build Coastguard Worker   _mm_storel_epi64((__m128i *)dst, d);
618*77c1e3ccSAndroid Build Coastguard Worker }
619*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_round_store_4x2_sse2(const __m128i res,int16_t * const dst)620*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_round_store_4x2_sse2(const __m128i res,
621*77c1e3ccSAndroid Build Coastguard Worker                                              int16_t *const dst) {
622*77c1e3ccSAndroid Build Coastguard Worker   const __m128i d = xy_x_round_sse2(res);
623*77c1e3ccSAndroid Build Coastguard Worker   _mm_storeu_si128((__m128i *)dst, d);
624*77c1e3ccSAndroid Build Coastguard Worker }
625*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_round_store_8x2_sse2(const __m128i res[2],int16_t * const dst)626*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_round_store_8x2_sse2(const __m128i res[2],
627*77c1e3ccSAndroid Build Coastguard Worker                                              int16_t *const dst) {
628*77c1e3ccSAndroid Build Coastguard Worker   __m128i r[2];
629*77c1e3ccSAndroid Build Coastguard Worker 
630*77c1e3ccSAndroid Build Coastguard Worker   r[0] = xy_x_round_sse2(res[0]);
631*77c1e3ccSAndroid Build Coastguard Worker   r[1] = xy_x_round_sse2(res[1]);
632*77c1e3ccSAndroid Build Coastguard Worker   _mm_storeu_si128((__m128i *)dst, r[0]);
633*77c1e3ccSAndroid Build Coastguard Worker   _mm_storeu_si128((__m128i *)(dst + 8), r[1]);
634*77c1e3ccSAndroid Build Coastguard Worker }
635*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_round_store_8x2_avx2(const __m256i res,int16_t * const dst)636*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_round_store_8x2_avx2(const __m256i res,
637*77c1e3ccSAndroid Build Coastguard Worker                                              int16_t *const dst) {
638*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d = xy_x_round_avx2(res);
639*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)dst, d);
640*77c1e3ccSAndroid Build Coastguard Worker }
641*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_round_store_32_avx2(const __m256i res[2],int16_t * const dst)642*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_round_store_32_avx2(const __m256i res[2],
643*77c1e3ccSAndroid Build Coastguard Worker                                             int16_t *const dst) {
644*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
645*77c1e3ccSAndroid Build Coastguard Worker 
646*77c1e3ccSAndroid Build Coastguard Worker   r[0] = xy_x_round_avx2(res[0]);
647*77c1e3ccSAndroid Build Coastguard Worker   r[1] = xy_x_round_avx2(res[1]);
648*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d0 =
649*77c1e3ccSAndroid Build Coastguard Worker       _mm256_inserti128_si256(r[0], _mm256_castsi256_si128(r[1]), 1);
650*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d1 =
651*77c1e3ccSAndroid Build Coastguard Worker       _mm256_inserti128_si256(r[1], _mm256_extracti128_si256(r[0], 1), 0);
652*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)dst, d0);
653*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
654*77c1e3ccSAndroid Build Coastguard Worker }
655*77c1e3ccSAndroid Build Coastguard Worker 
// Rounds four signed 32-bit values to (x + 1024) >> 11, using an arithmetic
// shift.
static inline __m128i xy_y_round_sse2(const __m128i src) {
  const __m128i biased = _mm_add_epi32(src, _mm_set1_epi32(1024));
  return _mm_srai_epi32(biased, 11);
}
661*77c1e3ccSAndroid Build Coastguard Worker 
// Rounds eight signed 16-bit values to (x + 16) >> 5, using an arithmetic
// shift.
static inline __m128i xy_y_round_half_pel_sse2(const __m128i src) {
  const __m128i biased = _mm_add_epi16(src, _mm_set1_epi16(16));
  return _mm_srai_epi16(biased, 5);
}
667*77c1e3ccSAndroid Build Coastguard Worker 
// Rounds eight signed 32-bit values to (x + 1024) >> 11, using an arithmetic
// shift.
static inline __m256i xy_y_round_avx2(const __m256i src) {
  const __m256i biased = _mm256_add_epi32(src, _mm256_set1_epi32(1024));
  return _mm256_srai_epi32(biased, 11);
}
673*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_round_16_avx2(const __m256i r[2])674*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i xy_y_round_16_avx2(const __m256i r[2]) {
675*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r0 = xy_y_round_avx2(r[0]);
676*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r1 = xy_y_round_avx2(r[1]);
677*77c1e3ccSAndroid Build Coastguard Worker   return _mm256_packs_epi32(r0, r1);
678*77c1e3ccSAndroid Build Coastguard Worker }
679*77c1e3ccSAndroid Build Coastguard Worker 
// Rounds sixteen signed 16-bit values to (x + 16) >> 5, using an arithmetic
// shift.
static inline __m256i xy_y_round_half_pel_avx2(const __m256i src) {
  const __m256i biased = _mm256_add_epi16(src, _mm256_set1_epi16(16));
  return _mm256_srai_epi16(biased, 5);
}
685*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res` to unsigned-saturated bytes and
// writes a 2x2 block: packed bytes 0-1 go to dst[0..1] and packed bytes 2-3
// go to dst[stride..stride + 1].
//
// The bytes are written individually instead of through an `int16_t *` cast.
// `dst` is a uint8_t pointer, so nothing guarantees 2-byte alignment, and
// storing an int16_t into uint8_t storage is a type-punned access; both are
// undefined behaviour in C. On little-endian x86 the bytes written are
// identical to those of the 16-bit stores.
static inline void pack_store_2x2_sse2(const __m128i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m128i d = _mm_packus_epi16(res, res);
  // The low 32 bits hold the four output bytes: row 0 in bits 0-15, row 1 in
  // bits 16-31.
  const uint32_t px = (uint32_t)_mm_cvtsi128_si32(d);

  dst[0] = (uint8_t)(px & 0xff);
  dst[1] = (uint8_t)((px >> 8) & 0xff);
  dst[stride] = (uint8_t)((px >> 16) & 0xff);
  dst[stride + 1] = (uint8_t)((px >> 24) & 0xff);
}
692*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res` to unsigned-saturated bytes and
// passes the packed vector to store_u8_4x2_sse2() together with `dst` and
// `stride` (presumably a 4-byte store to each of two rows; see that helper).
static inline void pack_store_4x2_sse2(const __m128i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m128i packed_bytes = _mm_packus_epi16(res, res);

  store_u8_4x2_sse2(packed_bytes, dst, stride);
}
698*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res` to unsigned-saturated bytes, then
// stores from the lower 128-bit half to `dst` and from the upper 128-bit half
// to `dst + stride`, each through xx_storel_32().
static inline void pack_store_4x2_avx2(const __m256i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m256i packed = _mm256_packus_epi16(res, res);
  const __m128i row0 = _mm256_castsi256_si128(packed);
  const __m128i row1 = _mm256_extracti128_si256(packed, 1);
  uint8_t *const dst_row1 = dst + stride;

  xx_storel_32(dst, row0);
  xx_storel_32(dst_row1, row1);
}
708*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res` to unsigned-saturated bytes and
// writes two rows of 8 bytes: lanes 0-7 (lower 128-bit half) go to `dst`,
// lanes 8-15 (upper 128-bit half) go to `dst + stride`.
static inline void pack_store_8x2_avx2(const __m256i res, uint8_t *const dst,
                                       const ptrdiff_t stride) {
  const __m256i packed = _mm256_packus_epi16(res, res);
  uint8_t *const dst_row1 = dst + stride;

  _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(packed));
  _mm_storel_epi64((__m128i *)dst_row1, _mm256_extracti128_si256(packed, 1));
}
717*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res0` and `res1` to unsigned-saturated
// bytes and passes the packed vector to storeu_u8_16x2_avx2() with `dst` and
// `stride`. No cross-lane permute is applied after the pack.
static inline void pack_store_16x2_avx2(const __m256i res0, const __m256i res1,
                                        uint8_t *const dst,
                                        const ptrdiff_t stride) {
  const __m256i packed = _mm256_packus_epi16(res0, res1);

  storeu_u8_16x2_avx2(packed, dst, stride);
}
724*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res0` and `res1` to unsigned-saturated
// bytes, swaps the two middle 64-bit quadwords (permute control 0xD8) and
// passes the result to storeu_u8_16x2_avx2() with `dst` and `stride`.
static inline void xy_y_pack_store_16x2_avx2(const __m256i res0,
                                             const __m256i res1,
                                             uint8_t *const dst,
                                             const ptrdiff_t stride) {
  const __m256i packed = _mm256_packus_epi16(res0, res1);
  // 0xD8 selects quadwords 0, 2, 1, 3: this undoes the per-128-bit-lane
  // interleave left by the pack.
  const __m256i reordered = _mm256_permute4x64_epi64(packed, 0xD8);

  storeu_u8_16x2_avx2(reordered, dst, stride);
}
733*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res0` and `res1` to unsigned-saturated
// bytes and stores 32 bytes to `dst` (unaligned store): the 16 bytes from
// `res0` first, then the 16 bytes from `res1`.
static inline void pack_store_32_avx2(const __m256i res0, const __m256i res1,
                                      uint8_t *const dst) {
  const __m256i packed = _mm256_packus_epi16(res0, res1);
  // The pack works per 128-bit lane; swapping the middle quadwords (0xD8)
  // restores res0-then-res1 order.
  const __m256i reordered = _mm256_permute4x64_epi64(packed, 0xD8);

  _mm256_storeu_si256((__m256i *)dst, reordered);
}
740*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_round_store_2x2_sse2(const __m128i res,uint8_t * const dst,const ptrdiff_t stride)741*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_y_round_store_2x2_sse2(const __m128i res,
742*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *const dst,
743*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride) {
744*77c1e3ccSAndroid Build Coastguard Worker   const __m128i r = xy_y_round_sse2(res);
745*77c1e3ccSAndroid Build Coastguard Worker   const __m128i rr = _mm_packs_epi32(r, r);
746*77c1e3ccSAndroid Build Coastguard Worker   pack_store_2x2_sse2(rr, dst, stride);
747*77c1e3ccSAndroid Build Coastguard Worker }
748*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_round_store_4x2_avx2(const __m256i res,uint8_t * const dst,const ptrdiff_t stride)749*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_y_round_store_4x2_avx2(const __m256i res,
750*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *const dst,
751*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride) {
752*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r = xy_y_round_avx2(res);
753*77c1e3ccSAndroid Build Coastguard Worker   const __m256i rr = _mm256_packs_epi32(r, r);
754*77c1e3ccSAndroid Build Coastguard Worker   pack_store_4x2_avx2(rr, dst, stride);
755*77c1e3ccSAndroid Build Coastguard Worker }
756*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res0` and `res1` to unsigned-saturated
// bytes and stores the 32 bytes to `dst` (unaligned store).
//
// Unlike pack_store_32_avx2(), no quadword permute follows the pack (the
// original source kept that step commented out). The bytes are therefore
// written in the per-128-bit-lane order _mm256_packus_epi16 produces:
// res0[0..7], res1[0..7], res0[8..15], res1[8..15].
static inline void xy_y_pack_store_32_avx2(const __m256i res0,
                                           const __m256i res1,
                                           uint8_t *const dst) {
  const __m256i packed = _mm256_packus_epi16(res0, res1);

  _mm256_storeu_si256((__m256i *)dst, packed);
}
764*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_round_store_32_avx2(const __m256i r0[2],const __m256i r1[2],uint8_t * const dst)765*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_y_round_store_32_avx2(const __m256i r0[2],
766*77c1e3ccSAndroid Build Coastguard Worker                                             const __m256i r1[2],
767*77c1e3ccSAndroid Build Coastguard Worker                                             uint8_t *const dst) {
768*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ra = xy_y_round_16_avx2(r0);
769*77c1e3ccSAndroid Build Coastguard Worker   const __m256i rb = xy_y_round_16_avx2(r1);
770*77c1e3ccSAndroid Build Coastguard Worker   xy_y_pack_store_32_avx2(ra, rb, dst);
771*77c1e3ccSAndroid Build Coastguard Worker }
772*77c1e3ccSAndroid Build Coastguard Worker 
// Packs the signed 16-bit lanes of `res0` and `res1` to unsigned-saturated
// bytes and stores the 32 bytes to `dst` (unaligned store). No cross-lane
// permute is applied, so the byte order is res0[0..7], res1[0..7],
// res0[8..15], res1[8..15].
static inline void convolve_store_32_avx2(const __m256i res0,
                                          const __m256i res1,
                                          uint8_t *const dst) {
  const __m256i packed = _mm256_packus_epi16(res0, res1);

  _mm256_storeu_si256((__m256i *)dst, packed);
}
779*77c1e3ccSAndroid Build Coastguard Worker 
// Computes (x + 34) >> 6 on every signed 16-bit lane of `src`, using an
// arithmetic right shift.
// NOTE(review): 34 is not half of 1 << 6; presumably it folds in the rounding
// of an earlier stage -- confirm against the callers.
static inline __m128i sr_x_round_sse2(const __m128i src) {
  const __m128i rounding_offset = _mm_set1_epi16(34);
  return _mm_srai_epi16(_mm_add_epi16(src, rounding_offset), 6);
}
785*77c1e3ccSAndroid Build Coastguard Worker 
// Computes (x + 34) >> 6 on every signed 16-bit lane of `src`, using an
// arithmetic right shift (256-bit counterpart of sr_x_round_sse2()).
// NOTE(review): 34 is not half of 1 << 6; presumably it folds in the rounding
// of an earlier stage -- confirm against the callers.
static inline __m256i sr_x_round_avx2(const __m256i src) {
  const __m256i rounding_offset = _mm256_set1_epi16(34);
  return _mm256_srai_epi16(_mm256_add_epi16(src, rounding_offset), 6);
}
791*77c1e3ccSAndroid Build Coastguard Worker 
sr_y_round_sse2(const __m128i src)792*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i sr_y_round_sse2(const __m128i src) {
793*77c1e3ccSAndroid Build Coastguard Worker   const __m128i round = _mm_set1_epi16(32);
794*77c1e3ccSAndroid Build Coastguard Worker   const __m128i dst = _mm_add_epi16(src, round);
795*77c1e3ccSAndroid Build Coastguard Worker   return _mm_srai_epi16(dst, FILTER_BITS - 1);
796*77c1e3ccSAndroid Build Coastguard Worker }
797*77c1e3ccSAndroid Build Coastguard Worker 
sr_x_round_store_8x2_avx2(const __m256i res,uint8_t * const dst,const ptrdiff_t dst_stride)798*77c1e3ccSAndroid Build Coastguard Worker static inline void sr_x_round_store_8x2_avx2(const __m256i res,
799*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *const dst,
800*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t dst_stride) {
801*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r = sr_x_round_avx2(res);
802*77c1e3ccSAndroid Build Coastguard Worker   pack_store_8x2_avx2(r, dst, dst_stride);
803*77c1e3ccSAndroid Build Coastguard Worker }
804*77c1e3ccSAndroid Build Coastguard Worker 
sr_x_round_store_16x2_avx2(const __m256i res[2],uint8_t * const dst,const ptrdiff_t dst_stride)805*77c1e3ccSAndroid Build Coastguard Worker static inline void sr_x_round_store_16x2_avx2(const __m256i res[2],
806*77c1e3ccSAndroid Build Coastguard Worker                                               uint8_t *const dst,
807*77c1e3ccSAndroid Build Coastguard Worker                                               const ptrdiff_t dst_stride) {
808*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
809*77c1e3ccSAndroid Build Coastguard Worker 
810*77c1e3ccSAndroid Build Coastguard Worker   r[0] = sr_x_round_avx2(res[0]);
811*77c1e3ccSAndroid Build Coastguard Worker   r[1] = sr_x_round_avx2(res[1]);
812*77c1e3ccSAndroid Build Coastguard Worker   pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
813*77c1e3ccSAndroid Build Coastguard Worker }
814*77c1e3ccSAndroid Build Coastguard Worker 
sr_x_round_store_32_avx2(const __m256i res[2],uint8_t * const dst)815*77c1e3ccSAndroid Build Coastguard Worker static inline void sr_x_round_store_32_avx2(const __m256i res[2],
816*77c1e3ccSAndroid Build Coastguard Worker                                             uint8_t *const dst) {
817*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
818*77c1e3ccSAndroid Build Coastguard Worker 
819*77c1e3ccSAndroid Build Coastguard Worker   r[0] = sr_x_round_avx2(res[0]);
820*77c1e3ccSAndroid Build Coastguard Worker   r[1] = sr_x_round_avx2(res[1]);
821*77c1e3ccSAndroid Build Coastguard Worker   convolve_store_32_avx2(r[0], r[1], dst);
822*77c1e3ccSAndroid Build Coastguard Worker }
823*77c1e3ccSAndroid Build Coastguard Worker 
sr_y_round_store_8x2_avx2(const __m256i res,uint8_t * const dst,const ptrdiff_t dst_stride)824*77c1e3ccSAndroid Build Coastguard Worker static inline void sr_y_round_store_8x2_avx2(const __m256i res,
825*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *const dst,
826*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t dst_stride) {
827*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r = sr_y_round_avx2(res);
828*77c1e3ccSAndroid Build Coastguard Worker   pack_store_8x2_avx2(r, dst, dst_stride);
829*77c1e3ccSAndroid Build Coastguard Worker }
830*77c1e3ccSAndroid Build Coastguard Worker 
sr_y_round_store_16x2_avx2(const __m256i res[2],uint8_t * const dst,const ptrdiff_t dst_stride)831*77c1e3ccSAndroid Build Coastguard Worker static inline void sr_y_round_store_16x2_avx2(const __m256i res[2],
832*77c1e3ccSAndroid Build Coastguard Worker                                               uint8_t *const dst,
833*77c1e3ccSAndroid Build Coastguard Worker                                               const ptrdiff_t dst_stride) {
834*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
835*77c1e3ccSAndroid Build Coastguard Worker 
836*77c1e3ccSAndroid Build Coastguard Worker   r[0] = sr_y_round_avx2(res[0]);
837*77c1e3ccSAndroid Build Coastguard Worker   r[1] = sr_y_round_avx2(res[1]);
838*77c1e3ccSAndroid Build Coastguard Worker   pack_store_16x2_avx2(r[0], r[1], dst, dst_stride);
839*77c1e3ccSAndroid Build Coastguard Worker }
840*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 32 bytes from `src` (unaligned), returns them to the caller through
// `*s1`, and stores to `dst` the per-byte rounded-up unsigned average of `s0`
// and the loaded bytes, i.e. (a + b + 1) >> 1.
static inline void sr_y_2tap_32_avg_avx2(const uint8_t *const src,
                                         const __m256i s0, __m256i *const s1,
                                         uint8_t *const dst) {
  const __m256i loaded = _mm256_loadu_si256((const __m256i *)src);

  *s1 = loaded;
  _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu8(s0, loaded));
}
848*77c1e3ccSAndroid Build Coastguard Worker 
// Stores to `dst` the per-byte rounded-up unsigned average, (a + b + 1) >> 1,
// of the 32 bytes at `src` and the 32 bytes at `src + 1`. All 33 bytes
// src[0..32] are read.
static inline void sr_x_2tap_32_avg_avx2(const uint8_t *const src,
                                         uint8_t *const dst) {
  const __m256i current = _mm256_loadu_si256((const __m256i *)src);
  const __m256i shifted_by_one = _mm256_loadu_si256((const __m256i *)(src + 1));

  _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu8(current, shifted_by_one));
}
856*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_2tap_2x2_sse4_1(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[1])857*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i x_convolve_2tap_2x2_sse4_1(const uint8_t *const src,
858*77c1e3ccSAndroid Build Coastguard Worker                                                  const ptrdiff_t stride,
859*77c1e3ccSAndroid Build Coastguard Worker                                                  const __m128i coeffs[1]) {
860*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl =
861*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(0, 1, 1, 2, 4, 5, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0);
862*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s_128 = load_u8_4x2_sse4_1(src, stride);
863*77c1e3ccSAndroid Build Coastguard Worker   const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
864*77c1e3ccSAndroid Build Coastguard Worker   return convolve_2tap_ssse3(&ss, coeffs);
865*77c1e3ccSAndroid Build Coastguard Worker }
866*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_2tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[1])867*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i x_convolve_2tap_4x2_ssse3(const uint8_t *const src,
868*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
869*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[1]) {
870*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl =
871*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
872*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s_128 = load_u8_8x2_sse2(src, stride);
873*77c1e3ccSAndroid Build Coastguard Worker   const __m128i ss = _mm_shuffle_epi8(s_128, sfl);
874*77c1e3ccSAndroid Build Coastguard Worker   return convolve_2tap_ssse3(&ss, coeffs);
875*77c1e3ccSAndroid Build Coastguard Worker }
876*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_2tap_8x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[1],__m128i r[2])877*77c1e3ccSAndroid Build Coastguard Worker static inline void x_convolve_2tap_8x2_ssse3(const uint8_t *const src,
878*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride,
879*77c1e3ccSAndroid Build Coastguard Worker                                              const __m128i coeffs[1],
880*77c1e3ccSAndroid Build Coastguard Worker                                              __m128i r[2]) {
881*77c1e3ccSAndroid Build Coastguard Worker   __m128i ss[2];
882*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s00 = _mm_loadu_si128((__m128i *)src);
883*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s10 = _mm_loadu_si128((__m128i *)(src + stride));
884*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s01 = _mm_srli_si128(s00, 1);
885*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s11 = _mm_srli_si128(s10, 1);
886*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm_unpacklo_epi8(s00, s01);
887*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm_unpacklo_epi8(s10, s11);
888*77c1e3ccSAndroid Build Coastguard Worker 
889*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_2tap_ssse3(&ss[0], coeffs);
890*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_2tap_ssse3(&ss[1], coeffs);
891*77c1e3ccSAndroid Build Coastguard Worker }
892*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_2tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[1])893*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i x_convolve_2tap_8x2_avx2(const uint8_t *const src,
894*77c1e3ccSAndroid Build Coastguard Worker                                                const ptrdiff_t stride,
895*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[1]) {
896*77c1e3ccSAndroid Build Coastguard Worker   __m128i s_128[2][2];
897*77c1e3ccSAndroid Build Coastguard Worker   __m256i s_256[2];
898*77c1e3ccSAndroid Build Coastguard Worker 
899*77c1e3ccSAndroid Build Coastguard Worker   s_128[0][0] = _mm_loadu_si128((__m128i *)src);
900*77c1e3ccSAndroid Build Coastguard Worker   s_128[1][0] = _mm_loadu_si128((__m128i *)(src + stride));
901*77c1e3ccSAndroid Build Coastguard Worker   s_128[0][1] = _mm_srli_si128(s_128[0][0], 1);
902*77c1e3ccSAndroid Build Coastguard Worker   s_128[1][1] = _mm_srli_si128(s_128[1][0], 1);
903*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_setr_m128i(s_128[0][0], s_128[1][0]);
904*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_setr_m128i(s_128[0][1], s_128[1][1]);
905*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
906*77c1e3ccSAndroid Build Coastguard Worker   return convolve_2tap_avx2(&ss, coeffs);
907*77c1e3ccSAndroid Build Coastguard Worker }
908*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_2tap_16x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[1],__m256i r[2])909*77c1e3ccSAndroid Build Coastguard Worker static inline void x_convolve_2tap_16x2_avx2(const uint8_t *const src,
910*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride,
911*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i coeffs[1],
912*77c1e3ccSAndroid Build Coastguard Worker                                              __m256i r[2]) {
913*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s0_256 = loadu_8bit_16x2_avx2(src, stride);
914*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s1_256 = loadu_8bit_16x2_avx2(src + 1, stride);
915*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s0 = _mm256_unpacklo_epi8(s0_256, s1_256);
916*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s1 = _mm256_unpackhi_epi8(s0_256, s1_256);
917*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_2tap_avx2(&s0, coeffs);
918*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_2tap_avx2(&s1, coeffs);
919*77c1e3ccSAndroid Build Coastguard Worker }
920*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_2tap_32_avx2(const uint8_t * const src,const __m256i coeffs[1],__m256i r[2])921*77c1e3ccSAndroid Build Coastguard Worker static inline void x_convolve_2tap_32_avx2(const uint8_t *const src,
922*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i coeffs[1],
923*77c1e3ccSAndroid Build Coastguard Worker                                            __m256i r[2]) {
924*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s0 = _mm256_loadu_si256((__m256i *)src);
925*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s1 = _mm256_loadu_si256((__m256i *)(src + 1));
926*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss0 = _mm256_unpacklo_epi8(s0, s1);
927*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss1 = _mm256_unpackhi_epi8(s0, s1);
928*77c1e3ccSAndroid Build Coastguard Worker 
929*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_2tap_avx2(&ss0, coeffs);
930*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_2tap_avx2(&ss1, coeffs);
931*77c1e3ccSAndroid Build Coastguard Worker }
932*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_4tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[2])933*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i x_convolve_4tap_2x2_ssse3(const uint8_t *const src,
934*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
935*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[2]) {
936*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl0 =
937*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
938*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl1 =
939*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
940*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s = load_u8_8x2_sse2(src, stride);
941*77c1e3ccSAndroid Build Coastguard Worker   __m128i ss[2];
942*77c1e3ccSAndroid Build Coastguard Worker 
943*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm_shuffle_epi8(s, sfl0);
944*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm_shuffle_epi8(s, sfl1);
945*77c1e3ccSAndroid Build Coastguard Worker   return convolve_4tap_ssse3(ss, coeffs);
946*77c1e3ccSAndroid Build Coastguard Worker }
947*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_4tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[2])948*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i x_convolve_4tap_4x2_ssse3(const uint8_t *const src,
949*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
950*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[2]) {
951*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s = load_u8_8x2_sse2(src, stride);
952*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl0 =
953*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12);
954*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl1 =
955*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 10, 11, 11, 12, 12, 13, 13, 14);
956*77c1e3ccSAndroid Build Coastguard Worker   __m128i ss[2];
957*77c1e3ccSAndroid Build Coastguard Worker 
958*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm_shuffle_epi8(s, sfl0);
959*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm_shuffle_epi8(s, sfl1);
960*77c1e3ccSAndroid Build Coastguard Worker   return convolve_4tap_ssse3(ss, coeffs);
961*77c1e3ccSAndroid Build Coastguard Worker }
962*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_4tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[2],const __m256i filt[2])963*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i x_convolve_4tap_8x2_avx2(const uint8_t *const src,
964*77c1e3ccSAndroid Build Coastguard Worker                                                const ptrdiff_t stride,
965*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[2],
966*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i filt[2]) {
967*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
968*77c1e3ccSAndroid Build Coastguard Worker   return x_convolve_4tap_avx2(s_256, coeffs, filt);
969*77c1e3ccSAndroid Build Coastguard Worker }
970*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_4tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[2],const __m256i filt[2],__m256i r[2])971*77c1e3ccSAndroid Build Coastguard Worker static inline void x_convolve_4tap_16x2_avx2(const uint8_t *const src,
972*77c1e3ccSAndroid Build Coastguard Worker                                              const int32_t src_stride,
973*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i coeffs[2],
974*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i filt[2],
975*77c1e3ccSAndroid Build Coastguard Worker                                              __m256i r[2]) {
976*77c1e3ccSAndroid Build Coastguard Worker   r[0] = x_convolve_4tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
977*77c1e3ccSAndroid Build Coastguard Worker   r[1] = x_convolve_4tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
978*77c1e3ccSAndroid Build Coastguard Worker }
979*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_4tap_32_avx2(const uint8_t * const src,const __m256i coeffs[2],const __m256i filt[2],__m256i r[2])980*77c1e3ccSAndroid Build Coastguard Worker static inline void x_convolve_4tap_32_avx2(const uint8_t *const src,
981*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i coeffs[2],
982*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i filt[2],
983*77c1e3ccSAndroid Build Coastguard Worker                                            __m256i r[2]) {
984*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
985*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
986*77c1e3ccSAndroid Build Coastguard Worker 
987*77c1e3ccSAndroid Build Coastguard Worker   r[0] = x_convolve_4tap_avx2(s0_256, coeffs, filt);
988*77c1e3ccSAndroid Build Coastguard Worker   r[1] = x_convolve_4tap_avx2(s1_256, coeffs, filt);
989*77c1e3ccSAndroid Build Coastguard Worker }
990*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_6tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[3])991*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i x_convolve_6tap_2x2_ssse3(const uint8_t *const src,
992*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
993*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[3]) {
994*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl0 =
995*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
996*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl1 =
997*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
998*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl2 =
999*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1000*77c1e3ccSAndroid Build Coastguard Worker 
1001*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s = load_u8_8x2_sse2(src, stride);
1002*77c1e3ccSAndroid Build Coastguard Worker   __m128i ss[3];
1003*77c1e3ccSAndroid Build Coastguard Worker 
1004*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm_shuffle_epi8(s, sfl0);
1005*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm_shuffle_epi8(s, sfl1);
1006*77c1e3ccSAndroid Build Coastguard Worker   ss[2] = _mm_shuffle_epi8(s, sfl2);
1007*77c1e3ccSAndroid Build Coastguard Worker   return convolve_6tap_ssse3(ss, coeffs);
1008*77c1e3ccSAndroid Build Coastguard Worker }
1009*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_6tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[3])1010*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i x_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1011*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1012*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[3]) {
1013*77c1e3ccSAndroid Build Coastguard Worker   const __m128i s = load_u8_8x2_sse2(src, stride);
1014*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl0 =
1015*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(0, 1, 1, 2, 8, 9, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0);
1016*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl1 =
1017*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(2, 3, 3, 4, 10, 11, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0);
1018*77c1e3ccSAndroid Build Coastguard Worker   const __m128i sfl2 =
1019*77c1e3ccSAndroid Build Coastguard Worker       _mm_setr_epi8(4, 5, 5, 6, 12, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0);
1020*77c1e3ccSAndroid Build Coastguard Worker   __m128i ss[3];
1021*77c1e3ccSAndroid Build Coastguard Worker 
1022*77c1e3ccSAndroid Build Coastguard Worker   ss[0] = _mm_shuffle_epi8(s, sfl0);
1023*77c1e3ccSAndroid Build Coastguard Worker   ss[1] = _mm_shuffle_epi8(s, sfl1);
1024*77c1e3ccSAndroid Build Coastguard Worker   ss[2] = _mm_shuffle_epi8(s, sfl2);
1025*77c1e3ccSAndroid Build Coastguard Worker   return convolve_6tap_ssse3(ss, coeffs);
1026*77c1e3ccSAndroid Build Coastguard Worker }
1027*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_6tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[3],const __m256i filt[3])1028*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i x_convolve_6tap_8x2_avx2(const uint8_t *const src,
1029*77c1e3ccSAndroid Build Coastguard Worker                                                const ptrdiff_t stride,
1030*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[3],
1031*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i filt[3]) {
1032*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1033*77c1e3ccSAndroid Build Coastguard Worker   return x_convolve_6tap_avx2(s_256, coeffs, filt);
1034*77c1e3ccSAndroid Build Coastguard Worker }
1035*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_6tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[3],const __m256i filt[3],__m256i r[2])1036*77c1e3ccSAndroid Build Coastguard Worker static inline void x_convolve_6tap_16x2_avx2(const uint8_t *const src,
1037*77c1e3ccSAndroid Build Coastguard Worker                                              const int32_t src_stride,
1038*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i coeffs[3],
1039*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i filt[3],
1040*77c1e3ccSAndroid Build Coastguard Worker                                              __m256i r[2]) {
1041*77c1e3ccSAndroid Build Coastguard Worker   r[0] = x_convolve_6tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1042*77c1e3ccSAndroid Build Coastguard Worker   r[1] = x_convolve_6tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1043*77c1e3ccSAndroid Build Coastguard Worker }
1044*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_6tap_32_avx2(const uint8_t * const src,const __m256i coeffs[3],const __m256i filt[3],__m256i r[2])1045*77c1e3ccSAndroid Build Coastguard Worker static inline void x_convolve_6tap_32_avx2(const uint8_t *const src,
1046*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i coeffs[3],
1047*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i filt[3],
1048*77c1e3ccSAndroid Build Coastguard Worker                                            __m256i r[2]) {
1049*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1050*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1051*77c1e3ccSAndroid Build Coastguard Worker 
1052*77c1e3ccSAndroid Build Coastguard Worker   r[0] = x_convolve_6tap_avx2(s0_256, coeffs, filt);
1053*77c1e3ccSAndroid Build Coastguard Worker   r[1] = x_convolve_6tap_avx2(s1_256, coeffs, filt);
1054*77c1e3ccSAndroid Build Coastguard Worker }
1055*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_8tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[4],const __m256i filt[4])1056*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i x_convolve_8tap_8x2_avx2(const uint8_t *const src,
1057*77c1e3ccSAndroid Build Coastguard Worker                                                const ptrdiff_t stride,
1058*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[4],
1059*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i filt[4]) {
1060*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s_256 = loadu_8bit_16x2_avx2(src, stride);
1061*77c1e3ccSAndroid Build Coastguard Worker   return x_convolve_8tap_avx2(s_256, coeffs, filt);
1062*77c1e3ccSAndroid Build Coastguard Worker }
1063*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_8tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[4],const __m256i filt[4],__m256i r[2])1064*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void x_convolve_8tap_16x2_avx2(const uint8_t *const src,
1065*77c1e3ccSAndroid Build Coastguard Worker                                                        const int32_t src_stride,
1066*77c1e3ccSAndroid Build Coastguard Worker                                                        const __m256i coeffs[4],
1067*77c1e3ccSAndroid Build Coastguard Worker                                                        const __m256i filt[4],
1068*77c1e3ccSAndroid Build Coastguard Worker                                                        __m256i r[2]) {
1069*77c1e3ccSAndroid Build Coastguard Worker   r[0] = x_convolve_8tap_8x2_avx2(src + 0, src_stride, coeffs, filt);
1070*77c1e3ccSAndroid Build Coastguard Worker   r[1] = x_convolve_8tap_8x2_avx2(src + 8, src_stride, coeffs, filt);
1071*77c1e3ccSAndroid Build Coastguard Worker }
1072*77c1e3ccSAndroid Build Coastguard Worker 
x_convolve_8tap_32_avx2(const uint8_t * const src,const __m256i coeffs[4],const __m256i filt[4],__m256i r[2])1073*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void x_convolve_8tap_32_avx2(const uint8_t *const src,
1074*77c1e3ccSAndroid Build Coastguard Worker                                                      const __m256i coeffs[4],
1075*77c1e3ccSAndroid Build Coastguard Worker                                                      const __m256i filt[4],
1076*77c1e3ccSAndroid Build Coastguard Worker                                                      __m256i r[2]) {
1077*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s0_256 = _mm256_loadu_si256((__m256i *)src);
1078*77c1e3ccSAndroid Build Coastguard Worker   const __m256i s1_256 = _mm256_loadu_si256((__m256i *)(src + 8));
1079*77c1e3ccSAndroid Build Coastguard Worker 
1080*77c1e3ccSAndroid Build Coastguard Worker   r[0] = x_convolve_8tap_avx2(s0_256, coeffs, filt);
1081*77c1e3ccSAndroid Build Coastguard Worker   r[1] = x_convolve_8tap_avx2(s1_256, coeffs, filt);
1082*77c1e3ccSAndroid Build Coastguard Worker }
1083*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_2tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[1],__m128i s_16[2])1084*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_2tap_2x2_ssse3(const uint8_t *const src,
1085*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1086*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[1],
1087*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_16[2]) {
1088*77c1e3ccSAndroid Build Coastguard Worker   __m128i s_128[2];
1089*77c1e3ccSAndroid Build Coastguard Worker 
1090*77c1e3ccSAndroid Build Coastguard Worker   s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src + stride));
1091*77c1e3ccSAndroid Build Coastguard Worker   s_128[0] = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1092*77c1e3ccSAndroid Build Coastguard Worker   s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src + 2 * stride));
1093*77c1e3ccSAndroid Build Coastguard Worker   s_128[1] = _mm_unpacklo_epi16(s_16[1], s_16[0]);
1094*77c1e3ccSAndroid Build Coastguard Worker   const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1095*77c1e3ccSAndroid Build Coastguard Worker   return convolve_2tap_ssse3(&ss, coeffs);
1096*77c1e3ccSAndroid Build Coastguard Worker }
1097*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_2tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[1],__m128i s_32[2])1098*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_2tap_4x2_ssse3(const uint8_t *const src,
1099*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1100*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[1],
1101*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_32[2]) {
1102*77c1e3ccSAndroid Build Coastguard Worker   __m128i s_128[2];
1103*77c1e3ccSAndroid Build Coastguard Worker 
1104*77c1e3ccSAndroid Build Coastguard Worker   s_32[1] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1105*77c1e3ccSAndroid Build Coastguard Worker   s_128[0] = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1106*77c1e3ccSAndroid Build Coastguard Worker   s_32[0] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1107*77c1e3ccSAndroid Build Coastguard Worker   s_128[1] = _mm_unpacklo_epi32(s_32[1], s_32[0]);
1108*77c1e3ccSAndroid Build Coastguard Worker   const __m128i ss = _mm_unpacklo_epi8(s_128[0], s_128[1]);
1109*77c1e3ccSAndroid Build Coastguard Worker   return convolve_2tap_ssse3(&ss, coeffs);
1110*77c1e3ccSAndroid Build Coastguard Worker }
1111*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_2tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[1],__m128i s_64[2])1112*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i y_convolve_2tap_8x2_avx2(const uint8_t *const src,
1113*77c1e3ccSAndroid Build Coastguard Worker                                                const ptrdiff_t stride,
1114*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[1],
1115*77c1e3ccSAndroid Build Coastguard Worker                                                __m128i s_64[2]) {
1116*77c1e3ccSAndroid Build Coastguard Worker   __m256i s_256[2];
1117*77c1e3ccSAndroid Build Coastguard Worker 
1118*77c1e3ccSAndroid Build Coastguard Worker   s_64[1] = _mm_loadl_epi64((__m128i *)(src + stride));
1119*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1120*77c1e3ccSAndroid Build Coastguard Worker   s_64[0] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1121*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_setr_m128i(s_64[1], s_64[0]);
1122*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1123*77c1e3ccSAndroid Build Coastguard Worker   return convolve_2tap_avx2(&ss, coeffs);
1124*77c1e3ccSAndroid Build Coastguard Worker }
1125*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_2tap_16x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[1],__m128i s_128[2],__m256i r[2])1126*77c1e3ccSAndroid Build Coastguard Worker static inline void y_convolve_2tap_16x2_avx2(const uint8_t *const src,
1127*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride,
1128*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i coeffs[1],
1129*77c1e3ccSAndroid Build Coastguard Worker                                              __m128i s_128[2], __m256i r[2]) {
1130*77c1e3ccSAndroid Build Coastguard Worker   __m256i s_256[2];
1131*77c1e3ccSAndroid Build Coastguard Worker 
1132*77c1e3ccSAndroid Build Coastguard Worker   s_128[1] = _mm_loadu_si128((__m128i *)(src + stride));
1133*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1134*77c1e3ccSAndroid Build Coastguard Worker   s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1135*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1136*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss0 = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1137*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss1 = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1138*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_2tap_avx2(&ss0, coeffs);
1139*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_2tap_avx2(&ss1, coeffs);
1140*77c1e3ccSAndroid Build Coastguard Worker }
1141*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_2tap_32_avx2(const uint8_t * const src,const __m256i coeffs[1],const __m256i s0,__m256i * const s1,__m256i r[2])1142*77c1e3ccSAndroid Build Coastguard Worker static inline void y_convolve_2tap_32_avx2(const uint8_t *const src,
1143*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i coeffs[1],
1144*77c1e3ccSAndroid Build Coastguard Worker                                            const __m256i s0, __m256i *const s1,
1145*77c1e3ccSAndroid Build Coastguard Worker                                            __m256i r[2]) {
1146*77c1e3ccSAndroid Build Coastguard Worker   *s1 = _mm256_loadu_si256((__m256i *)src);
1147*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss0 = _mm256_unpacklo_epi8(s0, *s1);
1148*77c1e3ccSAndroid Build Coastguard Worker   const __m256i ss1 = _mm256_unpackhi_epi8(s0, *s1);
1149*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_2tap_avx2(&ss0, coeffs);
1150*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_2tap_avx2(&ss1, coeffs);
1151*77c1e3ccSAndroid Build Coastguard Worker }
1152*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_4tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[2],__m128i s_16[4],__m128i ss_128[2])1153*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_4tap_2x2_ssse3(const uint8_t *const src,
1154*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1155*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[2],
1156*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_16[4],
1157*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i ss_128[2]) {
1158*77c1e3ccSAndroid Build Coastguard Worker   s_16[3] = _mm_cvtsi32_si128(loadu_int16(src + stride));
1159*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1160*77c1e3ccSAndroid Build Coastguard Worker   s_16[2] = _mm_cvtsi32_si128(loadu_int16(src + 2 * stride));
1161*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[2]);
1162*77c1e3ccSAndroid Build Coastguard Worker   ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1163*77c1e3ccSAndroid Build Coastguard Worker   return convolve_4tap_ssse3(ss_128, coeffs);
1164*77c1e3ccSAndroid Build Coastguard Worker }
1165*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_4tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[2],__m128i s_32[4],__m128i ss_128[2])1166*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_4tap_4x2_ssse3(const uint8_t *const src,
1167*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1168*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[2],
1169*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_32[4],
1170*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i ss_128[2]) {
1171*77c1e3ccSAndroid Build Coastguard Worker   s_32[3] = _mm_cvtsi32_si128(loadu_int32(src + stride));
1172*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1173*77c1e3ccSAndroid Build Coastguard Worker   s_32[2] = _mm_cvtsi32_si128(loadu_int32(src + 2 * stride));
1174*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[2]);
1175*77c1e3ccSAndroid Build Coastguard Worker   ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1176*77c1e3ccSAndroid Build Coastguard Worker   return convolve_4tap_ssse3(ss_128, coeffs);
1177*77c1e3ccSAndroid Build Coastguard Worker }
1178*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_4tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[2],__m128i s_64[4],__m256i ss_256[2])1179*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i y_convolve_4tap_8x2_avx2(const uint8_t *const src,
1180*77c1e3ccSAndroid Build Coastguard Worker                                                const ptrdiff_t stride,
1181*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[2],
1182*77c1e3ccSAndroid Build Coastguard Worker                                                __m128i s_64[4],
1183*77c1e3ccSAndroid Build Coastguard Worker                                                __m256i ss_256[2]) {
1184*77c1e3ccSAndroid Build Coastguard Worker   s_64[3] = _mm_loadl_epi64((__m128i *)(src + stride));
1185*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1186*77c1e3ccSAndroid Build Coastguard Worker   s_64[2] = _mm_loadl_epi64((__m128i *)(src + 2 * stride));
1187*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[2]);
1188*77c1e3ccSAndroid Build Coastguard Worker   ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1189*77c1e3ccSAndroid Build Coastguard Worker   return convolve_4tap_avx2(ss_256, coeffs);
1190*77c1e3ccSAndroid Build Coastguard Worker }
1191*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_4tap_16x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[2],__m128i s_128[4],__m256i ss_256[4],__m256i r[2])1192*77c1e3ccSAndroid Build Coastguard Worker static inline void y_convolve_4tap_16x2_avx2(const uint8_t *const src,
1193*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride,
1194*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i coeffs[2],
1195*77c1e3ccSAndroid Build Coastguard Worker                                              __m128i s_128[4],
1196*77c1e3ccSAndroid Build Coastguard Worker                                              __m256i ss_256[4], __m256i r[2]) {
1197*77c1e3ccSAndroid Build Coastguard Worker   s_128[3] = _mm_loadu_si128((__m128i *)(src + stride));
1198*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1199*77c1e3ccSAndroid Build Coastguard Worker   s_128[2] = _mm_loadu_si128((__m128i *)(src + 2 * stride));
1200*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[2]);
1201*77c1e3ccSAndroid Build Coastguard Worker   ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1202*77c1e3ccSAndroid Build Coastguard Worker   ss_256[3] = _mm256_unpackhi_epi8(src23, src34);
1203*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_4tap_avx2(ss_256, coeffs);
1204*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1205*77c1e3ccSAndroid Build Coastguard Worker }
1206*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_6tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[3],__m128i s_16[6],__m128i ss_128[3])1207*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_6tap_2x2_ssse3(const uint8_t *const src,
1208*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1209*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[3],
1210*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_16[6],
1211*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i ss_128[3]) {
1212*77c1e3ccSAndroid Build Coastguard Worker   s_16[5] = _mm_cvtsi32_si128(loadu_int16(src + 3 * stride));
1213*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1214*77c1e3ccSAndroid Build Coastguard Worker   s_16[4] = _mm_cvtsi32_si128(loadu_int16(src + 4 * stride));
1215*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[4]);
1216*77c1e3ccSAndroid Build Coastguard Worker   ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1217*77c1e3ccSAndroid Build Coastguard Worker   return convolve_6tap_ssse3(ss_128, coeffs);
1218*77c1e3ccSAndroid Build Coastguard Worker }
1219*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_4tap_32x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[2],__m256i s_256[4],__m256i ss_256[4],__m256i tt_256[4],__m256i r[4])1220*77c1e3ccSAndroid Build Coastguard Worker static inline void y_convolve_4tap_32x2_avx2(
1221*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[2],
1222*77c1e3ccSAndroid Build Coastguard Worker     __m256i s_256[4], __m256i ss_256[4], __m256i tt_256[4], __m256i r[4]) {
1223*77c1e3ccSAndroid Build Coastguard Worker   s_256[3] = _mm256_loadu_si256((__m256i *)(src + 1 * stride));
1224*77c1e3ccSAndroid Build Coastguard Worker   ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1225*77c1e3ccSAndroid Build Coastguard Worker   ss_256[3] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1226*77c1e3ccSAndroid Build Coastguard Worker   s_256[2] = _mm256_loadu_si256((__m256i *)(src + 2 * stride));
1227*77c1e3ccSAndroid Build Coastguard Worker   tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[2]);
1228*77c1e3ccSAndroid Build Coastguard Worker   tt_256[3] = _mm256_unpackhi_epi8(s_256[3], s_256[2]);
1229*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_4tap_avx2(ss_256 + 0, coeffs);
1230*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_4tap_avx2(ss_256 + 2, coeffs);
1231*77c1e3ccSAndroid Build Coastguard Worker   r[2] = convolve_4tap_avx2(tt_256 + 0, coeffs);
1232*77c1e3ccSAndroid Build Coastguard Worker   r[3] = convolve_4tap_avx2(tt_256 + 2, coeffs);
1233*77c1e3ccSAndroid Build Coastguard Worker }
1234*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_6tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[3],__m128i s_32[6],__m128i ss_128[3])1235*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_6tap_4x2_ssse3(const uint8_t *const src,
1236*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1237*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[3],
1238*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_32[6],
1239*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i ss_128[3]) {
1240*77c1e3ccSAndroid Build Coastguard Worker   s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 3 * stride));
1241*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1242*77c1e3ccSAndroid Build Coastguard Worker   s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 4 * stride));
1243*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1244*77c1e3ccSAndroid Build Coastguard Worker   ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1245*77c1e3ccSAndroid Build Coastguard Worker   return convolve_6tap_ssse3(ss_128, coeffs);
1246*77c1e3ccSAndroid Build Coastguard Worker }
1247*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_6tap_8x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[3],__m128i s_64[6],__m256i ss_256[3])1248*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i y_convolve_6tap_8x2_avx2(const uint8_t *const src,
1249*77c1e3ccSAndroid Build Coastguard Worker                                                const ptrdiff_t stride,
1250*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[3],
1251*77c1e3ccSAndroid Build Coastguard Worker                                                __m128i s_64[6],
1252*77c1e3ccSAndroid Build Coastguard Worker                                                __m256i ss_256[3]) {
1253*77c1e3ccSAndroid Build Coastguard Worker   s_64[5] = _mm_loadl_epi64((__m128i *)(src + 3 * stride));
1254*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1255*77c1e3ccSAndroid Build Coastguard Worker   s_64[4] = _mm_loadl_epi64((__m128i *)(src + 4 * stride));
1256*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[4]);
1257*77c1e3ccSAndroid Build Coastguard Worker   ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1258*77c1e3ccSAndroid Build Coastguard Worker   return convolve_6tap_avx2(ss_256, coeffs);
1259*77c1e3ccSAndroid Build Coastguard Worker }
1260*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_6tap_16x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[3],__m128i s_128[6],__m256i ss_256[6],__m256i r[2])1261*77c1e3ccSAndroid Build Coastguard Worker static inline void y_convolve_6tap_16x2_avx2(const uint8_t *const src,
1262*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride,
1263*77c1e3ccSAndroid Build Coastguard Worker                                              const __m256i coeffs[3],
1264*77c1e3ccSAndroid Build Coastguard Worker                                              __m128i s_128[6],
1265*77c1e3ccSAndroid Build Coastguard Worker                                              __m256i ss_256[6], __m256i r[2]) {
1266*77c1e3ccSAndroid Build Coastguard Worker   s_128[5] = _mm_loadu_si128((__m128i *)(src + 3 * stride));
1267*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1268*77c1e3ccSAndroid Build Coastguard Worker   s_128[4] = _mm_loadu_si128((__m128i *)(src + 4 * stride));
1269*77c1e3ccSAndroid Build Coastguard Worker   const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[4]);
1270*77c1e3ccSAndroid Build Coastguard Worker   ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1271*77c1e3ccSAndroid Build Coastguard Worker   ss_256[5] = _mm256_unpackhi_epi8(src45, src56);
1272*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_6tap_avx2(ss_256, coeffs);
1273*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1274*77c1e3ccSAndroid Build Coastguard Worker }
1275*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_6tap_32x2_avx2(const uint8_t * const src,const ptrdiff_t stride,const __m256i coeffs[3],__m256i s_256[6],__m256i ss_256[6],__m256i tt_256[6],__m256i r[4])1276*77c1e3ccSAndroid Build Coastguard Worker static inline void y_convolve_6tap_32x2_avx2(
1277*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[3],
1278*77c1e3ccSAndroid Build Coastguard Worker     __m256i s_256[6], __m256i ss_256[6], __m256i tt_256[6], __m256i r[4]) {
1279*77c1e3ccSAndroid Build Coastguard Worker   s_256[5] = _mm256_loadu_si256((__m256i *)(src + 3 * stride));
1280*77c1e3ccSAndroid Build Coastguard Worker   ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
1281*77c1e3ccSAndroid Build Coastguard Worker   ss_256[5] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
1282*77c1e3ccSAndroid Build Coastguard Worker   s_256[4] = _mm256_loadu_si256((__m256i *)(src + 4 * stride));
1283*77c1e3ccSAndroid Build Coastguard Worker   tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[4]);
1284*77c1e3ccSAndroid Build Coastguard Worker   tt_256[5] = _mm256_unpackhi_epi8(s_256[5], s_256[4]);
1285*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve_6tap_avx2(ss_256 + 0, coeffs);
1286*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve_6tap_avx2(ss_256 + 3, coeffs);
1287*77c1e3ccSAndroid Build Coastguard Worker   r[2] = convolve_6tap_avx2(tt_256 + 0, coeffs);
1288*77c1e3ccSAndroid Build Coastguard Worker   r[3] = convolve_6tap_avx2(tt_256 + 3, coeffs);
1289*77c1e3ccSAndroid Build Coastguard Worker }
1290*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_8tap_2x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[4],__m128i s_16[8],__m128i ss_128[4])1291*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_8tap_2x2_ssse3(const uint8_t *const src,
1292*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1293*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[4],
1294*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_16[8],
1295*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i ss_128[4]) {
1296*77c1e3ccSAndroid Build Coastguard Worker   s_16[7] = _mm_cvtsi32_si128(loadu_int16(src + 7 * stride));
1297*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src67 = _mm_unpacklo_epi16(s_16[6], s_16[7]);
1298*77c1e3ccSAndroid Build Coastguard Worker   s_16[6] = _mm_cvtsi32_si128(loadu_int16(src + 8 * stride));
1299*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src78 = _mm_unpacklo_epi16(s_16[7], s_16[6]);
1300*77c1e3ccSAndroid Build Coastguard Worker   ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1301*77c1e3ccSAndroid Build Coastguard Worker   return convolve_8tap_ssse3(ss_128, coeffs);
1302*77c1e3ccSAndroid Build Coastguard Worker }
1303*77c1e3ccSAndroid Build Coastguard Worker 
y_convolve_8tap_4x2_ssse3(const uint8_t * const src,const ptrdiff_t stride,const __m128i coeffs[4],__m128i s_32[8],__m128i ss_128[4])1304*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i y_convolve_8tap_4x2_ssse3(const uint8_t *const src,
1305*77c1e3ccSAndroid Build Coastguard Worker                                                 const ptrdiff_t stride,
1306*77c1e3ccSAndroid Build Coastguard Worker                                                 const __m128i coeffs[4],
1307*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i s_32[8],
1308*77c1e3ccSAndroid Build Coastguard Worker                                                 __m128i ss_128[4]) {
1309*77c1e3ccSAndroid Build Coastguard Worker   s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * stride));
1310*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1311*77c1e3ccSAndroid Build Coastguard Worker   s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * stride));
1312*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1313*77c1e3ccSAndroid Build Coastguard Worker   ss_128[3] = _mm_unpacklo_epi8(src67, src78);
1314*77c1e3ccSAndroid Build Coastguard Worker   return convolve_8tap_ssse3(ss_128, coeffs);
1315*77c1e3ccSAndroid Build Coastguard Worker }
1316*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 8 bytes each from src + 7 * stride and src + 8 * stride, combines
// them with the value already held in s_64[6] into two 256-bit vectors
// (low lane / high lane), writes their low byte interleave to ss_256[3] and
// returns convolve_8tap_avx2(ss_256, coeffs).
// On return s_64[7] holds the first load and s_64[6] the second one.
// NOTE(review): s_64[6] and ss_256[0..2] are presumably primed by the caller.
static inline __m256i y_convolve_8tap_8x2_avx2(const uint8_t *const src,
                                               const ptrdiff_t stride,
                                               const __m256i coeffs[4],
                                               __m128i s_64[8],
                                               __m256i ss_256[4]) {
  const __m128i row6 = s_64[6];

  const __m128i row7 = _mm_loadl_epi64((const __m128i *)(src + 7 * stride));
  s_64[7] = row7;
  const __m256i rows_67 = _mm256_setr_m128i(row6, row7);

  const __m128i row8 = _mm_loadl_epi64((const __m128i *)(src + 8 * stride));
  s_64[6] = row8;  // Slot 6 is reused for the newest load.
  const __m256i rows_78 = _mm256_setr_m128i(row7, row8);

  ss_256[3] = _mm256_unpacklo_epi8(rows_67, rows_78);
  return convolve_8tap_avx2(ss_256, coeffs);
}
1329*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 16 bytes each from src + 7 * stride and src + 8 * stride, combines
// them with the value already held in s_128[6] into two 256-bit vectors, and
// stores their low / high byte interleaves in ss_256[3] / ss_256[7].
// r[0] = convolve_8tap_avx2(ss_256, coeffs),
// r[1] = convolve_8tap_avx2(ss_256 + 4, coeffs).
// On return s_128[7] holds the first load and s_128[6] the second one.
// NOTE(review): the remaining ss_256 entries are presumably primed by the
// caller.
static inline void y_convolve_8tap_16x2_avx2(const uint8_t *const src,
                                             const ptrdiff_t stride,
                                             const __m256i coeffs[4],
                                             __m128i s_128[8],
                                             __m256i ss_256[8], __m256i r[2]) {
  const __m128i row6 = s_128[6];

  const __m128i row7 = _mm_loadu_si128((const __m128i *)(src + 7 * stride));
  s_128[7] = row7;
  const __m256i rows_67 = _mm256_setr_m128i(row6, row7);

  const __m128i row8 = _mm_loadu_si128((const __m128i *)(src + 8 * stride));
  s_128[6] = row8;  // Slot 6 is reused for the newest load.
  const __m256i rows_78 = _mm256_setr_m128i(row7, row8);

  ss_256[3] = _mm256_unpacklo_epi8(rows_67, rows_78);
  ss_256[7] = _mm256_unpackhi_epi8(rows_67, rows_78);

  r[0] = convolve_8tap_avx2(&ss_256[0], coeffs);
  r[1] = convolve_8tap_avx2(&ss_256[4], coeffs);
}
1344*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 32 bytes each from src + 7 * stride and src + 8 * stride.
// ss_256[3] / ss_256[7] receive the low / high byte interleave of
// (s_256[6], first load); tt_256[3] / tt_256[7] receive the same interleave
// of (first load, second load).
// r[0], r[1] come from convolve_8tap_avx2 on ss_256 and ss_256 + 4;
// r[2], r[3] from tt_256 and tt_256 + 4.
// On return s_256[7] holds the first load and s_256[6] the second one.
// NOTE(review): the remaining ss_256 / tt_256 entries are presumably primed
// by the caller.
static inline void y_convolve_8tap_32x2_avx2(
    const uint8_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
    __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
  const __m256i row6 = s_256[6];

  const __m256i row7 =
      _mm256_loadu_si256((const __m256i *)(src + 7 * stride));
  s_256[7] = row7;
  ss_256[3] = _mm256_unpacklo_epi8(row6, row7);
  ss_256[7] = _mm256_unpackhi_epi8(row6, row7);

  const __m256i row8 =
      _mm256_loadu_si256((const __m256i *)(src + 8 * stride));
  s_256[6] = row8;  // Slot 6 is reused for the newest load.
  tt_256[3] = _mm256_unpacklo_epi8(row7, row8);
  tt_256[7] = _mm256_unpackhi_epi8(row7, row8);

  r[0] = convolve_8tap_avx2(&ss_256[0], coeffs);
  r[1] = convolve_8tap_avx2(&ss_256[4], coeffs);
  r[2] = convolve_8tap_avx2(&tt_256[0], coeffs);
  r[3] = convolve_8tap_avx2(&tt_256[4], coeffs);
}
1359*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 32 bytes at src and 32 bytes at src + 1, interleaves them bytewise
// (low and high halves of each 128-bit lane) and passes each interleaved
// vector to convolve_2tap_avx2. Results are written to r[0] and r[1].
static inline void xy_x_convolve_2tap_32_avx2(const uint8_t *const src,
                                              const __m256i coeffs[1],
                                              __m256i r[2]) {
  const __m256i cur = _mm256_loadu_si256((const __m256i *)src);
  const __m256i next = _mm256_loadu_si256((const __m256i *)(src + 1));

  __m256i pairs[2];
  pairs[0] = _mm256_unpacklo_epi8(cur, next);
  pairs[1] = _mm256_unpackhi_epi8(cur, next);

  r[0] = convolve_2tap_avx2(&pairs[0], coeffs);
  r[1] = convolve_2tap_avx2(&pairs[1], coeffs);
}
1371*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_2tap_32_avx2(const uint8_t * const src,const __m256i coeffs[1],int16_t * const dst)1372*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_2tap_32_avx2(const uint8_t *const src,
1373*77c1e3ccSAndroid Build Coastguard Worker                                      const __m256i coeffs[1],
1374*77c1e3ccSAndroid Build Coastguard Worker                                      int16_t *const dst) {
1375*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
1376*77c1e3ccSAndroid Build Coastguard Worker 
1377*77c1e3ccSAndroid Build Coastguard Worker   xy_x_convolve_2tap_32_avx2(src, coeffs, r);
1378*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d0 = xy_x_round_avx2(r[0]);
1379*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d1 = xy_x_round_avx2(r[1]);
1380*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)dst, d0);
1381*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1382*77c1e3ccSAndroid Build Coastguard Worker }
1383*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_4tap_32_avx2(const uint8_t * const src,const __m256i coeffs[2],const __m256i filt[2],int16_t * const dst)1384*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_4tap_32_avx2(const uint8_t *const src,
1385*77c1e3ccSAndroid Build Coastguard Worker                                      const __m256i coeffs[2],
1386*77c1e3ccSAndroid Build Coastguard Worker                                      const __m256i filt[2],
1387*77c1e3ccSAndroid Build Coastguard Worker                                      int16_t *const dst) {
1388*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
1389*77c1e3ccSAndroid Build Coastguard Worker 
1390*77c1e3ccSAndroid Build Coastguard Worker   x_convolve_4tap_32_avx2(src, coeffs, filt, r);
1391*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d0 = xy_x_round_avx2(r[0]);
1392*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d1 = xy_x_round_avx2(r[1]);
1393*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)dst, d0);
1394*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1395*77c1e3ccSAndroid Build Coastguard Worker }
1396*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_6tap_32_avx2(const uint8_t * const src,const __m256i coeffs[3],const __m256i filt[3],int16_t * const dst)1397*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_6tap_32_avx2(const uint8_t *const src,
1398*77c1e3ccSAndroid Build Coastguard Worker                                      const __m256i coeffs[3],
1399*77c1e3ccSAndroid Build Coastguard Worker                                      const __m256i filt[3],
1400*77c1e3ccSAndroid Build Coastguard Worker                                      int16_t *const dst) {
1401*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
1402*77c1e3ccSAndroid Build Coastguard Worker 
1403*77c1e3ccSAndroid Build Coastguard Worker   x_convolve_6tap_32_avx2(src, coeffs, filt, r);
1404*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d0 = xy_x_round_avx2(r[0]);
1405*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d1 = xy_x_round_avx2(r[1]);
1406*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)dst, d0);
1407*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1408*77c1e3ccSAndroid Build Coastguard Worker }
1409*77c1e3ccSAndroid Build Coastguard Worker 
xy_x_8tap_32_avx2(const uint8_t * const src,const __m256i coeffs[4],const __m256i filt[4],int16_t * const dst)1410*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_x_8tap_32_avx2(const uint8_t *const src,
1411*77c1e3ccSAndroid Build Coastguard Worker                                      const __m256i coeffs[4],
1412*77c1e3ccSAndroid Build Coastguard Worker                                      const __m256i filt[4],
1413*77c1e3ccSAndroid Build Coastguard Worker                                      int16_t *const dst) {
1414*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
1415*77c1e3ccSAndroid Build Coastguard Worker 
1416*77c1e3ccSAndroid Build Coastguard Worker   x_convolve_8tap_32_avx2(src, coeffs, filt, r);
1417*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d0 = xy_x_round_avx2(r[0]);
1418*77c1e3ccSAndroid Build Coastguard Worker   const __m256i d1 = xy_x_round_avx2(r[1]);
1419*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)dst, d0);
1420*77c1e3ccSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)(dst + 16), d1);
1421*77c1e3ccSAndroid Build Coastguard Worker }
1422*77c1e3ccSAndroid Build Coastguard Worker 
// Loads one 32-bit value each from src + 2 and src + 4, pairs them with the
// value already held in s_32[0], interleaves the two pairs as 16-bit
// elements and returns convolve16_2tap_sse2 of the result.
// On return s_32[1] holds the first load and s_32[0] the second one.
// NOTE(review): s_32[0] is presumably primed by the caller.
static inline __m128i xy_y_convolve_2tap_2x2_sse2(const int16_t *const src,
                                                  __m128i s_32[2],
                                                  const __m128i coeffs[1]) {
  const __m128i row0 = s_32[0];

  const __m128i row1 = _mm_cvtsi32_si128(loadu_int32(src + 2));
  s_32[1] = row1;
  const __m128i rows_01 = _mm_unpacklo_epi32(row0, row1);

  const __m128i row2 = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
  s_32[0] = row2;  // Slot 0 is reused for the newest load.
  const __m128i rows_12 = _mm_unpacklo_epi32(row1, row2);

  const __m128i pairs = _mm_unpacklo_epi16(rows_01, rows_12);
  return convolve16_2tap_sse2(&pairs, coeffs);
}
1435*77c1e3ccSAndroid Build Coastguard Worker 
// Loads one 32-bit value each from src + 2 and src + 4, pairs them with the
// value already held in s_32[0] and returns the 16-bit element-wise sum of
// the two pairs (no coefficients, no rounding).
// On return s_32[1] holds the first load and s_32[0] the second one.
// NOTE(review): s_32[0] is presumably primed by the caller.
static inline __m128i xy_y_convolve_2tap_2x2_half_pel_sse2(
    const int16_t *const src, __m128i s_32[2]) {
  const __m128i row0 = s_32[0];

  const __m128i row1 = _mm_cvtsi32_si128(loadu_int32(src + 2));
  s_32[1] = row1;
  const __m128i rows_01 = _mm_unpacklo_epi32(row0, row1);

  const __m128i row2 = _mm_cvtsi32_si128(loadu_int32(src + 2 * 2));
  s_32[0] = row2;  // Slot 0 is reused for the newest load.
  const __m128i rows_12 = _mm_unpacklo_epi32(row1, row2);

  return _mm_add_epi16(rows_01, rows_12);
}
1446*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 64 bits each from src + 4 and src + 8, pairs them with the value
// already held in s_64[0], interleaves the two pairs as 16-bit elements
// (low and high halves) and writes convolve16_2tap_sse2 of each half to
// r[0] and r[1].
// On return s_64[1] holds the first load and s_64[0] the second one.
// NOTE(review): s_64[0] is presumably primed by the caller.
static inline void xy_y_convolve_2tap_4x2_sse2(const int16_t *const src,
                                               __m128i s_64[2],
                                               const __m128i coeffs[1],
                                               __m128i r[2]) {
  const __m128i row0 = s_64[0];

  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(src + 4));
  s_64[1] = row1;
  const __m128i rows_01 = _mm_unpacklo_epi64(row0, row1);

  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(src + 2 * 4));
  s_64[0] = row2;  // Slot 0 is reused for the newest load.
  const __m128i rows_12 = _mm_unpacklo_epi64(row1, row2);

  const __m128i pairs_lo = _mm_unpacklo_epi16(rows_01, rows_12);
  const __m128i pairs_hi = _mm_unpackhi_epi16(rows_01, rows_12);
  r[0] = convolve16_2tap_sse2(&pairs_lo, coeffs);
  r[1] = convolve16_2tap_sse2(&pairs_hi, coeffs);
}
1462*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 64 bits each from src + 4 and src + 8, pairs them with the value
// already held in s_64[0] and returns the 16-bit element-wise sum of the two
// pairs (no coefficients, no rounding).
// On return s_64[1] holds the first load and s_64[0] the second one.
// NOTE(review): s_64[0] is presumably primed by the caller.
static inline __m128i xy_y_convolve_2tap_4x2_half_pel_sse2(
    const int16_t *const src, __m128i s_64[2]) {
  const __m128i row0 = s_64[0];

  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(src + 4));
  s_64[1] = row1;
  const __m128i rows_01 = _mm_unpacklo_epi64(row0, row1);

  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(src + 2 * 4));
  s_64[0] = row2;  // Slot 0 is reused for the newest load.
  const __m128i rows_12 = _mm_unpacklo_epi64(row1, row2);

  return _mm_add_epi16(rows_01, rows_12);
}
1473*77c1e3ccSAndroid Build Coastguard Worker 
// Interleaves s0 and s1 as 16-bit elements (low and high halves of each
// 128-bit lane) and writes convolve16_2tap_avx2 of each interleaved vector
// to r[0] and r[1].
static inline void xy_y_convolve_2tap_16_avx2(const __m256i s0,
                                              const __m256i s1,
                                              const __m256i coeffs[1],
                                              __m256i r[2]) {
  __m256i pairs[2];
  pairs[0] = _mm256_unpacklo_epi16(s0, s1);
  pairs[1] = _mm256_unpackhi_epi16(s0, s1);

  r[0] = convolve16_2tap_avx2(&pairs[0], coeffs);
  r[1] = convolve16_2tap_avx2(&pairs[1], coeffs);
}
1483*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 128 bits each from src + 8 and src + 16, combines them with the
// value already held in s_128[0] into two 256-bit vectors (low lane / high
// lane) and forwards both to xy_y_convolve_2tap_16_avx2, which fills r.
// On return s_128[1] holds the first load and s_128[0] the second one.
// NOTE(review): s_128[0] is presumably primed by the caller.
static inline void xy_y_convolve_2tap_8x2_avx2(const int16_t *const src,
                                               __m128i s_128[2],
                                               const __m256i coeffs[1],
                                               __m256i r[2]) {
  const __m128i row0 = s_128[0];

  const __m128i row1 = _mm_loadu_si128((const __m128i *)(src + 8));
  s_128[1] = row1;
  const __m256i rows_01 = _mm256_setr_m128i(row0, row1);

  const __m128i row2 = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
  s_128[0] = row2;  // Slot 0 is reused for the newest load.
  const __m256i rows_12 = _mm256_setr_m128i(row1, row2);

  xy_y_convolve_2tap_16_avx2(rows_01, rows_12, coeffs, r);
}
1495*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_2tap_8x2_half_pel_avx2(const int16_t * const src,__m128i s_128[2])1496*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i xy_y_convolve_2tap_8x2_half_pel_avx2(
1497*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const src, __m128i s_128[2]) {
1498*77c1e3ccSAndroid Build Coastguard Worker   __m256i s_256[2];
1499*77c1e3ccSAndroid Build Coastguard Worker   s_128[1] = _mm_loadu_si128((__m128i *)(src + 8));
1500*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_setr_m128i(s_128[0], s_128[1]);
1501*77c1e3ccSAndroid Build Coastguard Worker   s_128[0] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
1502*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_setr_m128i(s_128[1], s_128[0]);
1503*77c1e3ccSAndroid Build Coastguard Worker   return _mm256_add_epi16(s_256[0], s_256[1]);
1504*77c1e3ccSAndroid Build Coastguard Worker }
1505*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 256 bits each from src + 16 and src + 32.
// r[0] = s_256[0] (as passed in) + first load,
// r[1] = first load + second load, both as 16-bit element-wise sums.
// On return s_256[1] holds the first load and s_256[0] the second one.
// NOTE(review): s_256[0] is presumably primed by the caller.
static inline void xy_y_convolve_2tap_16x2_half_pel_avx2(
    const int16_t *const src, __m256i s_256[2], __m256i r[2]) {
  const __m256i row0 = s_256[0];

  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + 16));
  s_256[1] = row1;
  r[0] = _mm256_add_epi16(row0, row1);

  const __m256i row2 = _mm256_loadu_si256((const __m256i *)(src + 2 * 16));
  s_256[0] = row2;  // Slot 0 is reused for the newest load.
  r[1] = _mm256_add_epi16(row1, row2);
}
1513*77c1e3ccSAndroid Build Coastguard Worker 
// Packs r[0] and r[1] from 16-bit to unsigned-saturated 8-bit elements,
// reorders the 64-bit quarters of the packed vector and hands the result to
// storeu_u8_16x2_avx2 together with dst and stride.
static inline void xy_y_store_16x2_avx2(const __m256i r[2], uint8_t *const dst,
                                        const ptrdiff_t stride) {
  const __m256i packed = _mm256_packus_epi16(r[0], r[1]);
  // 0xD8 selects the 64-bit quarters in the order 0, 2, 1, 3, undoing the
  // per-128-bit-lane grouping produced by the pack instruction.
  const __m256i ordered = _mm256_permute4x64_epi64(packed, 0xD8);
  storeu_u8_16x2_avx2(ordered, dst, stride);
}
1520*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 256 bits each from src + 16 and src + 32 and calls
// xy_y_convolve_2tap_16_avx2 twice: on (s[0] as passed in, first load),
// filling r[0..1], and on (first load, second load), filling r[2..3].
// On return s[1] holds the first load and s[0] the second one.
// NOTE(review): s[0] is presumably primed by the caller.
static inline void xy_y_convolve_2tap_16x2_avx2(const int16_t *const src,
                                                __m256i s[2],
                                                const __m256i coeffs[1],
                                                __m256i r[4]) {
  const __m256i row0 = s[0];

  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(src + 16));
  s[1] = row1;
  xy_y_convolve_2tap_16_avx2(row0, row1, coeffs, &r[0]);

  const __m256i row2 = _mm256_loadu_si256((const __m256i *)(src + 2 * 16));
  s[0] = row2;  // Slot 0 is reused for the newest load.
  xy_y_convolve_2tap_16_avx2(row1, row2, coeffs, &r[2]);
}
1530*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 256 bits each from src and src + 16 into s1[0] and s1[1], then calls
// xy_y_convolve_2tap_16_avx2 on (s0[0], s1[0]) for r[0..1] and on
// (s0[1], s1[1]) for r[2..3]. s0 is only read.
static inline void xy_y_convolve_2tap_32_avx2(const int16_t *const src,
                                              const __m256i s0[2],
                                              __m256i s1[2],
                                              const __m256i coeffs[1],
                                              __m256i r[4]) {
  const __m256i next0 = _mm256_loadu_si256((const __m256i *)(src + 0));
  const __m256i next1 = _mm256_loadu_si256((const __m256i *)(src + 16));
  s1[0] = next0;
  s1[1] = next1;

  xy_y_convolve_2tap_16_avx2(s0[0], next0, coeffs, &r[0]);
  xy_y_convolve_2tap_16_avx2(s0[1], next1, coeffs, &r[2]);
}
1541*77c1e3ccSAndroid Build Coastguard Worker 
// Runs xy_y_convolve_2tap_32_avx2 into a local four-vector buffer and passes
// its two halves, together with dst, to xy_y_round_store_32_avx2.
// s1 is overwritten by the convolve call; s0 is only read.
static inline void xy_y_convolve_2tap_32_all_avx2(const int16_t *const src,
                                                  const __m256i s0[2],
                                                  __m256i s1[2],
                                                  const __m256i coeffs[1],
                                                  uint8_t *const dst) {
  __m256i sums[4];

  xy_y_convolve_2tap_32_avx2(src, s0, s1, coeffs, sums);
  xy_y_round_store_32_avx2(&sums[0], &sums[2], dst);
}
1552*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 256 bits each from src and src + 16 into s1[0] and s1[1] and writes
// the 16-bit element-wise sums s0[i] + s1[i] to r[i] for i = 0, 1
// (no coefficients, no rounding). s0 is only read.
static inline void xy_y_convolve_2tap_half_pel_32_avx2(const int16_t *const src,
                                                       const __m256i s0[2],
                                                       __m256i s1[2],
                                                       __m256i r[2]) {
  const __m256i next0 = _mm256_loadu_si256((const __m256i *)(src + 0));
  const __m256i next1 = _mm256_loadu_si256((const __m256i *)(src + 16));
  s1[0] = next0;
  s1[1] = next1;

  r[0] = _mm256_add_epi16(s0[0], next0);
  r[1] = _mm256_add_epi16(s0[1], next1);
}
1562*77c1e3ccSAndroid Build Coastguard Worker 
// Runs xy_y_convolve_2tap_half_pel_32_avx2, applies
// xy_y_round_half_pel_avx2 to both sums and passes the rounded vectors with
// dst to xy_y_pack_store_32_avx2.
// s1 is overwritten by the convolve call; s0 is only read.
static inline void xy_y_convolve_2tap_half_pel_32_all_avx2(
    const int16_t *const src, const __m256i s0[2], __m256i s1[2],
    uint8_t *const dst) {
  __m256i sums[2];
  xy_y_convolve_2tap_half_pel_32_avx2(src, s0, s1, sums);

  const __m256i rounded0 = xy_y_round_half_pel_avx2(sums[0]);
  const __m256i rounded1 = xy_y_round_half_pel_avx2(sums[1]);
  xy_y_pack_store_32_avx2(rounded0, rounded1, dst);
}
1573*77c1e3ccSAndroid Build Coastguard Worker 
// Loads one 32-bit value each from src + 6 and src + 8, pairs them with the
// value already held in s_32[2], writes the 16-bit interleave of the two
// pairs to ss_128[1] and returns convolve16_4tap_sse2(ss_128, coeffs).
// After the convolve, ss_128[1] is copied into ss_128[0].
// On return s_32[3] holds the first load and s_32[2] the second one.
// NOTE(review): s_32[2] and ss_128[0] are presumably primed by the caller.
static inline __m128i xy_y_convolve_4tap_2x2_sse2(const int16_t *const src,
                                                  __m128i s_32[4],
                                                  __m128i ss_128[2],
                                                  const __m128i coeffs[2]) {
  const __m128i row2 = s_32[2];

  const __m128i row3 = _mm_cvtsi32_si128(loadu_int32(src + 3 * 2));
  s_32[3] = row3;
  const __m128i rows_23 = _mm_unpacklo_epi32(row2, row3);

  const __m128i row4 = _mm_cvtsi32_si128(loadu_int32(src + 4 * 2));
  s_32[2] = row4;  // Slot 2 is reused for the newest load.
  const __m128i rows_34 = _mm_unpacklo_epi32(row3, row4);

  ss_128[1] = _mm_unpacklo_epi16(rows_23, rows_34);
  const __m128i result = convolve16_4tap_sse2(ss_128, coeffs);

  // Shift the interleaved state down by one slot.
  ss_128[0] = ss_128[1];
  return result;
}
1587*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 64 bits each from src + 12 and src + 16, combines them with the
// value already held in s_64[2] into two 256-bit vectors (low lane / high
// lane), writes their low 16-bit interleave to ss_256[1] and returns
// convolve16_4tap_avx2(ss_256, coeffs).
// After the convolve, ss_256[1] is copied into ss_256[0].
// On return s_64[3] holds the first load and s_64[2] the second one.
// NOTE(review): s_64[2] and ss_256[0] are presumably primed by the caller.
static inline __m256i xy_y_convolve_4tap_4x2_avx2(const int16_t *const src,
                                                  __m128i s_64[4],
                                                  __m256i ss_256[2],
                                                  const __m256i coeffs[2]) {
  const __m128i row2 = s_64[2];

  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(src + 3 * 4));
  s_64[3] = row3;
  const __m256i rows_23 = _mm256_setr_m128i(row2, row3);

  const __m128i row4 = _mm_loadl_epi64((const __m128i *)(src + 4 * 4));
  s_64[2] = row4;  // Slot 2 is reused for the newest load.
  const __m256i rows_34 = _mm256_setr_m128i(row3, row4);

  ss_256[1] = _mm256_unpacklo_epi16(rows_23, rows_34);
  const __m256i result = convolve16_4tap_avx2(ss_256, coeffs);

  // Shift the interleaved state down by one slot.
  ss_256[0] = ss_256[1];
  return result;
}
1602*77c1e3ccSAndroid Build Coastguard Worker 
// Calls convolve16_4tap_avx2 twice with the same coefficients:
// r[0] from ss and r[1] from ss + 2.
static inline void xy_y_convolve_4tap_16_avx2(const __m256i *const ss,
                                              const __m256i coeffs[2],
                                              __m256i r[2]) {
  for (int half = 0; half < 2; ++half) {
    r[half] = convolve16_4tap_avx2(ss + 2 * half, coeffs);
  }
}
1609*77c1e3ccSAndroid Build Coastguard Worker 
// Performs two overlapping 256-bit loads at src + 16 and src + 24 (offset by
// 8 int16 elements), writes their low / high 16-bit interleaves to
// ss_256[1] / ss_256[3] and calls xy_y_convolve_4tap_16_avx2(ss_256, ...)
// to fill r. Afterwards ss_256[1] and ss_256[3] are copied into ss_256[0]
// and ss_256[2].
// NOTE(review): ss_256[0] and ss_256[2] are presumably primed by the caller.
static inline void xy_y_convolve_4tap_8x2_avx2(const int16_t *const src,
                                               __m256i ss_256[4],
                                               const __m256i coeffs[2],
                                               __m256i r[2]) {
  const __m256i first = _mm256_loadu_si256((const __m256i *)(src + 2 * 8));
  const __m256i second = _mm256_loadu_si256((const __m256i *)(src + 3 * 8));

  ss_256[1] = _mm256_unpacklo_epi16(first, second);
  ss_256[3] = _mm256_unpackhi_epi16(first, second);
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);

  // Shift the interleaved state down by one slot in each half.
  ss_256[0] = ss_256[1];
  ss_256[2] = ss_256[3];
}
1623*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 256 bits each from src + 16 and src + 24 into s_256[2] and s_256[3],
// forms the 16-bit sums s_256[0] + s_256[3] and s_256[1] + s_256[2], and
// passes both sums to xy_y_convolve_2tap_16_avx2 to fill r.
// Afterwards s_256[2] and s_256[3] are copied into s_256[0] and s_256[1].
// NOTE(review): s_256[0] and s_256[1] are presumably primed by the caller.
static inline void xy_y_convolve_4tap_8x2_half_pel_avx2(
    const int16_t *const src, const __m256i coeffs[1], __m256i s_256[4],
    __m256i r[2]) {
  const __m256i next0 = _mm256_loadu_si256((const __m256i *)(src + 2 * 8));
  s_256[2] = next0;
  const __m256i next1 = _mm256_loadu_si256((const __m256i *)(src + 3 * 8));
  s_256[3] = next1;

  const __m256i outer_sum = _mm256_add_epi16(s_256[0], next1);
  const __m256i inner_sum = _mm256_add_epi16(s_256[1], next0);
  xy_y_convolve_2tap_16_avx2(outer_sum, inner_sum, coeffs, r);

  // Shift the newest loads into the two older slots.
  s_256[0] = next0;
  s_256[1] = next1;
}
1636*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 256 bits each from src + 48 and src + 64.
// ss_256[1] / ss_256[3] receive the low / high 16-bit interleave of
// (s_256[2] as passed in, first load); tt_256[1] / tt_256[3] receive the
// same interleave of (first load, second load).
// xy_y_convolve_4tap_16_avx2 fills r[0..1] from ss_256 and r[2..3] from
// tt_256. Afterwards entries 1 and 3 of both arrays are copied into entries
// 0 and 2.
// On return s_256[3] holds the first load and s_256[2] the second one.
// NOTE(review): s_256[2] and entries 0 and 2 of ss_256 / tt_256 are
// presumably primed by the caller.
static inline void xy_y_convolve_4tap_16x2_avx2(
    const int16_t *const src, __m256i s_256[4], __m256i ss_256[4],
    __m256i tt_256[4], const __m256i coeffs[2], __m256i r[4]) {
  const __m256i row2 = s_256[2];

  const __m256i row3 = _mm256_loadu_si256((const __m256i *)(src + 3 * 16));
  s_256[3] = row3;
  ss_256[1] = _mm256_unpacklo_epi16(row2, row3);
  ss_256[3] = _mm256_unpackhi_epi16(row2, row3);

  const __m256i row4 = _mm256_loadu_si256((const __m256i *)(src + 4 * 16));
  s_256[2] = row4;  // Slot 2 is reused for the newest load.
  tt_256[1] = _mm256_unpacklo_epi16(row3, row4);
  tt_256[3] = _mm256_unpackhi_epi16(row3, row4);

  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, &r[0]);
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, &r[2]);

  // Shift the interleaved state down by one slot in each half.
  ss_256[0] = ss_256[1];
  ss_256[2] = ss_256[3];
  tt_256[0] = tt_256[1];
  tt_256[2] = tt_256[3];
}
1653*77c1e3ccSAndroid Build Coastguard Worker 
// Same computation as the 16x2 4-tap step, but the two loads are taken at
// element offsets 3 * stride and 4 * stride instead of fixed multiples of 16.
//
// src    : int16_t input.
// stride : distance, in int16_t elements, between consecutive loads.
// s_256  : only [2] and [3] are used. [2] must be valid on entry; on return
//          [3] holds the load from 3 * stride and [2] the load from
//          4 * stride.
// ss_256 : [0] and [2] must be valid on entry; [1] and [3] are produced
//          here. On return [0] = [1] and [2] = [3].
// tt_256 : same layout as ss_256, for the second output.
// coeffs : forwarded to xy_y_convolve_4tap_16_avx2().
// r      : r[0..1] receive the first output, r[2..3] the second.
//
// NOTE(review): each call processes a single 256-bit (16-lane) column even
// though the name says 32; presumably the caller invokes it once per 16-lane
// half - confirm at the call site.
static inline void xy_y_convolve_4tap_32x2_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[4],
    __m256i ss_256[4], __m256i tt_256[4], const __m256i coeffs[2],
    __m256i r[4]) {
  s_256[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * stride));
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  // s_256[2] is reused for the newest vector; s_256[3] is now the older one.
  s_256[2] = _mm256_loadu_si256((const __m256i *)(src + 4 * stride));
  tt_256[1] = _mm256_unpacklo_epi16(s_256[3], s_256[2]);
  tt_256[3] = _mm256_unpackhi_epi16(s_256[3], s_256[2]);
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);
  xy_y_convolve_4tap_16_avx2(tt_256, coeffs, r + 2);
  // Keep the newest interleaved pairs for the next call.
  ss_256[0] = ss_256[1];
  ss_256[2] = ss_256[3];
  tt_256[0] = tt_256[1];
  tt_256[2] = tt_256[3];
}
1671*77c1e3ccSAndroid Build Coastguard Worker 
// 4-tap half-pel vertical step of the xy path, 16 lanes wide, two outputs
// per call.
//
// src    : int16_t input; 256-bit unaligned loads at element offsets 3 * 16
//          and 4 * 16.
// s_256  : five-entry window. [0], [1] and [2] must be valid on entry; [3]
//          and [4] are loaded here. On return [0] = [2], [1] = [3] and
//          [2] = [4].
// coeffs : forwarded to xy_y_convolve_2tap_16_avx2().
// r      : r[0..1] receive the first output, r[2..3] the second.
//
// For each output the outer and inner window entries are summed so that a
// 2-tap kernel can be applied to the sums.
// NOTE(review): this presumes the half-pel filter is symmetric - confirm.
//
// The identifier has no underscore before "avx2"; the spelling is kept
// because callers reference it by this name.
static inline void xy_y_convolve_4tap_16x2_half_pelavx2(
    const int16_t *const src, __m256i s_256[5], const __m256i coeffs[1],
    __m256i r[4]) {
  __m256i a_256[2];

  s_256[3] = _mm256_loadu_si256((const __m256i *)(src + 3 * 16));
  s_256[4] = _mm256_loadu_si256((const __m256i *)(src + 4 * 16));

  // First output: window entries 0..3.
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[3]);
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[2]);
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 0);

  // Second output: window entries 1..4.
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[3]);
  xy_y_convolve_2tap_16_avx2(a_256[0], a_256[1], coeffs, r + 2);

  // Slide the window forward by two entries.
  s_256[0] = s_256[2];
  s_256[1] = s_256[3];
  s_256[2] = s_256[4];
}
1692*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_6tap_2x2_sse2(const int16_t * const src,__m128i s_32[6],__m128i ss_128[3],const __m128i coeffs[3])1693*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i xy_y_convolve_6tap_2x2_sse2(const int16_t *const src,
1694*77c1e3ccSAndroid Build Coastguard Worker                                                   __m128i s_32[6],
1695*77c1e3ccSAndroid Build Coastguard Worker                                                   __m128i ss_128[3],
1696*77c1e3ccSAndroid Build Coastguard Worker                                                   const __m128i coeffs[3]) {
1697*77c1e3ccSAndroid Build Coastguard Worker   s_32[5] = _mm_cvtsi32_si128(loadu_int32(src + 5 * 2));
1698*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1699*77c1e3ccSAndroid Build Coastguard Worker   s_32[4] = _mm_cvtsi32_si128(loadu_int32(src + 6 * 2));
1700*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[4]);
1701*77c1e3ccSAndroid Build Coastguard Worker   ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1702*77c1e3ccSAndroid Build Coastguard Worker   const __m128i r = convolve16_6tap_sse2(ss_128, coeffs);
1703*77c1e3ccSAndroid Build Coastguard Worker   ss_128[0] = ss_128[1];
1704*77c1e3ccSAndroid Build Coastguard Worker   ss_128[1] = ss_128[2];
1705*77c1e3ccSAndroid Build Coastguard Worker   return r;
1706*77c1e3ccSAndroid Build Coastguard Worker }
1707*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_6tap_4x2_avx2(const int16_t * const src,__m128i s_64[6],__m256i ss_256[3],const __m256i coeffs[3])1708*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i xy_y_convolve_6tap_4x2_avx2(const int16_t *const src,
1709*77c1e3ccSAndroid Build Coastguard Worker                                                   __m128i s_64[6],
1710*77c1e3ccSAndroid Build Coastguard Worker                                                   __m256i ss_256[3],
1711*77c1e3ccSAndroid Build Coastguard Worker                                                   const __m256i coeffs[3]) {
1712*77c1e3ccSAndroid Build Coastguard Worker   __m256i s_256[2];
1713*77c1e3ccSAndroid Build Coastguard Worker   s_64[5] = _mm_loadl_epi64((__m128i *)(src + 5 * 4));
1714*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_setr_m128i(s_64[4], s_64[5]);
1715*77c1e3ccSAndroid Build Coastguard Worker   s_64[4] = _mm_loadl_epi64((__m128i *)(src + 6 * 4));
1716*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_setr_m128i(s_64[5], s_64[4]);
1717*77c1e3ccSAndroid Build Coastguard Worker   ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1718*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r = convolve16_6tap_avx2(ss_256, coeffs);
1719*77c1e3ccSAndroid Build Coastguard Worker   ss_256[0] = ss_256[1];
1720*77c1e3ccSAndroid Build Coastguard Worker   ss_256[1] = ss_256[2];
1721*77c1e3ccSAndroid Build Coastguard Worker   return r;
1722*77c1e3ccSAndroid Build Coastguard Worker }
1723*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_6tap_16_avx2(const __m256i ss[6],const __m256i coeffs[3],__m256i r[2])1724*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_y_convolve_6tap_16_avx2(const __m256i ss[6],
1725*77c1e3ccSAndroid Build Coastguard Worker                                               const __m256i coeffs[3],
1726*77c1e3ccSAndroid Build Coastguard Worker                                               __m256i r[2]) {
1727*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve16_6tap_avx2(ss, coeffs);
1728*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve16_6tap_avx2(ss + 3, coeffs);
1729*77c1e3ccSAndroid Build Coastguard Worker }
1730*77c1e3ccSAndroid Build Coastguard Worker 
// 6-tap vertical step of the xy path, 8-wide, two outputs per call.
//
// src    : int16_t input; two unaligned 256-bit loads at element offsets
//          4 * 8 and 5 * 8 (16 lanes each, overlapping by 8 lanes).
// ss_256 : interleaved window. [0], [1] (low lanes) and [3], [4] (high
//          lanes) must be valid on entry; [2] and [5] are produced here. On
//          return [0] = [1], [1] = [2], [3] = [4] and [4] = [5].
// coeffs : forwarded to xy_y_convolve_6tap_16_avx2().
// r      : receives the two result vectors.
static inline void xy_y_convolve_6tap_8x2_avx2(const int16_t *const src,
                                               __m256i ss_256[6],
                                               const __m256i coeffs[3],
                                               __m256i r[2]) {
  __m256i s_256[2];
  s_256[0] = _mm256_loadu_si256((const __m256i *)(src + 4 * 8));
  s_256[1] = _mm256_loadu_si256((const __m256i *)(src + 5 * 8));
  ss_256[2] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  ss_256[5] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r);
  // Slide both halves of the interleaved window forward by one entry.
  ss_256[0] = ss_256[1];
  ss_256[1] = ss_256[2];
  ss_256[3] = ss_256[4];
  ss_256[4] = ss_256[5];
}
1746*77c1e3ccSAndroid Build Coastguard Worker 
// 6-tap half-pel vertical step of the xy path, 8-wide, two outputs per call.
//
// src    : int16_t input; two unaligned 256-bit loads at element offsets
//          4 * 8 and 5 * 8.
// coeffs : forwarded to xy_y_convolve_4tap_16_avx2().
// s_256  : six-entry window. [0..3] must be valid on entry; [4] and [5] are
//          loaded here. On return [0..3] hold the former [2..5].
// r      : receives the two result vectors.
//
// The two outer pairs ([0] + [5], [1] + [4]) are pre-summed and interleaved
// with each other, while the centre pair ([2], [3]) is interleaved as-is, so
// a 4-tap kernel can be applied.
// NOTE(review): this presumes the half-pel filter is symmetric - confirm.
static inline void xy_y_convolve_6tap_8x2_half_pel_avx2(
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[6],
    __m256i r[2]) {
  __m256i a_256[2], ss_256[4];
  s_256[4] = _mm256_loadu_si256((const __m256i *)(src + 4 * 8));
  s_256[5] = _mm256_loadu_si256((const __m256i *)(src + 5 * 8));
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
  // Slide the window forward by two entries.
  s_256[0] = s_256[2];
  s_256[1] = s_256[3];
  s_256[2] = s_256[4];
  s_256[3] = s_256[5];
}
1765*77c1e3ccSAndroid Build Coastguard Worker 
// 6-tap vertical step of the xy path, 16 lanes wide, two outputs per call.
//
// src    : int16_t input; unaligned 256-bit loads at element offsets
//          5 * stride and 6 * stride.
// stride : distance, in int16_t elements, between consecutive loads.
// s_256  : only [4] and [5] are used. [4] must be valid on entry; on return
//          [5] holds the load from 5 * stride and [4] the load from
//          6 * stride (the two slots swap roles every call).
// ss_256 : interleaved window for the first output. [0], [1], [3], [4] must
//          be valid on entry; [2] and [5] are produced here. On return the
//          window has advanced by one entry in each half.
// tt_256 : same layout as ss_256, for the second output.
// coeffs : forwarded to xy_y_convolve_6tap_16_avx2().
// r      : r[0..1] receive the first output, r[2..3] the second.
static inline void xy_y_convolve_6tap_16x2_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
    __m256i ss_256[6], __m256i tt_256[6], const __m256i coeffs[3],
    __m256i r[4]) {
  s_256[5] = _mm256_loadu_si256((const __m256i *)(src + 5 * stride));
  ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
  ss_256[5] = _mm256_unpackhi_epi16(s_256[4], s_256[5]);
  // s_256[4] is reused for the newest vector; s_256[5] is now the older one.
  s_256[4] = _mm256_loadu_si256((const __m256i *)(src + 6 * stride));
  tt_256[2] = _mm256_unpacklo_epi16(s_256[5], s_256[4]);
  tt_256[5] = _mm256_unpackhi_epi16(s_256[5], s_256[4]);

  xy_y_convolve_6tap_16_avx2(ss_256, coeffs, r + 0);
  xy_y_convolve_6tap_16_avx2(tt_256, coeffs, r + 2);

  // Slide both interleaved windows forward by one entry per half.
  ss_256[0] = ss_256[1];
  ss_256[1] = ss_256[2];
  ss_256[3] = ss_256[4];
  ss_256[4] = ss_256[5];

  tt_256[0] = tt_256[1];
  tt_256[1] = tt_256[2];
  tt_256[3] = tt_256[4];
  tt_256[4] = tt_256[5];
}
1790*77c1e3ccSAndroid Build Coastguard Worker 
// 6-tap half-pel vertical step of the xy path, 16 lanes wide, two outputs
// per call.
//
// src    : int16_t input; unaligned 256-bit loads at element offsets
//          5 * stride and 6 * stride.
// stride : distance, in int16_t elements, between consecutive loads.
// s_256  : window; [0..4] must be valid on entry and [5] is loaded here. On
//          return [0..4] hold the former [2], [3], [4], [5] and the load
//          from 6 * stride, in that order; [5] is left unchanged.
// ss_256 : caller-provided scratch; fully overwritten (twice).
// coeffs : forwarded to xy_y_convolve_4tap_16_avx2().
// r      : r[0..1] receive the first output, r[2..3] the second.
//
// The window rotation is interleaved with the arithmetic for the second
// output, so the statement order below is significant.
// NOTE(review): the pre-summing of outer pairs presumes a symmetric
// half-pel filter - confirm.
static inline void xy_y_convolve_6tap_16x2_half_pel_avx2(
    const int16_t *const src, const ptrdiff_t stride, __m256i s_256[6],
    __m256i ss_256[4], const __m256i coeffs[2], __m256i r[4]) {
  __m256i a_256[2];

  // First output: entries 0..5, outer pairs summed, centre pair (2, 3).
  s_256[5] = _mm256_loadu_si256((const __m256i *)(src + 5 * stride));
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[5]);
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[4]);
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
  ss_256[3] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);

  // Second output: entries 1..6. Sums are taken before the slots they read
  // are overwritten by the rotation.
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[5]);
  s_256[0] = s_256[2];
  s_256[2] = s_256[4];
  s_256[4] = _mm256_loadu_si256((const __m256i *)(src + 6 * stride));
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[4]);
  s_256[1] = s_256[3];
  s_256[3] = s_256[5];
  // After the rotation, [1] and [2] hold the centre pair for this output.
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
  ss_256[3] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
}
1818*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_8tap_2x2_sse2(const int16_t * const src,__m128i s_32[8],__m128i ss_128[4],const __m128i coeffs[4])1819*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i xy_y_convolve_8tap_2x2_sse2(const int16_t *const src,
1820*77c1e3ccSAndroid Build Coastguard Worker                                                   __m128i s_32[8],
1821*77c1e3ccSAndroid Build Coastguard Worker                                                   __m128i ss_128[4],
1822*77c1e3ccSAndroid Build Coastguard Worker                                                   const __m128i coeffs[4]) {
1823*77c1e3ccSAndroid Build Coastguard Worker   s_32[7] = _mm_cvtsi32_si128(loadu_int32(src + 7 * 2));
1824*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src67 = _mm_unpacklo_epi32(s_32[6], s_32[7]);
1825*77c1e3ccSAndroid Build Coastguard Worker   s_32[6] = _mm_cvtsi32_si128(loadu_int32(src + 8 * 2));
1826*77c1e3ccSAndroid Build Coastguard Worker   const __m128i src78 = _mm_unpacklo_epi32(s_32[7], s_32[6]);
1827*77c1e3ccSAndroid Build Coastguard Worker   ss_128[3] = _mm_unpacklo_epi16(src67, src78);
1828*77c1e3ccSAndroid Build Coastguard Worker   const __m128i r = convolve16_8tap_sse2(ss_128, coeffs);
1829*77c1e3ccSAndroid Build Coastguard Worker   ss_128[0] = ss_128[1];
1830*77c1e3ccSAndroid Build Coastguard Worker   ss_128[1] = ss_128[2];
1831*77c1e3ccSAndroid Build Coastguard Worker   ss_128[2] = ss_128[3];
1832*77c1e3ccSAndroid Build Coastguard Worker   return r;
1833*77c1e3ccSAndroid Build Coastguard Worker }
1834*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_8tap_4x2_avx2(const int16_t * const src,__m128i s_64[8],__m256i ss_256[4],const __m256i coeffs[4])1835*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i xy_y_convolve_8tap_4x2_avx2(const int16_t *const src,
1836*77c1e3ccSAndroid Build Coastguard Worker                                                   __m128i s_64[8],
1837*77c1e3ccSAndroid Build Coastguard Worker                                                   __m256i ss_256[4],
1838*77c1e3ccSAndroid Build Coastguard Worker                                                   const __m256i coeffs[4]) {
1839*77c1e3ccSAndroid Build Coastguard Worker   __m256i s_256[2];
1840*77c1e3ccSAndroid Build Coastguard Worker   s_64[7] = _mm_loadl_epi64((__m128i *)(src + 7 * 4));
1841*77c1e3ccSAndroid Build Coastguard Worker   s_256[0] = _mm256_setr_m128i(s_64[6], s_64[7]);
1842*77c1e3ccSAndroid Build Coastguard Worker   s_64[6] = _mm_loadl_epi64((__m128i *)(src + 8 * 4));
1843*77c1e3ccSAndroid Build Coastguard Worker   s_256[1] = _mm256_setr_m128i(s_64[7], s_64[6]);
1844*77c1e3ccSAndroid Build Coastguard Worker   ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1845*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r = convolve16_8tap_avx2(ss_256, coeffs);
1846*77c1e3ccSAndroid Build Coastguard Worker   ss_256[0] = ss_256[1];
1847*77c1e3ccSAndroid Build Coastguard Worker   ss_256[1] = ss_256[2];
1848*77c1e3ccSAndroid Build Coastguard Worker   ss_256[2] = ss_256[3];
1849*77c1e3ccSAndroid Build Coastguard Worker   return r;
1850*77c1e3ccSAndroid Build Coastguard Worker }
1851*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_8tap_16_avx2(const __m256i * const ss,const __m256i coeffs[4],__m256i r[2])1852*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_y_convolve_8tap_16_avx2(const __m256i *const ss,
1853*77c1e3ccSAndroid Build Coastguard Worker                                               const __m256i coeffs[4],
1854*77c1e3ccSAndroid Build Coastguard Worker                                               __m256i r[2]) {
1855*77c1e3ccSAndroid Build Coastguard Worker   r[0] = convolve16_8tap_avx2(ss, coeffs);
1856*77c1e3ccSAndroid Build Coastguard Worker   r[1] = convolve16_8tap_avx2(ss + 4, coeffs);
1857*77c1e3ccSAndroid Build Coastguard Worker }
1858*77c1e3ccSAndroid Build Coastguard Worker 
// 8-tap vertical step of the xy path, 8-wide, two outputs per call.
//
// src    : int16_t input; two unaligned 256-bit loads at element offsets
//          6 * 8 and 7 * 8 (16 lanes each, overlapping by 8 lanes).
// ss_256 : interleaved window. [0..2] (low lanes) and [4..6] (high lanes)
//          must be valid on entry; [3] and [7] are produced here. On return
//          each half has advanced by one entry.
// coeffs : forwarded to xy_y_convolve_8tap_16_avx2().
// r      : receives the two result vectors.
static inline void xy_y_convolve_8tap_8x2_avx2(const int16_t *const src,
                                               __m256i ss_256[8],
                                               const __m256i coeffs[4],
                                               __m256i r[2]) {
  __m256i s_256[2];
  s_256[0] = _mm256_loadu_si256((const __m256i *)(src + 6 * 8));
  s_256[1] = _mm256_loadu_si256((const __m256i *)(src + 7 * 8));
  ss_256[3] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
  ss_256[7] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
  xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r);
  // Slide both halves of the interleaved window forward by one entry.
  ss_256[0] = ss_256[1];
  ss_256[1] = ss_256[2];
  ss_256[2] = ss_256[3];
  ss_256[4] = ss_256[5];
  ss_256[5] = ss_256[6];
  ss_256[6] = ss_256[7];
}
1876*77c1e3ccSAndroid Build Coastguard Worker 
// 8-tap half-pel vertical step of the xy path, 8-wide, two outputs per call.
//
// src    : int16_t input; two unaligned 256-bit loads at element offsets
//          6 * 8 and 7 * 8.
// coeffs : forwarded to xy_y_convolve_4tap_16_avx2().
// s_256  : eight-entry window. [0..5] must be valid on entry; [6] and [7]
//          are loaded here. On return [0..5] hold the former [2..7].
// r      : receives the two result vectors.
//
// Window entries are summed in mirrored pairs (0+7, 1+6, 2+5, 3+4) so that a
// 4-tap kernel can be applied to the four sums.
// NOTE(review): this presumes the half-pel filter is symmetric - confirm.
static inline void xy_y_convolve_8tap_8x2_half_pel_avx2(
    const int16_t *const src, const __m256i coeffs[2], __m256i s_256[8],
    __m256i r[2]) {
  __m256i a_256[4], ss_256[4];

  s_256[6] = _mm256_loadu_si256((const __m256i *)(src + 6 * 8));
  s_256[7] = _mm256_loadu_si256((const __m256i *)(src + 7 * 8));
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);
  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r);
  // Slide the window forward by two entries.
  s_256[0] = s_256[2];
  s_256[1] = s_256[3];
  s_256[2] = s_256[4];
  s_256[3] = s_256[5];
  s_256[4] = s_256[6];
  s_256[5] = s_256[7];
}
1900*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_convolve_8tap_16x2_avx2(const int16_t * const src,const ptrdiff_t stride,const __m256i coeffs[4],__m256i s_256[8],__m256i ss_256[8],__m256i tt_256[8],__m256i r[4])1901*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void xy_y_convolve_8tap_16x2_avx2(
1902*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
1903*77c1e3ccSAndroid Build Coastguard Worker     __m256i s_256[8], __m256i ss_256[8], __m256i tt_256[8], __m256i r[4]) {
1904*77c1e3ccSAndroid Build Coastguard Worker   s_256[7] = _mm256_loadu_si256((__m256i *)(src + 7 * stride));
1905*77c1e3ccSAndroid Build Coastguard Worker   ss_256[3] = _mm256_unpacklo_epi16(s_256[6], s_256[7]);
1906*77c1e3ccSAndroid Build Coastguard Worker   ss_256[7] = _mm256_unpackhi_epi16(s_256[6], s_256[7]);
1907*77c1e3ccSAndroid Build Coastguard Worker   s_256[6] = _mm256_loadu_si256((__m256i *)(src + 8 * stride));
1908*77c1e3ccSAndroid Build Coastguard Worker   tt_256[3] = _mm256_unpacklo_epi16(s_256[7], s_256[6]);
1909*77c1e3ccSAndroid Build Coastguard Worker   tt_256[7] = _mm256_unpackhi_epi16(s_256[7], s_256[6]);
1910*77c1e3ccSAndroid Build Coastguard Worker 
1911*77c1e3ccSAndroid Build Coastguard Worker   xy_y_convolve_8tap_16_avx2(ss_256, coeffs, r + 0);
1912*77c1e3ccSAndroid Build Coastguard Worker   xy_y_convolve_8tap_16_avx2(tt_256, coeffs, r + 2);
1913*77c1e3ccSAndroid Build Coastguard Worker 
1914*77c1e3ccSAndroid Build Coastguard Worker   ss_256[0] = ss_256[1];
1915*77c1e3ccSAndroid Build Coastguard Worker   ss_256[1] = ss_256[2];
1916*77c1e3ccSAndroid Build Coastguard Worker   ss_256[2] = ss_256[3];
1917*77c1e3ccSAndroid Build Coastguard Worker   ss_256[4] = ss_256[5];
1918*77c1e3ccSAndroid Build Coastguard Worker   ss_256[5] = ss_256[6];
1919*77c1e3ccSAndroid Build Coastguard Worker   ss_256[6] = ss_256[7];
1920*77c1e3ccSAndroid Build Coastguard Worker 
1921*77c1e3ccSAndroid Build Coastguard Worker   tt_256[0] = tt_256[1];
1922*77c1e3ccSAndroid Build Coastguard Worker   tt_256[1] = tt_256[2];
1923*77c1e3ccSAndroid Build Coastguard Worker   tt_256[2] = tt_256[3];
1924*77c1e3ccSAndroid Build Coastguard Worker   tt_256[4] = tt_256[5];
1925*77c1e3ccSAndroid Build Coastguard Worker   tt_256[5] = tt_256[6];
1926*77c1e3ccSAndroid Build Coastguard Worker   tt_256[6] = tt_256[7];
1927*77c1e3ccSAndroid Build Coastguard Worker }
1928*77c1e3ccSAndroid Build Coastguard Worker 
// 8-tap half-pel vertical step of the xy path, 16 lanes wide, two outputs
// per call.
//
// src    : int16_t input; unaligned 256-bit loads at element offsets
//          7 * stride and 8 * stride.
// stride : distance, in int16_t elements, between consecutive loads.
// coeffs : forwarded to xy_y_convolve_4tap_16_avx2().
//          NOTE(review): declared as coeffs[4] here, whereas the 8x2
//          half-pel sibling declares coeffs[2] for the same callee; the
//          array bound is not enforced by the language, but confirm how many
//          entries the callee actually reads.
// s_256  : window; [0..6] must be valid on entry and [7] is loaded here. On
//          return [0..5] hold the former [2..7] and [6] holds the load from
//          8 * stride; [7] is left unchanged.
// r      : r[0..1] receive the first output, r[2..3] the second.
//
// The window rotation is interleaved with the arithmetic for the second
// output, so the statement order below is significant.
// NOTE(review): summing mirrored pairs presumes a symmetric half-pel
// filter - confirm.
static inline void xy_y_convolve_8tap_16x2_half_pel_avx2(
    const int16_t *const src, const ptrdiff_t stride, const __m256i coeffs[4],
    __m256i s_256[8], __m256i r[4]) {
  __m256i a_256[4], ss_256[4];
  s_256[7] = _mm256_loadu_si256((const __m256i *)(src + 7 * stride));

  // First output: mirrored pairs of entries 0..7.
  a_256[0] = _mm256_add_epi16(s_256[0], s_256[7]);
  a_256[1] = _mm256_add_epi16(s_256[1], s_256[6]);
  a_256[2] = _mm256_add_epi16(s_256[2], s_256[5]);
  a_256[3] = _mm256_add_epi16(s_256[3], s_256[4]);
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);

  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 0);

  // Second output: mirrored pairs of entries 1..8. The three inner sums are
  // taken before the even slots are rotated.
  a_256[1] = _mm256_add_epi16(s_256[2], s_256[7]);
  a_256[2] = _mm256_add_epi16(s_256[3], s_256[6]);
  a_256[3] = _mm256_add_epi16(s_256[4], s_256[5]);
  s_256[0] = s_256[2];
  s_256[2] = s_256[4];
  s_256[4] = s_256[6];
  s_256[6] = _mm256_loadu_si256((const __m256i *)(src + 8 * stride));

  // The outermost sum needs the new load, then the odd slots are rotated.
  a_256[0] = _mm256_add_epi16(s_256[1], s_256[6]);
  s_256[1] = s_256[3];
  s_256[3] = s_256[5];
  s_256[5] = s_256[7];
  ss_256[0] = _mm256_unpacklo_epi16(a_256[0], a_256[1]);
  ss_256[1] = _mm256_unpacklo_epi16(a_256[2], a_256[3]);
  ss_256[2] = _mm256_unpackhi_epi16(a_256[0], a_256[1]);
  ss_256[3] = _mm256_unpackhi_epi16(a_256[2], a_256[3]);

  xy_y_convolve_4tap_16_avx2(ss_256, coeffs, r + 2);
}
1965*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_round_store_8x2_avx2(const __m256i res[2],uint8_t * const dst,const ptrdiff_t stride)1966*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_y_round_store_8x2_avx2(const __m256i res[2],
1967*77c1e3ccSAndroid Build Coastguard Worker                                              uint8_t *const dst,
1968*77c1e3ccSAndroid Build Coastguard Worker                                              const ptrdiff_t stride) {
1969*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r = xy_y_round_16_avx2(res);
1970*77c1e3ccSAndroid Build Coastguard Worker   pack_store_8x2_avx2(r, dst, stride);
1971*77c1e3ccSAndroid Build Coastguard Worker }
1972*77c1e3ccSAndroid Build Coastguard Worker 
xy_y_round_store_16x2_avx2(const __m256i res[4],uint8_t * const dst,const ptrdiff_t stride)1973*77c1e3ccSAndroid Build Coastguard Worker static inline void xy_y_round_store_16x2_avx2(const __m256i res[4],
1974*77c1e3ccSAndroid Build Coastguard Worker                                               uint8_t *const dst,
1975*77c1e3ccSAndroid Build Coastguard Worker                                               const ptrdiff_t stride) {
1976*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r0 = xy_y_round_16_avx2(res + 0);
1977*77c1e3ccSAndroid Build Coastguard Worker   const __m256i r1 = xy_y_round_16_avx2(res + 2);
1978*77c1e3ccSAndroid Build Coastguard Worker   xy_y_pack_store_16x2_avx2(r0, r1, dst, stride);
1979*77c1e3ccSAndroid Build Coastguard Worker }
1980*77c1e3ccSAndroid Build Coastguard Worker 
sr_y_round_store_32_avx2(const __m256i res[2],uint8_t * const dst)1981*77c1e3ccSAndroid Build Coastguard Worker static inline void sr_y_round_store_32_avx2(const __m256i res[2],
1982*77c1e3ccSAndroid Build Coastguard Worker                                             uint8_t *const dst) {
1983*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
1984*77c1e3ccSAndroid Build Coastguard Worker 
1985*77c1e3ccSAndroid Build Coastguard Worker   r[0] = sr_y_round_avx2(res[0]);
1986*77c1e3ccSAndroid Build Coastguard Worker   r[1] = sr_y_round_avx2(res[1]);
1987*77c1e3ccSAndroid Build Coastguard Worker   convolve_store_32_avx2(r[0], r[1], dst);
1988*77c1e3ccSAndroid Build Coastguard Worker }
1989*77c1e3ccSAndroid Build Coastguard Worker 
// Two-destination variant of sr_y_round_store_32_avx2(): res[0..1] are
// rounded and stored at dst, res[2..3] at dst + dst_stride.
static inline void sr_y_round_store_32x2_avx2(const __m256i res[4],
                                              uint8_t *const dst,
                                              const int32_t dst_stride) {
  sr_y_round_store_32_avx2(res, dst);
  sr_y_round_store_32_avx2(res + 2, dst + dst_stride);
}
1996*77c1e3ccSAndroid Build Coastguard Worker 
// Chains y_convolve_2tap_32_avx2() with sr_y_round_store_32_avx2(): the
// convolution fills a two-vector scratch buffer from src, coeffs, s0 and s1,
// and that buffer is then rounded and stored to dst.
// s1 is a non-const pointer handed straight to the convolution helper;
// presumably it receives the row loaded from src -- confirm against
// y_convolve_2tap_32_avx2().
static inline void sr_y_2tap_32_avx2(const uint8_t *const src,
                                     const __m256i coeffs[1], const __m256i s0,
                                     __m256i *const s1, uint8_t *const dst) {
  __m256i filtered[2];  // written by the convolution helper below

  y_convolve_2tap_32_avx2(src, coeffs, s0, s1, filtered);
  sr_y_round_store_32_avx2(filtered, dst);
}
2004*77c1e3ccSAndroid Build Coastguard Worker 
av1_convolve_y_sr_specialized_avx2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t w,int32_t h,const InterpFilterParams * filter_params_y,const int32_t subpel_y_q4)2005*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void av1_convolve_y_sr_specialized_avx2(
2006*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2007*77c1e3ccSAndroid Build Coastguard Worker     int32_t w, int32_t h, const InterpFilterParams *filter_params_y,
2008*77c1e3ccSAndroid Build Coastguard Worker     const int32_t subpel_y_q4) {
2009*77c1e3ccSAndroid Build Coastguard Worker   int32_t x, y;
2010*77c1e3ccSAndroid Build Coastguard Worker   __m128i coeffs_128[4];
2011*77c1e3ccSAndroid Build Coastguard Worker   __m256i coeffs_256[4];
2012*77c1e3ccSAndroid Build Coastguard Worker 
2013*77c1e3ccSAndroid Build Coastguard Worker   int vert_tap = get_filter_tap(filter_params_y, subpel_y_q4);
2014*77c1e3ccSAndroid Build Coastguard Worker 
2015*77c1e3ccSAndroid Build Coastguard Worker   if (vert_tap == 2) {
2016*77c1e3ccSAndroid Build Coastguard Worker     // vert_filt as 2 tap
2017*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src_ptr = src;
2018*77c1e3ccSAndroid Build Coastguard Worker 
2019*77c1e3ccSAndroid Build Coastguard Worker     y = h;
2020*77c1e3ccSAndroid Build Coastguard Worker 
2021*77c1e3ccSAndroid Build Coastguard Worker     if (subpel_y_q4 != 8) {
2022*77c1e3ccSAndroid Build Coastguard Worker       if (w <= 8) {
2023*77c1e3ccSAndroid Build Coastguard Worker         prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4,
2024*77c1e3ccSAndroid Build Coastguard Worker                                        coeffs_128);
2025*77c1e3ccSAndroid Build Coastguard Worker 
2026*77c1e3ccSAndroid Build Coastguard Worker         if (w == 2) {
2027*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_16[2];
2028*77c1e3ccSAndroid Build Coastguard Worker 
2029*77c1e3ccSAndroid Build Coastguard Worker           s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2030*77c1e3ccSAndroid Build Coastguard Worker 
2031*77c1e3ccSAndroid Build Coastguard Worker           do {
2032*77c1e3ccSAndroid Build Coastguard Worker             const __m128i res = y_convolve_2tap_2x2_ssse3(src_ptr, src_stride,
2033*77c1e3ccSAndroid Build Coastguard Worker                                                           coeffs_128, s_16);
2034*77c1e3ccSAndroid Build Coastguard Worker             const __m128i r = sr_y_round_sse2(res);
2035*77c1e3ccSAndroid Build Coastguard Worker             pack_store_2x2_sse2(r, dst, dst_stride);
2036*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2037*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2038*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2039*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2040*77c1e3ccSAndroid Build Coastguard Worker         } else if (w == 4) {
2041*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_32[2];
2042*77c1e3ccSAndroid Build Coastguard Worker 
2043*77c1e3ccSAndroid Build Coastguard Worker           s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2044*77c1e3ccSAndroid Build Coastguard Worker 
2045*77c1e3ccSAndroid Build Coastguard Worker           do {
2046*77c1e3ccSAndroid Build Coastguard Worker             const __m128i res = y_convolve_2tap_4x2_ssse3(src_ptr, src_stride,
2047*77c1e3ccSAndroid Build Coastguard Worker                                                           coeffs_128, s_32);
2048*77c1e3ccSAndroid Build Coastguard Worker             const __m128i r = sr_y_round_sse2(res);
2049*77c1e3ccSAndroid Build Coastguard Worker             pack_store_4x2_sse2(r, dst, dst_stride);
2050*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2051*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2052*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2053*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2054*77c1e3ccSAndroid Build Coastguard Worker         } else {
2055*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_64[2], s_128[2];
2056*77c1e3ccSAndroid Build Coastguard Worker 
2057*77c1e3ccSAndroid Build Coastguard Worker           assert(w == 8);
2058*77c1e3ccSAndroid Build Coastguard Worker 
2059*77c1e3ccSAndroid Build Coastguard Worker           s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2060*77c1e3ccSAndroid Build Coastguard Worker 
2061*77c1e3ccSAndroid Build Coastguard Worker           do {
2062*77c1e3ccSAndroid Build Coastguard Worker             // Note: Faster than binding to AVX2 registers.
2063*77c1e3ccSAndroid Build Coastguard Worker             s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2064*77c1e3ccSAndroid Build Coastguard Worker             s_128[0] = _mm_unpacklo_epi64(s_64[0], s_64[1]);
2065*77c1e3ccSAndroid Build Coastguard Worker             s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2066*77c1e3ccSAndroid Build Coastguard Worker             s_128[1] = _mm_unpacklo_epi64(s_64[1], s_64[0]);
2067*77c1e3ccSAndroid Build Coastguard Worker             const __m128i ss0 = _mm_unpacklo_epi8(s_128[0], s_128[1]);
2068*77c1e3ccSAndroid Build Coastguard Worker             const __m128i ss1 = _mm_unpackhi_epi8(s_128[0], s_128[1]);
2069*77c1e3ccSAndroid Build Coastguard Worker             const __m128i res0 = convolve_2tap_ssse3(&ss0, coeffs_128);
2070*77c1e3ccSAndroid Build Coastguard Worker             const __m128i res1 = convolve_2tap_ssse3(&ss1, coeffs_128);
2071*77c1e3ccSAndroid Build Coastguard Worker             const __m128i r0 = sr_y_round_sse2(res0);
2072*77c1e3ccSAndroid Build Coastguard Worker             const __m128i r1 = sr_y_round_sse2(res1);
2073*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d = _mm_packus_epi16(r0, r1);
2074*77c1e3ccSAndroid Build Coastguard Worker             _mm_storel_epi64((__m128i *)dst, d);
2075*77c1e3ccSAndroid Build Coastguard Worker             _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2076*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2077*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2078*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2079*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2080*77c1e3ccSAndroid Build Coastguard Worker         }
2081*77c1e3ccSAndroid Build Coastguard Worker       } else {
2082*77c1e3ccSAndroid Build Coastguard Worker         prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2083*77c1e3ccSAndroid Build Coastguard Worker 
2084*77c1e3ccSAndroid Build Coastguard Worker         if (w == 16) {
2085*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_128[2];
2086*77c1e3ccSAndroid Build Coastguard Worker 
2087*77c1e3ccSAndroid Build Coastguard Worker           s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2088*77c1e3ccSAndroid Build Coastguard Worker 
2089*77c1e3ccSAndroid Build Coastguard Worker           do {
2090*77c1e3ccSAndroid Build Coastguard Worker             __m256i r[2];
2091*77c1e3ccSAndroid Build Coastguard Worker 
2092*77c1e3ccSAndroid Build Coastguard Worker             y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2093*77c1e3ccSAndroid Build Coastguard Worker                                       r);
2094*77c1e3ccSAndroid Build Coastguard Worker             sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2095*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2096*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2097*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2098*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2099*77c1e3ccSAndroid Build Coastguard Worker         } else if (w == 32) {
2100*77c1e3ccSAndroid Build Coastguard Worker           __m256i s_256[2];
2101*77c1e3ccSAndroid Build Coastguard Worker 
2102*77c1e3ccSAndroid Build Coastguard Worker           s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2103*77c1e3ccSAndroid Build Coastguard Worker 
2104*77c1e3ccSAndroid Build Coastguard Worker           do {
2105*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0],
2106*77c1e3ccSAndroid Build Coastguard Worker                               &s_256[1], dst);
2107*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1],
2108*77c1e3ccSAndroid Build Coastguard Worker                               &s_256[0], dst + dst_stride);
2109*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2110*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2111*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2112*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2113*77c1e3ccSAndroid Build Coastguard Worker         } else if (w == 64) {
2114*77c1e3ccSAndroid Build Coastguard Worker           __m256i s_256[2][2];
2115*77c1e3ccSAndroid Build Coastguard Worker 
2116*77c1e3ccSAndroid Build Coastguard Worker           s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2117*77c1e3ccSAndroid Build Coastguard Worker           s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2118*77c1e3ccSAndroid Build Coastguard Worker 
2119*77c1e3ccSAndroid Build Coastguard Worker           do {
2120*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2121*77c1e3ccSAndroid Build Coastguard Worker                               &s_256[1][0], dst);
2122*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + src_stride + 32, coeffs_256,
2123*77c1e3ccSAndroid Build Coastguard Worker                               s_256[0][1], &s_256[1][1], dst + 32);
2124*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2125*77c1e3ccSAndroid Build Coastguard Worker                               &s_256[0][0], dst + dst_stride);
2126*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 32, coeffs_256,
2127*77c1e3ccSAndroid Build Coastguard Worker                               s_256[1][1], &s_256[0][1], dst + dst_stride + 32);
2128*77c1e3ccSAndroid Build Coastguard Worker 
2129*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2130*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2131*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2132*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2133*77c1e3ccSAndroid Build Coastguard Worker         } else {
2134*77c1e3ccSAndroid Build Coastguard Worker           __m256i s_256[2][4];
2135*77c1e3ccSAndroid Build Coastguard Worker 
2136*77c1e3ccSAndroid Build Coastguard Worker           assert(w == 128);
2137*77c1e3ccSAndroid Build Coastguard Worker 
2138*77c1e3ccSAndroid Build Coastguard Worker           s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2139*77c1e3ccSAndroid Build Coastguard Worker           s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2140*77c1e3ccSAndroid Build Coastguard Worker           s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2141*77c1e3ccSAndroid Build Coastguard Worker           s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2142*77c1e3ccSAndroid Build Coastguard Worker 
2143*77c1e3ccSAndroid Build Coastguard Worker           do {
2144*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + src_stride, coeffs_256, s_256[0][0],
2145*77c1e3ccSAndroid Build Coastguard Worker                               &s_256[1][0], dst);
2146*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + src_stride + 1 * 32, coeffs_256,
2147*77c1e3ccSAndroid Build Coastguard Worker                               s_256[0][1], &s_256[1][1], dst + 1 * 32);
2148*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + src_stride + 2 * 32, coeffs_256,
2149*77c1e3ccSAndroid Build Coastguard Worker                               s_256[0][2], &s_256[1][2], dst + 2 * 32);
2150*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + src_stride + 3 * 32, coeffs_256,
2151*77c1e3ccSAndroid Build Coastguard Worker                               s_256[0][3], &s_256[1][3], dst + 3 * 32);
2152*77c1e3ccSAndroid Build Coastguard Worker 
2153*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride, coeffs_256, s_256[1][0],
2154*77c1e3ccSAndroid Build Coastguard Worker                               &s_256[0][0], dst + dst_stride);
2155*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32, coeffs_256,
2156*77c1e3ccSAndroid Build Coastguard Worker                               s_256[1][1], &s_256[0][1],
2157*77c1e3ccSAndroid Build Coastguard Worker                               dst + dst_stride + 1 * 32);
2158*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32, coeffs_256,
2159*77c1e3ccSAndroid Build Coastguard Worker                               s_256[1][2], &s_256[0][2],
2160*77c1e3ccSAndroid Build Coastguard Worker                               dst + dst_stride + 2 * 32);
2161*77c1e3ccSAndroid Build Coastguard Worker             sr_y_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32, coeffs_256,
2162*77c1e3ccSAndroid Build Coastguard Worker                               s_256[1][3], &s_256[0][3],
2163*77c1e3ccSAndroid Build Coastguard Worker                               dst + dst_stride + 3 * 32);
2164*77c1e3ccSAndroid Build Coastguard Worker 
2165*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2166*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2167*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2168*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2169*77c1e3ccSAndroid Build Coastguard Worker         }
2170*77c1e3ccSAndroid Build Coastguard Worker       }
2171*77c1e3ccSAndroid Build Coastguard Worker     } else {
2172*77c1e3ccSAndroid Build Coastguard Worker       // average to get half pel
2173*77c1e3ccSAndroid Build Coastguard Worker       if (w <= 8) {
2174*77c1e3ccSAndroid Build Coastguard Worker         if (w == 2) {
2175*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_16[2];
2176*77c1e3ccSAndroid Build Coastguard Worker 
2177*77c1e3ccSAndroid Build Coastguard Worker           s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);
2178*77c1e3ccSAndroid Build Coastguard Worker 
2179*77c1e3ccSAndroid Build Coastguard Worker           do {
2180*77c1e3ccSAndroid Build Coastguard Worker             s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + src_stride));
2181*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d0 = _mm_avg_epu8(s_16[0], s_16[1]);
2182*77c1e3ccSAndroid Build Coastguard Worker             *(int16_t *)dst = (int16_t)_mm_cvtsi128_si32(d0);
2183*77c1e3ccSAndroid Build Coastguard Worker             s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2184*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d1 = _mm_avg_epu8(s_16[1], s_16[0]);
2185*77c1e3ccSAndroid Build Coastguard Worker             *(int16_t *)(dst + dst_stride) = (int16_t)_mm_cvtsi128_si32(d1);
2186*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2187*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2188*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2189*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2190*77c1e3ccSAndroid Build Coastguard Worker         } else if (w == 4) {
2191*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_32[2];
2192*77c1e3ccSAndroid Build Coastguard Worker 
2193*77c1e3ccSAndroid Build Coastguard Worker           s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr));
2194*77c1e3ccSAndroid Build Coastguard Worker 
2195*77c1e3ccSAndroid Build Coastguard Worker           do {
2196*77c1e3ccSAndroid Build Coastguard Worker             s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + src_stride));
2197*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d0 = _mm_avg_epu8(s_32[0], s_32[1]);
2198*77c1e3ccSAndroid Build Coastguard Worker             xx_storel_32(dst, d0);
2199*77c1e3ccSAndroid Build Coastguard Worker             s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2200*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d1 = _mm_avg_epu8(s_32[1], s_32[0]);
2201*77c1e3ccSAndroid Build Coastguard Worker             xx_storel_32(dst + dst_stride, d1);
2202*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2203*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2204*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2205*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2206*77c1e3ccSAndroid Build Coastguard Worker         } else {
2207*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_64[2];
2208*77c1e3ccSAndroid Build Coastguard Worker 
2209*77c1e3ccSAndroid Build Coastguard Worker           assert(w == 8);
2210*77c1e3ccSAndroid Build Coastguard Worker 
2211*77c1e3ccSAndroid Build Coastguard Worker           s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);
2212*77c1e3ccSAndroid Build Coastguard Worker 
2213*77c1e3ccSAndroid Build Coastguard Worker           do {
2214*77c1e3ccSAndroid Build Coastguard Worker             // Note: Faster than binding to AVX2 registers.
2215*77c1e3ccSAndroid Build Coastguard Worker             s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + src_stride));
2216*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d0 = _mm_avg_epu8(s_64[0], s_64[1]);
2217*77c1e3ccSAndroid Build Coastguard Worker             _mm_storel_epi64((__m128i *)dst, d0);
2218*77c1e3ccSAndroid Build Coastguard Worker             s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2219*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d1 = _mm_avg_epu8(s_64[1], s_64[0]);
2220*77c1e3ccSAndroid Build Coastguard Worker             _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
2221*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2222*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2223*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2224*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2225*77c1e3ccSAndroid Build Coastguard Worker         }
2226*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 16) {
2227*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_128[2];
2228*77c1e3ccSAndroid Build Coastguard Worker 
2229*77c1e3ccSAndroid Build Coastguard Worker         s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);
2230*77c1e3ccSAndroid Build Coastguard Worker 
2231*77c1e3ccSAndroid Build Coastguard Worker         do {
2232*77c1e3ccSAndroid Build Coastguard Worker           s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
2233*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d0 = _mm_avg_epu8(s_128[0], s_128[1]);
2234*77c1e3ccSAndroid Build Coastguard Worker           _mm_storeu_si128((__m128i *)dst, d0);
2235*77c1e3ccSAndroid Build Coastguard Worker           s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2236*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d1 = _mm_avg_epu8(s_128[1], s_128[0]);
2237*77c1e3ccSAndroid Build Coastguard Worker           _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
2238*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2239*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2240*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2241*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2242*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 32) {
2243*77c1e3ccSAndroid Build Coastguard Worker         __m256i s_256[2];
2244*77c1e3ccSAndroid Build Coastguard Worker 
2245*77c1e3ccSAndroid Build Coastguard Worker         s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);
2246*77c1e3ccSAndroid Build Coastguard Worker 
2247*77c1e3ccSAndroid Build Coastguard Worker         do {
2248*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0], &s_256[1], dst);
2249*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1], &s_256[0],
2250*77c1e3ccSAndroid Build Coastguard Worker                                 dst + dst_stride);
2251*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2252*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2253*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2254*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2255*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 64) {
2256*77c1e3ccSAndroid Build Coastguard Worker         __m256i s_256[2][2];
2257*77c1e3ccSAndroid Build Coastguard Worker 
2258*77c1e3ccSAndroid Build Coastguard Worker         s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2259*77c1e3ccSAndroid Build Coastguard Worker         s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2260*77c1e3ccSAndroid Build Coastguard Worker 
2261*77c1e3ccSAndroid Build Coastguard Worker         do {
2262*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2263*77c1e3ccSAndroid Build Coastguard Worker                                 dst);
2264*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 32, s_256[0][1],
2265*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[1][1], dst + 32);
2266*77c1e3ccSAndroid Build Coastguard Worker 
2267*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2268*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[0][0], dst + dst_stride);
2269*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 32, s_256[1][1],
2270*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[0][1], dst + dst_stride + 32);
2271*77c1e3ccSAndroid Build Coastguard Worker 
2272*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2273*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2274*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2275*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2276*77c1e3ccSAndroid Build Coastguard Worker       } else {
2277*77c1e3ccSAndroid Build Coastguard Worker         __m256i s_256[2][4];
2278*77c1e3ccSAndroid Build Coastguard Worker 
2279*77c1e3ccSAndroid Build Coastguard Worker         assert(w == 128);
2280*77c1e3ccSAndroid Build Coastguard Worker 
2281*77c1e3ccSAndroid Build Coastguard Worker         s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
2282*77c1e3ccSAndroid Build Coastguard Worker         s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
2283*77c1e3ccSAndroid Build Coastguard Worker         s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
2284*77c1e3ccSAndroid Build Coastguard Worker         s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));
2285*77c1e3ccSAndroid Build Coastguard Worker 
2286*77c1e3ccSAndroid Build Coastguard Worker         do {
2287*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + src_stride, s_256[0][0], &s_256[1][0],
2288*77c1e3ccSAndroid Build Coastguard Worker                                 dst);
2289*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 1 * 32, s_256[0][1],
2290*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[1][1], dst + 1 * 32);
2291*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 2 * 32, s_256[0][2],
2292*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[1][2], dst + 2 * 32);
2293*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + src_stride + 3 * 32, s_256[0][3],
2294*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[1][3], dst + 3 * 32);
2295*77c1e3ccSAndroid Build Coastguard Worker 
2296*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride, s_256[1][0],
2297*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[0][0], dst + dst_stride);
2298*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 1 * 32, s_256[1][1],
2299*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[0][1], dst + dst_stride + 1 * 32);
2300*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 2 * 32, s_256[1][2],
2301*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[0][2], dst + dst_stride + 2 * 32);
2302*77c1e3ccSAndroid Build Coastguard Worker           sr_y_2tap_32_avg_avx2(src_ptr + 2 * src_stride + 3 * 32, s_256[1][3],
2303*77c1e3ccSAndroid Build Coastguard Worker                                 &s_256[0][3], dst + dst_stride + 3 * 32);
2304*77c1e3ccSAndroid Build Coastguard Worker 
2305*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2306*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2307*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2308*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2309*77c1e3ccSAndroid Build Coastguard Worker       }
2310*77c1e3ccSAndroid Build Coastguard Worker     }
2311*77c1e3ccSAndroid Build Coastguard Worker   } else if (vert_tap == 4) {
2312*77c1e3ccSAndroid Build Coastguard Worker     // vert_filt as 4 tap
2313*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src_ptr = src - src_stride;
2314*77c1e3ccSAndroid Build Coastguard Worker 
2315*77c1e3ccSAndroid Build Coastguard Worker     y = h;
2316*77c1e3ccSAndroid Build Coastguard Worker 
2317*77c1e3ccSAndroid Build Coastguard Worker     if (w <= 4) {
2318*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2319*77c1e3ccSAndroid Build Coastguard Worker 
2320*77c1e3ccSAndroid Build Coastguard Worker       if (w == 2) {
2321*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_16[4], ss_128[2];
2322*77c1e3ccSAndroid Build Coastguard Worker 
2323*77c1e3ccSAndroid Build Coastguard Worker         s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2324*77c1e3ccSAndroid Build Coastguard Worker         s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2325*77c1e3ccSAndroid Build Coastguard Worker         s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2326*77c1e3ccSAndroid Build Coastguard Worker 
2327*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2328*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2329*77c1e3ccSAndroid Build Coastguard Worker 
2330*77c1e3ccSAndroid Build Coastguard Worker         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2331*77c1e3ccSAndroid Build Coastguard Worker 
2332*77c1e3ccSAndroid Build Coastguard Worker         do {
2333*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2334*77c1e3ccSAndroid Build Coastguard Worker           const __m128i res = y_convolve_4tap_2x2_ssse3(
2335*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_128, s_16, ss_128);
2336*77c1e3ccSAndroid Build Coastguard Worker           const __m128i r = sr_y_round_sse2(res);
2337*77c1e3ccSAndroid Build Coastguard Worker           pack_store_2x2_sse2(r, dst, dst_stride);
2338*77c1e3ccSAndroid Build Coastguard Worker 
2339*77c1e3ccSAndroid Build Coastguard Worker           ss_128[0] = ss_128[1];
2340*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2341*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2342*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2343*77c1e3ccSAndroid Build Coastguard Worker       } else {
2344*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_32[4], ss_128[2];
2345*77c1e3ccSAndroid Build Coastguard Worker 
2346*77c1e3ccSAndroid Build Coastguard Worker         assert(w == 4);
2347*77c1e3ccSAndroid Build Coastguard Worker 
2348*77c1e3ccSAndroid Build Coastguard Worker         s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2349*77c1e3ccSAndroid Build Coastguard Worker         s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2350*77c1e3ccSAndroid Build Coastguard Worker         s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2351*77c1e3ccSAndroid Build Coastguard Worker 
2352*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2353*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2354*77c1e3ccSAndroid Build Coastguard Worker 
2355*77c1e3ccSAndroid Build Coastguard Worker         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2356*77c1e3ccSAndroid Build Coastguard Worker 
2357*77c1e3ccSAndroid Build Coastguard Worker         do {
2358*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2359*77c1e3ccSAndroid Build Coastguard Worker           const __m128i res = y_convolve_4tap_4x2_ssse3(
2360*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_128, s_32, ss_128);
2361*77c1e3ccSAndroid Build Coastguard Worker           const __m128i r = sr_y_round_sse2(res);
2362*77c1e3ccSAndroid Build Coastguard Worker           pack_store_4x2_sse2(r, dst, dst_stride);
2363*77c1e3ccSAndroid Build Coastguard Worker 
2364*77c1e3ccSAndroid Build Coastguard Worker           ss_128[0] = ss_128[1];
2365*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2366*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2367*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2368*77c1e3ccSAndroid Build Coastguard Worker       }
2369*77c1e3ccSAndroid Build Coastguard Worker     } else {
2370*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2371*77c1e3ccSAndroid Build Coastguard Worker 
2372*77c1e3ccSAndroid Build Coastguard Worker       if (w == 8) {
2373*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_64[4];
2374*77c1e3ccSAndroid Build Coastguard Worker         __m256i ss_256[2];
2375*77c1e3ccSAndroid Build Coastguard Worker 
2376*77c1e3ccSAndroid Build Coastguard Worker         s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2377*77c1e3ccSAndroid Build Coastguard Worker         s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2378*77c1e3ccSAndroid Build Coastguard Worker         s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2379*77c1e3ccSAndroid Build Coastguard Worker 
2380*77c1e3ccSAndroid Build Coastguard Worker         // Load lines a and b. Line a to lower 128, line b to upper 128
2381*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2382*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2383*77c1e3ccSAndroid Build Coastguard Worker 
2384*77c1e3ccSAndroid Build Coastguard Worker         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2385*77c1e3ccSAndroid Build Coastguard Worker 
2386*77c1e3ccSAndroid Build Coastguard Worker         do {
2387*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2388*77c1e3ccSAndroid Build Coastguard Worker           const __m256i res = y_convolve_4tap_8x2_avx2(
2389*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_256, s_64, ss_256);
2390*77c1e3ccSAndroid Build Coastguard Worker           sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2391*77c1e3ccSAndroid Build Coastguard Worker 
2392*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = ss_256[1];
2393*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2394*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2395*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2396*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 16) {
2397*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_128[4];
2398*77c1e3ccSAndroid Build Coastguard Worker         __m256i ss_256[4], r[2];
2399*77c1e3ccSAndroid Build Coastguard Worker 
2400*77c1e3ccSAndroid Build Coastguard Worker         s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2401*77c1e3ccSAndroid Build Coastguard Worker         s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2402*77c1e3ccSAndroid Build Coastguard Worker         s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2403*77c1e3ccSAndroid Build Coastguard Worker 
2404*77c1e3ccSAndroid Build Coastguard Worker         // Load lines a and b. Line a to lower 128, line b to upper 128
2405*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2406*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2407*77c1e3ccSAndroid Build Coastguard Worker 
2408*77c1e3ccSAndroid Build Coastguard Worker         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2409*77c1e3ccSAndroid Build Coastguard Worker         ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
2410*77c1e3ccSAndroid Build Coastguard Worker 
2411*77c1e3ccSAndroid Build Coastguard Worker         do {
2412*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2413*77c1e3ccSAndroid Build Coastguard Worker           y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2414*77c1e3ccSAndroid Build Coastguard Worker                                     ss_256, r);
2415*77c1e3ccSAndroid Build Coastguard Worker           sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2416*77c1e3ccSAndroid Build Coastguard Worker 
2417*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = ss_256[1];
2418*77c1e3ccSAndroid Build Coastguard Worker           ss_256[2] = ss_256[3];
2419*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2420*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2421*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2422*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 32) {
2423*77c1e3ccSAndroid Build Coastguard Worker         // AV1 standard won't have 32x4 case.
2424*77c1e3ccSAndroid Build Coastguard Worker         // This only favors some optimization feature which
2425*77c1e3ccSAndroid Build Coastguard Worker         // subsamples 32x8 to 32x4 and triggers 4-tap filter.
2426*77c1e3ccSAndroid Build Coastguard Worker 
2427*77c1e3ccSAndroid Build Coastguard Worker         __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2428*77c1e3ccSAndroid Build Coastguard Worker 
2429*77c1e3ccSAndroid Build Coastguard Worker         s_256[0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * src_stride));
2430*77c1e3ccSAndroid Build Coastguard Worker         s_256[1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * src_stride));
2431*77c1e3ccSAndroid Build Coastguard Worker         s_256[2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * src_stride));
2432*77c1e3ccSAndroid Build Coastguard Worker 
2433*77c1e3ccSAndroid Build Coastguard Worker         ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2434*77c1e3ccSAndroid Build Coastguard Worker         ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2435*77c1e3ccSAndroid Build Coastguard Worker 
2436*77c1e3ccSAndroid Build Coastguard Worker         tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2437*77c1e3ccSAndroid Build Coastguard Worker         tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2438*77c1e3ccSAndroid Build Coastguard Worker 
2439*77c1e3ccSAndroid Build Coastguard Worker         do {
2440*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2441*77c1e3ccSAndroid Build Coastguard Worker           y_convolve_4tap_32x2_avx2(src_ptr, src_stride, coeffs_256, s_256,
2442*77c1e3ccSAndroid Build Coastguard Worker                                     ss_256, tt_256, r);
2443*77c1e3ccSAndroid Build Coastguard Worker           sr_y_round_store_32x2_avx2(r, dst, dst_stride);
2444*77c1e3ccSAndroid Build Coastguard Worker 
2445*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = ss_256[1];
2446*77c1e3ccSAndroid Build Coastguard Worker           ss_256[2] = ss_256[3];
2447*77c1e3ccSAndroid Build Coastguard Worker 
2448*77c1e3ccSAndroid Build Coastguard Worker           tt_256[0] = tt_256[1];
2449*77c1e3ccSAndroid Build Coastguard Worker           tt_256[2] = tt_256[3];
2450*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2451*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2452*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2453*77c1e3ccSAndroid Build Coastguard Worker       } else {
2454*77c1e3ccSAndroid Build Coastguard Worker         assert(!(w % 32));
2455*77c1e3ccSAndroid Build Coastguard Worker 
2456*77c1e3ccSAndroid Build Coastguard Worker         __m256i s_256[4], ss_256[4], tt_256[4], r[4];
2457*77c1e3ccSAndroid Build Coastguard Worker         x = 0;
2458*77c1e3ccSAndroid Build Coastguard Worker         do {
2459*77c1e3ccSAndroid Build Coastguard Worker           const uint8_t *s = src_ptr + x;
2460*77c1e3ccSAndroid Build Coastguard Worker           uint8_t *d = dst + x;
2461*77c1e3ccSAndroid Build Coastguard Worker           s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2462*77c1e3ccSAndroid Build Coastguard Worker           s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2463*77c1e3ccSAndroid Build Coastguard Worker           s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2464*77c1e3ccSAndroid Build Coastguard Worker 
2465*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2466*77c1e3ccSAndroid Build Coastguard Worker           ss_256[2] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2467*77c1e3ccSAndroid Build Coastguard Worker 
2468*77c1e3ccSAndroid Build Coastguard Worker           tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2469*77c1e3ccSAndroid Build Coastguard Worker           tt_256[2] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2470*77c1e3ccSAndroid Build Coastguard Worker 
2471*77c1e3ccSAndroid Build Coastguard Worker           y = h;
2472*77c1e3ccSAndroid Build Coastguard Worker           do {
2473*77c1e3ccSAndroid Build Coastguard Worker             s += 2 * src_stride;
2474*77c1e3ccSAndroid Build Coastguard Worker             y_convolve_4tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2475*77c1e3ccSAndroid Build Coastguard Worker                                       tt_256, r);
2476*77c1e3ccSAndroid Build Coastguard Worker             sr_y_round_store_32x2_avx2(r, d, dst_stride);
2477*77c1e3ccSAndroid Build Coastguard Worker 
2478*77c1e3ccSAndroid Build Coastguard Worker             ss_256[0] = ss_256[1];
2479*77c1e3ccSAndroid Build Coastguard Worker             ss_256[2] = ss_256[3];
2480*77c1e3ccSAndroid Build Coastguard Worker 
2481*77c1e3ccSAndroid Build Coastguard Worker             tt_256[0] = tt_256[1];
2482*77c1e3ccSAndroid Build Coastguard Worker             tt_256[2] = tt_256[3];
2483*77c1e3ccSAndroid Build Coastguard Worker             d += 2 * dst_stride;
2484*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2485*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2486*77c1e3ccSAndroid Build Coastguard Worker           x += 32;
2487*77c1e3ccSAndroid Build Coastguard Worker         } while (x < w);
2488*77c1e3ccSAndroid Build Coastguard Worker       }
2489*77c1e3ccSAndroid Build Coastguard Worker     }
2490*77c1e3ccSAndroid Build Coastguard Worker   } else if (vert_tap == 6) {
2491*77c1e3ccSAndroid Build Coastguard Worker     // vert_filt as 6 tap
2492*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src_ptr = src - 2 * src_stride;
2493*77c1e3ccSAndroid Build Coastguard Worker 
2494*77c1e3ccSAndroid Build Coastguard Worker     if (w <= 4) {
2495*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2496*77c1e3ccSAndroid Build Coastguard Worker 
2497*77c1e3ccSAndroid Build Coastguard Worker       y = h;
2498*77c1e3ccSAndroid Build Coastguard Worker 
2499*77c1e3ccSAndroid Build Coastguard Worker       if (w == 2) {
2500*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_16[6], ss_128[3];
2501*77c1e3ccSAndroid Build Coastguard Worker 
2502*77c1e3ccSAndroid Build Coastguard Worker         s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2503*77c1e3ccSAndroid Build Coastguard Worker         s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2504*77c1e3ccSAndroid Build Coastguard Worker         s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2505*77c1e3ccSAndroid Build Coastguard Worker         s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2506*77c1e3ccSAndroid Build Coastguard Worker         s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2507*77c1e3ccSAndroid Build Coastguard Worker 
2508*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2509*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2510*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2511*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2512*77c1e3ccSAndroid Build Coastguard Worker 
2513*77c1e3ccSAndroid Build Coastguard Worker         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2514*77c1e3ccSAndroid Build Coastguard Worker         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2515*77c1e3ccSAndroid Build Coastguard Worker 
2516*77c1e3ccSAndroid Build Coastguard Worker         do {
2517*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2518*77c1e3ccSAndroid Build Coastguard Worker           const __m128i res = y_convolve_6tap_2x2_ssse3(
2519*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_128, s_16, ss_128);
2520*77c1e3ccSAndroid Build Coastguard Worker           const __m128i r = sr_y_round_sse2(res);
2521*77c1e3ccSAndroid Build Coastguard Worker           pack_store_2x2_sse2(r, dst, dst_stride);
2522*77c1e3ccSAndroid Build Coastguard Worker 
2523*77c1e3ccSAndroid Build Coastguard Worker           ss_128[0] = ss_128[1];
2524*77c1e3ccSAndroid Build Coastguard Worker           ss_128[1] = ss_128[2];
2525*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2526*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2527*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2528*77c1e3ccSAndroid Build Coastguard Worker       } else {
2529*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_32[6], ss_128[3];
2530*77c1e3ccSAndroid Build Coastguard Worker 
2531*77c1e3ccSAndroid Build Coastguard Worker         assert(w == 4);
2532*77c1e3ccSAndroid Build Coastguard Worker 
2533*77c1e3ccSAndroid Build Coastguard Worker         s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2534*77c1e3ccSAndroid Build Coastguard Worker         s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2535*77c1e3ccSAndroid Build Coastguard Worker         s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2536*77c1e3ccSAndroid Build Coastguard Worker         s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2537*77c1e3ccSAndroid Build Coastguard Worker         s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2538*77c1e3ccSAndroid Build Coastguard Worker 
2539*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2540*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2541*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2542*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2543*77c1e3ccSAndroid Build Coastguard Worker 
2544*77c1e3ccSAndroid Build Coastguard Worker         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2545*77c1e3ccSAndroid Build Coastguard Worker         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2546*77c1e3ccSAndroid Build Coastguard Worker 
2547*77c1e3ccSAndroid Build Coastguard Worker         do {
2548*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2549*77c1e3ccSAndroid Build Coastguard Worker           const __m128i res = y_convolve_6tap_4x2_ssse3(
2550*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_128, s_32, ss_128);
2551*77c1e3ccSAndroid Build Coastguard Worker           const __m128i r = sr_y_round_sse2(res);
2552*77c1e3ccSAndroid Build Coastguard Worker           pack_store_4x2_sse2(r, dst, dst_stride);
2553*77c1e3ccSAndroid Build Coastguard Worker 
2554*77c1e3ccSAndroid Build Coastguard Worker           ss_128[0] = ss_128[1];
2555*77c1e3ccSAndroid Build Coastguard Worker           ss_128[1] = ss_128[2];
2556*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2557*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2558*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2559*77c1e3ccSAndroid Build Coastguard Worker       }
2560*77c1e3ccSAndroid Build Coastguard Worker     } else {
2561*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2562*77c1e3ccSAndroid Build Coastguard Worker 
2563*77c1e3ccSAndroid Build Coastguard Worker       if (w == 8) {
2564*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_64[6];
2565*77c1e3ccSAndroid Build Coastguard Worker         __m256i ss_256[3];
2566*77c1e3ccSAndroid Build Coastguard Worker 
2567*77c1e3ccSAndroid Build Coastguard Worker         s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2568*77c1e3ccSAndroid Build Coastguard Worker         s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2569*77c1e3ccSAndroid Build Coastguard Worker         s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2570*77c1e3ccSAndroid Build Coastguard Worker         s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2571*77c1e3ccSAndroid Build Coastguard Worker         s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2572*77c1e3ccSAndroid Build Coastguard Worker 
2573*77c1e3ccSAndroid Build Coastguard Worker         // Load lines a and b. Line a to lower 128, line b to upper 128
2574*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2575*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2576*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2577*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2578*77c1e3ccSAndroid Build Coastguard Worker 
2579*77c1e3ccSAndroid Build Coastguard Worker         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2580*77c1e3ccSAndroid Build Coastguard Worker         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2581*77c1e3ccSAndroid Build Coastguard Worker 
2582*77c1e3ccSAndroid Build Coastguard Worker         y = h;
2583*77c1e3ccSAndroid Build Coastguard Worker         do {
2584*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2585*77c1e3ccSAndroid Build Coastguard Worker           const __m256i res = y_convolve_6tap_8x2_avx2(
2586*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_256, s_64, ss_256);
2587*77c1e3ccSAndroid Build Coastguard Worker           sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2588*77c1e3ccSAndroid Build Coastguard Worker 
2589*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = ss_256[1];
2590*77c1e3ccSAndroid Build Coastguard Worker           ss_256[1] = ss_256[2];
2591*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2592*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2593*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2594*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 16) {
2595*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_128[6];
2596*77c1e3ccSAndroid Build Coastguard Worker         __m256i ss_256[6], r[2];
2597*77c1e3ccSAndroid Build Coastguard Worker 
2598*77c1e3ccSAndroid Build Coastguard Worker         s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2599*77c1e3ccSAndroid Build Coastguard Worker         s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2600*77c1e3ccSAndroid Build Coastguard Worker         s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2601*77c1e3ccSAndroid Build Coastguard Worker         s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2602*77c1e3ccSAndroid Build Coastguard Worker         s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2603*77c1e3ccSAndroid Build Coastguard Worker 
2604*77c1e3ccSAndroid Build Coastguard Worker         // Load lines a and b. Line a to lower 128, line b to upper 128
2605*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2606*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2607*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2608*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2609*77c1e3ccSAndroid Build Coastguard Worker 
2610*77c1e3ccSAndroid Build Coastguard Worker         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2611*77c1e3ccSAndroid Build Coastguard Worker         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2612*77c1e3ccSAndroid Build Coastguard Worker 
2613*77c1e3ccSAndroid Build Coastguard Worker         ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
2614*77c1e3ccSAndroid Build Coastguard Worker         ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
2615*77c1e3ccSAndroid Build Coastguard Worker 
2616*77c1e3ccSAndroid Build Coastguard Worker         y = h;
2617*77c1e3ccSAndroid Build Coastguard Worker         do {
2618*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2619*77c1e3ccSAndroid Build Coastguard Worker           y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2620*77c1e3ccSAndroid Build Coastguard Worker                                     ss_256, r);
2621*77c1e3ccSAndroid Build Coastguard Worker           sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2622*77c1e3ccSAndroid Build Coastguard Worker 
2623*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = ss_256[1];
2624*77c1e3ccSAndroid Build Coastguard Worker           ss_256[1] = ss_256[2];
2625*77c1e3ccSAndroid Build Coastguard Worker 
2626*77c1e3ccSAndroid Build Coastguard Worker           ss_256[3] = ss_256[4];
2627*77c1e3ccSAndroid Build Coastguard Worker           ss_256[4] = ss_256[5];
2628*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2629*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2630*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2631*77c1e3ccSAndroid Build Coastguard Worker       } else {
2632*77c1e3ccSAndroid Build Coastguard Worker         __m256i s_256[6], ss_256[6], tt_256[6], r[4];
2633*77c1e3ccSAndroid Build Coastguard Worker 
2634*77c1e3ccSAndroid Build Coastguard Worker         assert(!(w % 32));
2635*77c1e3ccSAndroid Build Coastguard Worker 
2636*77c1e3ccSAndroid Build Coastguard Worker         x = 0;
2637*77c1e3ccSAndroid Build Coastguard Worker         do {
2638*77c1e3ccSAndroid Build Coastguard Worker           const uint8_t *s = src_ptr + x;
2639*77c1e3ccSAndroid Build Coastguard Worker           uint8_t *d = dst + x;
2640*77c1e3ccSAndroid Build Coastguard Worker 
2641*77c1e3ccSAndroid Build Coastguard Worker           s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2642*77c1e3ccSAndroid Build Coastguard Worker           s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2643*77c1e3ccSAndroid Build Coastguard Worker           s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2644*77c1e3ccSAndroid Build Coastguard Worker           s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2645*77c1e3ccSAndroid Build Coastguard Worker           s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2646*77c1e3ccSAndroid Build Coastguard Worker 
2647*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2648*77c1e3ccSAndroid Build Coastguard Worker           ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2649*77c1e3ccSAndroid Build Coastguard Worker           ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2650*77c1e3ccSAndroid Build Coastguard Worker           ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2651*77c1e3ccSAndroid Build Coastguard Worker 
2652*77c1e3ccSAndroid Build Coastguard Worker           tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2653*77c1e3ccSAndroid Build Coastguard Worker           tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2654*77c1e3ccSAndroid Build Coastguard Worker           tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2655*77c1e3ccSAndroid Build Coastguard Worker           tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2656*77c1e3ccSAndroid Build Coastguard Worker 
2657*77c1e3ccSAndroid Build Coastguard Worker           y = h;
2658*77c1e3ccSAndroid Build Coastguard Worker           do {
2659*77c1e3ccSAndroid Build Coastguard Worker             s += 2 * src_stride;
2660*77c1e3ccSAndroid Build Coastguard Worker             y_convolve_6tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2661*77c1e3ccSAndroid Build Coastguard Worker                                       tt_256, r);
2662*77c1e3ccSAndroid Build Coastguard Worker             sr_y_round_store_32x2_avx2(r, d, dst_stride);
2663*77c1e3ccSAndroid Build Coastguard Worker 
2664*77c1e3ccSAndroid Build Coastguard Worker             ss_256[0] = ss_256[1];
2665*77c1e3ccSAndroid Build Coastguard Worker             ss_256[1] = ss_256[2];
2666*77c1e3ccSAndroid Build Coastguard Worker             ss_256[3] = ss_256[4];
2667*77c1e3ccSAndroid Build Coastguard Worker             ss_256[4] = ss_256[5];
2668*77c1e3ccSAndroid Build Coastguard Worker 
2669*77c1e3ccSAndroid Build Coastguard Worker             tt_256[0] = tt_256[1];
2670*77c1e3ccSAndroid Build Coastguard Worker             tt_256[1] = tt_256[2];
2671*77c1e3ccSAndroid Build Coastguard Worker             tt_256[3] = tt_256[4];
2672*77c1e3ccSAndroid Build Coastguard Worker             tt_256[4] = tt_256[5];
2673*77c1e3ccSAndroid Build Coastguard Worker             d += 2 * dst_stride;
2674*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2675*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2676*77c1e3ccSAndroid Build Coastguard Worker 
2677*77c1e3ccSAndroid Build Coastguard Worker           x += 32;
2678*77c1e3ccSAndroid Build Coastguard Worker         } while (x < w);
2679*77c1e3ccSAndroid Build Coastguard Worker       }
2680*77c1e3ccSAndroid Build Coastguard Worker     }
2681*77c1e3ccSAndroid Build Coastguard Worker   } else if (vert_tap == 8) {
2682*77c1e3ccSAndroid Build Coastguard Worker     // vert_filt as 8 tap
2683*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src_ptr = src - 3 * src_stride;
2684*77c1e3ccSAndroid Build Coastguard Worker 
2685*77c1e3ccSAndroid Build Coastguard Worker     if (w <= 4) {
2686*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2687*77c1e3ccSAndroid Build Coastguard Worker 
2688*77c1e3ccSAndroid Build Coastguard Worker       y = h;
2689*77c1e3ccSAndroid Build Coastguard Worker 
2690*77c1e3ccSAndroid Build Coastguard Worker       if (w == 2) {
2691*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_16[8], ss_128[4];
2692*77c1e3ccSAndroid Build Coastguard Worker 
2693*77c1e3ccSAndroid Build Coastguard Worker         s_16[0] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 0 * src_stride));
2694*77c1e3ccSAndroid Build Coastguard Worker         s_16[1] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 1 * src_stride));
2695*77c1e3ccSAndroid Build Coastguard Worker         s_16[2] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 2 * src_stride));
2696*77c1e3ccSAndroid Build Coastguard Worker         s_16[3] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 3 * src_stride));
2697*77c1e3ccSAndroid Build Coastguard Worker         s_16[4] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 4 * src_stride));
2698*77c1e3ccSAndroid Build Coastguard Worker         s_16[5] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 5 * src_stride));
2699*77c1e3ccSAndroid Build Coastguard Worker         s_16[6] = _mm_cvtsi32_si128(loadu_int16(src_ptr + 6 * src_stride));
2700*77c1e3ccSAndroid Build Coastguard Worker 
2701*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2702*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2703*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2704*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2705*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2706*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2707*77c1e3ccSAndroid Build Coastguard Worker 
2708*77c1e3ccSAndroid Build Coastguard Worker         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2709*77c1e3ccSAndroid Build Coastguard Worker         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2710*77c1e3ccSAndroid Build Coastguard Worker         ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2711*77c1e3ccSAndroid Build Coastguard Worker 
2712*77c1e3ccSAndroid Build Coastguard Worker         do {
2713*77c1e3ccSAndroid Build Coastguard Worker           const __m128i res = y_convolve_8tap_2x2_ssse3(
2714*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_128, s_16, ss_128);
2715*77c1e3ccSAndroid Build Coastguard Worker           const __m128i r = sr_y_round_sse2(res);
2716*77c1e3ccSAndroid Build Coastguard Worker           pack_store_2x2_sse2(r, dst, dst_stride);
2717*77c1e3ccSAndroid Build Coastguard Worker           ss_128[0] = ss_128[1];
2718*77c1e3ccSAndroid Build Coastguard Worker           ss_128[1] = ss_128[2];
2719*77c1e3ccSAndroid Build Coastguard Worker           ss_128[2] = ss_128[3];
2720*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2721*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2722*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2723*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2724*77c1e3ccSAndroid Build Coastguard Worker       } else {
2725*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_32[8], ss_128[4];
2726*77c1e3ccSAndroid Build Coastguard Worker 
2727*77c1e3ccSAndroid Build Coastguard Worker         assert(w == 4);
2728*77c1e3ccSAndroid Build Coastguard Worker 
2729*77c1e3ccSAndroid Build Coastguard Worker         s_32[0] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 0 * src_stride));
2730*77c1e3ccSAndroid Build Coastguard Worker         s_32[1] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 1 * src_stride));
2731*77c1e3ccSAndroid Build Coastguard Worker         s_32[2] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 2 * src_stride));
2732*77c1e3ccSAndroid Build Coastguard Worker         s_32[3] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 3 * src_stride));
2733*77c1e3ccSAndroid Build Coastguard Worker         s_32[4] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 4 * src_stride));
2734*77c1e3ccSAndroid Build Coastguard Worker         s_32[5] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 5 * src_stride));
2735*77c1e3ccSAndroid Build Coastguard Worker         s_32[6] = _mm_cvtsi32_si128(loadu_int32(src_ptr + 6 * src_stride));
2736*77c1e3ccSAndroid Build Coastguard Worker 
2737*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2738*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2739*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2740*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2741*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2742*77c1e3ccSAndroid Build Coastguard Worker         const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2743*77c1e3ccSAndroid Build Coastguard Worker 
2744*77c1e3ccSAndroid Build Coastguard Worker         ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2745*77c1e3ccSAndroid Build Coastguard Worker         ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2746*77c1e3ccSAndroid Build Coastguard Worker         ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2747*77c1e3ccSAndroid Build Coastguard Worker 
2748*77c1e3ccSAndroid Build Coastguard Worker         do {
2749*77c1e3ccSAndroid Build Coastguard Worker           const __m128i res = y_convolve_8tap_4x2_ssse3(
2750*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_128, s_32, ss_128);
2751*77c1e3ccSAndroid Build Coastguard Worker           const __m128i r = sr_y_round_sse2(res);
2752*77c1e3ccSAndroid Build Coastguard Worker           pack_store_4x2_sse2(r, dst, dst_stride);
2753*77c1e3ccSAndroid Build Coastguard Worker           ss_128[0] = ss_128[1];
2754*77c1e3ccSAndroid Build Coastguard Worker           ss_128[1] = ss_128[2];
2755*77c1e3ccSAndroid Build Coastguard Worker           ss_128[2] = ss_128[3];
2756*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2757*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2758*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2759*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2760*77c1e3ccSAndroid Build Coastguard Worker       }
2761*77c1e3ccSAndroid Build Coastguard Worker     } else {
2762*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2763*77c1e3ccSAndroid Build Coastguard Worker 
2764*77c1e3ccSAndroid Build Coastguard Worker       if (w == 8) {
2765*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_64[8];
2766*77c1e3ccSAndroid Build Coastguard Worker         __m256i ss_256[4];
2767*77c1e3ccSAndroid Build Coastguard Worker 
2768*77c1e3ccSAndroid Build Coastguard Worker         s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2769*77c1e3ccSAndroid Build Coastguard Worker         s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2770*77c1e3ccSAndroid Build Coastguard Worker         s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2771*77c1e3ccSAndroid Build Coastguard Worker         s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2772*77c1e3ccSAndroid Build Coastguard Worker         s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2773*77c1e3ccSAndroid Build Coastguard Worker         s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2774*77c1e3ccSAndroid Build Coastguard Worker         s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2775*77c1e3ccSAndroid Build Coastguard Worker 
2776*77c1e3ccSAndroid Build Coastguard Worker         // Load lines a and b. Line a to lower 128, line b to upper 128
2777*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2778*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2779*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2780*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2781*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2782*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2783*77c1e3ccSAndroid Build Coastguard Worker 
2784*77c1e3ccSAndroid Build Coastguard Worker         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2785*77c1e3ccSAndroid Build Coastguard Worker         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2786*77c1e3ccSAndroid Build Coastguard Worker         ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2787*77c1e3ccSAndroid Build Coastguard Worker 
2788*77c1e3ccSAndroid Build Coastguard Worker         y = h;
2789*77c1e3ccSAndroid Build Coastguard Worker         do {
2790*77c1e3ccSAndroid Build Coastguard Worker           const __m256i res = y_convolve_8tap_8x2_avx2(
2791*77c1e3ccSAndroid Build Coastguard Worker               src_ptr, src_stride, coeffs_256, s_64, ss_256);
2792*77c1e3ccSAndroid Build Coastguard Worker           sr_y_round_store_8x2_avx2(res, dst, dst_stride);
2793*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = ss_256[1];
2794*77c1e3ccSAndroid Build Coastguard Worker           ss_256[1] = ss_256[2];
2795*77c1e3ccSAndroid Build Coastguard Worker           ss_256[2] = ss_256[3];
2796*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2797*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2798*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2799*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2800*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 16) {
2801*77c1e3ccSAndroid Build Coastguard Worker         __m128i s_128[8];
2802*77c1e3ccSAndroid Build Coastguard Worker         __m256i ss_256[8], r[2];
2803*77c1e3ccSAndroid Build Coastguard Worker 
2804*77c1e3ccSAndroid Build Coastguard Worker         s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2805*77c1e3ccSAndroid Build Coastguard Worker         s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2806*77c1e3ccSAndroid Build Coastguard Worker         s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2807*77c1e3ccSAndroid Build Coastguard Worker         s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2808*77c1e3ccSAndroid Build Coastguard Worker         s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2809*77c1e3ccSAndroid Build Coastguard Worker         s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2810*77c1e3ccSAndroid Build Coastguard Worker         s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2811*77c1e3ccSAndroid Build Coastguard Worker 
2812*77c1e3ccSAndroid Build Coastguard Worker         // Load lines a and b. Line a to lower 128, line b to upper 128
2813*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2814*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2815*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2816*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2817*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2818*77c1e3ccSAndroid Build Coastguard Worker         const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2819*77c1e3ccSAndroid Build Coastguard Worker 
2820*77c1e3ccSAndroid Build Coastguard Worker         ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2821*77c1e3ccSAndroid Build Coastguard Worker         ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2822*77c1e3ccSAndroid Build Coastguard Worker         ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2823*77c1e3ccSAndroid Build Coastguard Worker 
2824*77c1e3ccSAndroid Build Coastguard Worker         ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2825*77c1e3ccSAndroid Build Coastguard Worker         ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2826*77c1e3ccSAndroid Build Coastguard Worker         ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2827*77c1e3ccSAndroid Build Coastguard Worker 
2828*77c1e3ccSAndroid Build Coastguard Worker         y = h;
2829*77c1e3ccSAndroid Build Coastguard Worker         do {
2830*77c1e3ccSAndroid Build Coastguard Worker           y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128,
2831*77c1e3ccSAndroid Build Coastguard Worker                                     ss_256, r);
2832*77c1e3ccSAndroid Build Coastguard Worker           sr_y_round_store_16x2_avx2(r, dst, dst_stride);
2833*77c1e3ccSAndroid Build Coastguard Worker 
2834*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = ss_256[1];
2835*77c1e3ccSAndroid Build Coastguard Worker           ss_256[1] = ss_256[2];
2836*77c1e3ccSAndroid Build Coastguard Worker           ss_256[2] = ss_256[3];
2837*77c1e3ccSAndroid Build Coastguard Worker 
2838*77c1e3ccSAndroid Build Coastguard Worker           ss_256[4] = ss_256[5];
2839*77c1e3ccSAndroid Build Coastguard Worker           ss_256[5] = ss_256[6];
2840*77c1e3ccSAndroid Build Coastguard Worker           ss_256[6] = ss_256[7];
2841*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
2842*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
2843*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
2844*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
2845*77c1e3ccSAndroid Build Coastguard Worker       } else {
2846*77c1e3ccSAndroid Build Coastguard Worker         __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2847*77c1e3ccSAndroid Build Coastguard Worker 
2848*77c1e3ccSAndroid Build Coastguard Worker         assert(!(w % 32));
2849*77c1e3ccSAndroid Build Coastguard Worker 
2850*77c1e3ccSAndroid Build Coastguard Worker         x = 0;
2851*77c1e3ccSAndroid Build Coastguard Worker         do {
2852*77c1e3ccSAndroid Build Coastguard Worker           const uint8_t *s = src_ptr + x;
2853*77c1e3ccSAndroid Build Coastguard Worker           uint8_t *d = dst + x;
2854*77c1e3ccSAndroid Build Coastguard Worker 
2855*77c1e3ccSAndroid Build Coastguard Worker           s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2856*77c1e3ccSAndroid Build Coastguard Worker           s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2857*77c1e3ccSAndroid Build Coastguard Worker           s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2858*77c1e3ccSAndroid Build Coastguard Worker           s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2859*77c1e3ccSAndroid Build Coastguard Worker           s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2860*77c1e3ccSAndroid Build Coastguard Worker           s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2861*77c1e3ccSAndroid Build Coastguard Worker           s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2862*77c1e3ccSAndroid Build Coastguard Worker 
2863*77c1e3ccSAndroid Build Coastguard Worker           ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2864*77c1e3ccSAndroid Build Coastguard Worker           ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2865*77c1e3ccSAndroid Build Coastguard Worker           ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2866*77c1e3ccSAndroid Build Coastguard Worker           ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2867*77c1e3ccSAndroid Build Coastguard Worker           ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2868*77c1e3ccSAndroid Build Coastguard Worker           ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2869*77c1e3ccSAndroid Build Coastguard Worker 
2870*77c1e3ccSAndroid Build Coastguard Worker           tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2871*77c1e3ccSAndroid Build Coastguard Worker           tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2872*77c1e3ccSAndroid Build Coastguard Worker           tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2873*77c1e3ccSAndroid Build Coastguard Worker           tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2874*77c1e3ccSAndroid Build Coastguard Worker           tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2875*77c1e3ccSAndroid Build Coastguard Worker           tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2876*77c1e3ccSAndroid Build Coastguard Worker 
2877*77c1e3ccSAndroid Build Coastguard Worker           y = h;
2878*77c1e3ccSAndroid Build Coastguard Worker           do {
2879*77c1e3ccSAndroid Build Coastguard Worker             y_convolve_8tap_32x2_avx2(s, src_stride, coeffs_256, s_256, ss_256,
2880*77c1e3ccSAndroid Build Coastguard Worker                                       tt_256, r);
2881*77c1e3ccSAndroid Build Coastguard Worker             sr_y_round_store_32x2_avx2(r, d, dst_stride);
2882*77c1e3ccSAndroid Build Coastguard Worker 
2883*77c1e3ccSAndroid Build Coastguard Worker             ss_256[0] = ss_256[1];
2884*77c1e3ccSAndroid Build Coastguard Worker             ss_256[1] = ss_256[2];
2885*77c1e3ccSAndroid Build Coastguard Worker             ss_256[2] = ss_256[3];
2886*77c1e3ccSAndroid Build Coastguard Worker             ss_256[4] = ss_256[5];
2887*77c1e3ccSAndroid Build Coastguard Worker             ss_256[5] = ss_256[6];
2888*77c1e3ccSAndroid Build Coastguard Worker             ss_256[6] = ss_256[7];
2889*77c1e3ccSAndroid Build Coastguard Worker 
2890*77c1e3ccSAndroid Build Coastguard Worker             tt_256[0] = tt_256[1];
2891*77c1e3ccSAndroid Build Coastguard Worker             tt_256[1] = tt_256[2];
2892*77c1e3ccSAndroid Build Coastguard Worker             tt_256[2] = tt_256[3];
2893*77c1e3ccSAndroid Build Coastguard Worker             tt_256[4] = tt_256[5];
2894*77c1e3ccSAndroid Build Coastguard Worker             tt_256[5] = tt_256[6];
2895*77c1e3ccSAndroid Build Coastguard Worker             tt_256[6] = tt_256[7];
2896*77c1e3ccSAndroid Build Coastguard Worker             s += 2 * src_stride;
2897*77c1e3ccSAndroid Build Coastguard Worker             d += 2 * dst_stride;
2898*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2899*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2900*77c1e3ccSAndroid Build Coastguard Worker 
2901*77c1e3ccSAndroid Build Coastguard Worker           x += 32;
2902*77c1e3ccSAndroid Build Coastguard Worker         } while (x < w);
2903*77c1e3ccSAndroid Build Coastguard Worker       }
2904*77c1e3ccSAndroid Build Coastguard Worker     }
2905*77c1e3ccSAndroid Build Coastguard Worker   }
2906*77c1e3ccSAndroid Build Coastguard Worker }
2907*77c1e3ccSAndroid Build Coastguard Worker 
/*
 * Horizontal 2-tap single-reference filter step for one 32-byte-wide segment
 * of a row.
 *
 *   src    - first input byte of the segment.
 *   coeffs - one vector of filter coefficients, as prepared by the caller.
 *   dst    - first output byte of the segment.
 *
 * x_convolve_2tap_32_avx2() fills a two-vector intermediate from src/coeffs;
 * that intermediate is then handed to sr_x_round_store_32_avx2() together
 * with dst. Callers cover wider blocks by stepping src and dst in units
 * of 32.
 */
static inline void sr_x_2tap_32_avx2(const uint8_t *const src,
                                     const __m256i coeffs[1],
                                     uint8_t *const dst) {
  __m256i filtered[2];

  x_convolve_2tap_32_avx2(src, coeffs, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
2916*77c1e3ccSAndroid Build Coastguard Worker 
/*
 * Horizontal 6-tap single-reference filter step for one segment of a row.
 *
 *   src    - first input byte of the segment.
 *   coeffs - three vectors of filter coefficients.
 *   filt   - three vectors forwarded unchanged to x_convolve_6tap_32_avx2()
 *            (presumably byte-shuffle masks -- confirm against that helper).
 *   dst    - first output byte of the segment.
 *
 * x_convolve_6tap_32_avx2() fills a two-vector intermediate, which is then
 * handed to sr_x_round_store_32_avx2() together with dst.
 */
static inline void sr_x_6tap_32_avx2(const uint8_t *const src,
                                     const __m256i coeffs[3],
                                     const __m256i filt[3],
                                     uint8_t *const dst) {
  __m256i filtered[2];

  x_convolve_6tap_32_avx2(src, coeffs, filt, filtered);
  sr_x_round_store_32_avx2(filtered, dst);
}
2926*77c1e3ccSAndroid Build Coastguard Worker 
sr_x_8tap_32_avx2(const uint8_t * const src,const __m256i coeffs[4],const __m256i filt[4],uint8_t * const dst)2927*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void sr_x_8tap_32_avx2(const uint8_t *const src,
2928*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i coeffs[4],
2929*77c1e3ccSAndroid Build Coastguard Worker                                                const __m256i filt[4],
2930*77c1e3ccSAndroid Build Coastguard Worker                                                uint8_t *const dst) {
2931*77c1e3ccSAndroid Build Coastguard Worker   __m256i r[2];
2932*77c1e3ccSAndroid Build Coastguard Worker 
2933*77c1e3ccSAndroid Build Coastguard Worker   x_convolve_8tap_32_avx2(src, coeffs, filt, r);
2934*77c1e3ccSAndroid Build Coastguard Worker   sr_x_round_store_32_avx2(r, dst);
2935*77c1e3ccSAndroid Build Coastguard Worker }
2936*77c1e3ccSAndroid Build Coastguard Worker 
av1_convolve_x_sr_specialized_avx2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t w,int32_t h,const InterpFilterParams * filter_params_x,const int32_t subpel_x_q4,ConvolveParams * conv_params)2937*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void av1_convolve_x_sr_specialized_avx2(
2938*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
2939*77c1e3ccSAndroid Build Coastguard Worker     int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
2940*77c1e3ccSAndroid Build Coastguard Worker     const int32_t subpel_x_q4, ConvolveParams *conv_params) {
2941*77c1e3ccSAndroid Build Coastguard Worker   int32_t y = h;
2942*77c1e3ccSAndroid Build Coastguard Worker   __m128i coeffs_128[4];
2943*77c1e3ccSAndroid Build Coastguard Worker   __m256i coeffs_256[4];
2944*77c1e3ccSAndroid Build Coastguard Worker 
2945*77c1e3ccSAndroid Build Coastguard Worker   assert(conv_params->round_0 == 3);
2946*77c1e3ccSAndroid Build Coastguard Worker   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
2947*77c1e3ccSAndroid Build Coastguard Worker          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
2948*77c1e3ccSAndroid Build Coastguard Worker   (void)conv_params;
2949*77c1e3ccSAndroid Build Coastguard Worker 
2950*77c1e3ccSAndroid Build Coastguard Worker   const int horz_tap = get_filter_tap(filter_params_x, subpel_x_q4);
2951*77c1e3ccSAndroid Build Coastguard Worker 
2952*77c1e3ccSAndroid Build Coastguard Worker   if (horz_tap == 2) {
2953*77c1e3ccSAndroid Build Coastguard Worker     // horz_filt as 2 tap
2954*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src_ptr = src;
2955*77c1e3ccSAndroid Build Coastguard Worker 
2956*77c1e3ccSAndroid Build Coastguard Worker     if (subpel_x_q4 != 8) {
2957*77c1e3ccSAndroid Build Coastguard Worker       if (w <= 8) {
2958*77c1e3ccSAndroid Build Coastguard Worker         prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4,
2959*77c1e3ccSAndroid Build Coastguard Worker                                        coeffs_128);
2960*77c1e3ccSAndroid Build Coastguard Worker 
2961*77c1e3ccSAndroid Build Coastguard Worker         if (w == 2) {
2962*77c1e3ccSAndroid Build Coastguard Worker           do {
2963*77c1e3ccSAndroid Build Coastguard Worker             const __m128i res =
2964*77c1e3ccSAndroid Build Coastguard Worker                 x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
2965*77c1e3ccSAndroid Build Coastguard Worker             const __m128i r = sr_x_round_sse2(res);
2966*77c1e3ccSAndroid Build Coastguard Worker             pack_store_2x2_sse2(r, dst, dst_stride);
2967*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2968*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2969*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2970*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2971*77c1e3ccSAndroid Build Coastguard Worker         } else if (w == 4) {
2972*77c1e3ccSAndroid Build Coastguard Worker           do {
2973*77c1e3ccSAndroid Build Coastguard Worker             const __m128i res =
2974*77c1e3ccSAndroid Build Coastguard Worker                 x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
2975*77c1e3ccSAndroid Build Coastguard Worker             const __m128i r = sr_x_round_sse2(res);
2976*77c1e3ccSAndroid Build Coastguard Worker             pack_store_4x2_sse2(r, dst, dst_stride);
2977*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2978*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2979*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2980*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2981*77c1e3ccSAndroid Build Coastguard Worker         } else {
2982*77c1e3ccSAndroid Build Coastguard Worker           assert(w == 8);
2983*77c1e3ccSAndroid Build Coastguard Worker 
2984*77c1e3ccSAndroid Build Coastguard Worker           do {
2985*77c1e3ccSAndroid Build Coastguard Worker             __m128i res[2];
2986*77c1e3ccSAndroid Build Coastguard Worker 
2987*77c1e3ccSAndroid Build Coastguard Worker             x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, coeffs_128, res);
2988*77c1e3ccSAndroid Build Coastguard Worker             res[0] = sr_x_round_sse2(res[0]);
2989*77c1e3ccSAndroid Build Coastguard Worker             res[1] = sr_x_round_sse2(res[1]);
2990*77c1e3ccSAndroid Build Coastguard Worker             const __m128i d = _mm_packus_epi16(res[0], res[1]);
2991*77c1e3ccSAndroid Build Coastguard Worker             _mm_storel_epi64((__m128i *)dst, d);
2992*77c1e3ccSAndroid Build Coastguard Worker             _mm_storeh_epi64((__m128i *)(dst + dst_stride), d);
2993*77c1e3ccSAndroid Build Coastguard Worker 
2994*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
2995*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
2996*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
2997*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
2998*77c1e3ccSAndroid Build Coastguard Worker         }
2999*77c1e3ccSAndroid Build Coastguard Worker       } else {
3000*77c1e3ccSAndroid Build Coastguard Worker         prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3001*77c1e3ccSAndroid Build Coastguard Worker 
3002*77c1e3ccSAndroid Build Coastguard Worker         if (w == 16) {
3003*77c1e3ccSAndroid Build Coastguard Worker           do {
3004*77c1e3ccSAndroid Build Coastguard Worker             __m256i r[2];
3005*77c1e3ccSAndroid Build Coastguard Worker 
3006*77c1e3ccSAndroid Build Coastguard Worker             x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
3007*77c1e3ccSAndroid Build Coastguard Worker             sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3008*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += 2 * src_stride;
3009*77c1e3ccSAndroid Build Coastguard Worker             dst += 2 * dst_stride;
3010*77c1e3ccSAndroid Build Coastguard Worker             y -= 2;
3011*77c1e3ccSAndroid Build Coastguard Worker           } while (y);
3012*77c1e3ccSAndroid Build Coastguard Worker         } else if (w == 32) {
3013*77c1e3ccSAndroid Build Coastguard Worker           do {
3014*77c1e3ccSAndroid Build Coastguard Worker             sr_x_2tap_32_avx2(src_ptr, coeffs_256, dst);
3015*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += src_stride;
3016*77c1e3ccSAndroid Build Coastguard Worker             dst += dst_stride;
3017*77c1e3ccSAndroid Build Coastguard Worker           } while (--y);
3018*77c1e3ccSAndroid Build Coastguard Worker         } else if (w == 64) {
3019*77c1e3ccSAndroid Build Coastguard Worker           do {
3020*77c1e3ccSAndroid Build Coastguard Worker             sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3021*77c1e3ccSAndroid Build Coastguard Worker             sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3022*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += src_stride;
3023*77c1e3ccSAndroid Build Coastguard Worker             dst += dst_stride;
3024*77c1e3ccSAndroid Build Coastguard Worker           } while (--y);
3025*77c1e3ccSAndroid Build Coastguard Worker         } else {
3026*77c1e3ccSAndroid Build Coastguard Worker           assert(w == 128);
3027*77c1e3ccSAndroid Build Coastguard Worker 
3028*77c1e3ccSAndroid Build Coastguard Worker           do {
3029*77c1e3ccSAndroid Build Coastguard Worker             sr_x_2tap_32_avx2(src_ptr + 0 * 32, coeffs_256, dst + 0 * 32);
3030*77c1e3ccSAndroid Build Coastguard Worker             sr_x_2tap_32_avx2(src_ptr + 1 * 32, coeffs_256, dst + 1 * 32);
3031*77c1e3ccSAndroid Build Coastguard Worker             sr_x_2tap_32_avx2(src_ptr + 2 * 32, coeffs_256, dst + 2 * 32);
3032*77c1e3ccSAndroid Build Coastguard Worker             sr_x_2tap_32_avx2(src_ptr + 3 * 32, coeffs_256, dst + 3 * 32);
3033*77c1e3ccSAndroid Build Coastguard Worker             src_ptr += src_stride;
3034*77c1e3ccSAndroid Build Coastguard Worker             dst += dst_stride;
3035*77c1e3ccSAndroid Build Coastguard Worker           } while (--y);
3036*77c1e3ccSAndroid Build Coastguard Worker         }
3037*77c1e3ccSAndroid Build Coastguard Worker       }
3038*77c1e3ccSAndroid Build Coastguard Worker     } else {
3039*77c1e3ccSAndroid Build Coastguard Worker       // average to get half pel
3040*77c1e3ccSAndroid Build Coastguard Worker       if (w == 2) {
3041*77c1e3ccSAndroid Build Coastguard Worker         do {
3042*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_128;
3043*77c1e3ccSAndroid Build Coastguard Worker 
3044*77c1e3ccSAndroid Build Coastguard Worker           s_128 = load_u8_4x2_sse4_1(src_ptr, src_stride);
3045*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s1 = _mm_srli_si128(s_128, 1);
3046*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d = _mm_avg_epu8(s_128, s1);
3047*77c1e3ccSAndroid Build Coastguard Worker           *(uint16_t *)dst = (uint16_t)_mm_cvtsi128_si32(d);
3048*77c1e3ccSAndroid Build Coastguard Worker           *(uint16_t *)(dst + dst_stride) = _mm_extract_epi16(d, 2);
3049*77c1e3ccSAndroid Build Coastguard Worker 
3050*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3051*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3052*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3053*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3054*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 4) {
3055*77c1e3ccSAndroid Build Coastguard Worker         do {
3056*77c1e3ccSAndroid Build Coastguard Worker           __m128i s_128;
3057*77c1e3ccSAndroid Build Coastguard Worker 
3058*77c1e3ccSAndroid Build Coastguard Worker           s_128 = load_u8_8x2_sse2(src_ptr, src_stride);
3059*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s1 = _mm_srli_si128(s_128, 1);
3060*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d = _mm_avg_epu8(s_128, s1);
3061*77c1e3ccSAndroid Build Coastguard Worker           xx_storel_32(dst, d);
3062*77c1e3ccSAndroid Build Coastguard Worker           *(int32_t *)(dst + dst_stride) = _mm_extract_epi32(d, 2);
3063*77c1e3ccSAndroid Build Coastguard Worker 
3064*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3065*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3066*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3067*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3068*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 8) {
3069*77c1e3ccSAndroid Build Coastguard Worker         do {
3070*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3071*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s10 =
3072*77c1e3ccSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3073*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s01 = _mm_srli_si128(s00, 1);
3074*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s11 = _mm_srli_si128(s10, 1);
3075*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d0 = _mm_avg_epu8(s00, s01);
3076*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d1 = _mm_avg_epu8(s10, s11);
3077*77c1e3ccSAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i *)dst, d0);
3078*77c1e3ccSAndroid Build Coastguard Worker           _mm_storel_epi64((__m128i *)(dst + dst_stride), d1);
3079*77c1e3ccSAndroid Build Coastguard Worker 
3080*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3081*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3082*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3083*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3084*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 16) {
3085*77c1e3ccSAndroid Build Coastguard Worker         do {
3086*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s00 = _mm_loadu_si128((__m128i *)src_ptr);
3087*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s01 = _mm_loadu_si128((__m128i *)(src_ptr + 1));
3088*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s10 =
3089*77c1e3ccSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)(src_ptr + src_stride));
3090*77c1e3ccSAndroid Build Coastguard Worker           const __m128i s11 =
3091*77c1e3ccSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)(src_ptr + src_stride + 1));
3092*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d0 = _mm_avg_epu8(s00, s01);
3093*77c1e3ccSAndroid Build Coastguard Worker           const __m128i d1 = _mm_avg_epu8(s10, s11);
3094*77c1e3ccSAndroid Build Coastguard Worker           _mm_storeu_si128((__m128i *)dst, d0);
3095*77c1e3ccSAndroid Build Coastguard Worker           _mm_storeu_si128((__m128i *)(dst + dst_stride), d1);
3096*77c1e3ccSAndroid Build Coastguard Worker 
3097*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3098*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3099*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3100*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3101*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 32) {
3102*77c1e3ccSAndroid Build Coastguard Worker         do {
3103*77c1e3ccSAndroid Build Coastguard Worker           sr_x_2tap_32_avg_avx2(src_ptr, dst);
3104*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3105*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3106*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3107*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 64) {
3108*77c1e3ccSAndroid Build Coastguard Worker         do {
3109*77c1e3ccSAndroid Build Coastguard Worker           sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3110*77c1e3ccSAndroid Build Coastguard Worker           sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3111*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3112*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3113*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3114*77c1e3ccSAndroid Build Coastguard Worker       } else {
3115*77c1e3ccSAndroid Build Coastguard Worker         assert(w == 128);
3116*77c1e3ccSAndroid Build Coastguard Worker 
3117*77c1e3ccSAndroid Build Coastguard Worker         do {
3118*77c1e3ccSAndroid Build Coastguard Worker           sr_x_2tap_32_avg_avx2(src_ptr + 0 * 32, dst + 0 * 32);
3119*77c1e3ccSAndroid Build Coastguard Worker           sr_x_2tap_32_avg_avx2(src_ptr + 1 * 32, dst + 1 * 32);
3120*77c1e3ccSAndroid Build Coastguard Worker           sr_x_2tap_32_avg_avx2(src_ptr + 2 * 32, dst + 2 * 32);
3121*77c1e3ccSAndroid Build Coastguard Worker           sr_x_2tap_32_avg_avx2(src_ptr + 3 * 32, dst + 3 * 32);
3122*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3123*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3124*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3125*77c1e3ccSAndroid Build Coastguard Worker       }
3126*77c1e3ccSAndroid Build Coastguard Worker     }
3127*77c1e3ccSAndroid Build Coastguard Worker   } else if (horz_tap == 4) {
3128*77c1e3ccSAndroid Build Coastguard Worker     // horz_filt as 4 tap
3129*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src_ptr = src - 1;
3130*77c1e3ccSAndroid Build Coastguard Worker 
3131*77c1e3ccSAndroid Build Coastguard Worker     prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);
3132*77c1e3ccSAndroid Build Coastguard Worker 
3133*77c1e3ccSAndroid Build Coastguard Worker     if (w == 2) {
3134*77c1e3ccSAndroid Build Coastguard Worker       do {
3135*77c1e3ccSAndroid Build Coastguard Worker         const __m128i res =
3136*77c1e3ccSAndroid Build Coastguard Worker             x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
3137*77c1e3ccSAndroid Build Coastguard Worker         const __m128i r = sr_x_round_sse2(res);
3138*77c1e3ccSAndroid Build Coastguard Worker         pack_store_2x2_sse2(r, dst, dst_stride);
3139*77c1e3ccSAndroid Build Coastguard Worker         src_ptr += 2 * src_stride;
3140*77c1e3ccSAndroid Build Coastguard Worker         dst += 2 * dst_stride;
3141*77c1e3ccSAndroid Build Coastguard Worker         y -= 2;
3142*77c1e3ccSAndroid Build Coastguard Worker       } while (y);
3143*77c1e3ccSAndroid Build Coastguard Worker     } else if (w == 4) {
3144*77c1e3ccSAndroid Build Coastguard Worker       do {
3145*77c1e3ccSAndroid Build Coastguard Worker         const __m128i res =
3146*77c1e3ccSAndroid Build Coastguard Worker             x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
3147*77c1e3ccSAndroid Build Coastguard Worker         const __m128i r = sr_x_round_sse2(res);
3148*77c1e3ccSAndroid Build Coastguard Worker         pack_store_4x2_sse2(r, dst, dst_stride);
3149*77c1e3ccSAndroid Build Coastguard Worker         src_ptr += 2 * src_stride;
3150*77c1e3ccSAndroid Build Coastguard Worker         dst += 2 * dst_stride;
3151*77c1e3ccSAndroid Build Coastguard Worker         y -= 2;
3152*77c1e3ccSAndroid Build Coastguard Worker       } while (y);
3153*77c1e3ccSAndroid Build Coastguard Worker     } else if (w == 8) {
3154*77c1e3ccSAndroid Build Coastguard Worker       // TODO([email protected]): Reuse the old SIMD code here. Need to
3155*77c1e3ccSAndroid Build Coastguard Worker       // rewrite this for better performance later.
3156*77c1e3ccSAndroid Build Coastguard Worker       __m256i filt_256[2];
3157*77c1e3ccSAndroid Build Coastguard Worker       prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3158*77c1e3ccSAndroid Build Coastguard Worker 
3159*77c1e3ccSAndroid Build Coastguard Worker       filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3160*77c1e3ccSAndroid Build Coastguard Worker       filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3161*77c1e3ccSAndroid Build Coastguard Worker       for (int i = 0; i < h; i += 2) {
3162*77c1e3ccSAndroid Build Coastguard Worker         const __m256i data = _mm256_permute2x128_si256(
3163*77c1e3ccSAndroid Build Coastguard Worker             _mm256_castsi128_si256(
3164*77c1e3ccSAndroid Build Coastguard Worker                 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
3165*77c1e3ccSAndroid Build Coastguard Worker             _mm256_castsi128_si256(_mm_loadu_si128(
3166*77c1e3ccSAndroid Build Coastguard Worker                 (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
3167*77c1e3ccSAndroid Build Coastguard Worker             0x20);
3168*77c1e3ccSAndroid Build Coastguard Worker 
3169*77c1e3ccSAndroid Build Coastguard Worker         __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3170*77c1e3ccSAndroid Build Coastguard Worker         res_16b = sr_x_round_avx2(res_16b);
3171*77c1e3ccSAndroid Build Coastguard Worker 
3172*77c1e3ccSAndroid Build Coastguard Worker         __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3173*77c1e3ccSAndroid Build Coastguard Worker 
3174*77c1e3ccSAndroid Build Coastguard Worker         const __m128i res_0 = _mm256_castsi256_si128(res_8b);
3175*77c1e3ccSAndroid Build Coastguard Worker         const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
3176*77c1e3ccSAndroid Build Coastguard Worker 
3177*77c1e3ccSAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
3178*77c1e3ccSAndroid Build Coastguard Worker         _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
3179*77c1e3ccSAndroid Build Coastguard Worker       }
3180*77c1e3ccSAndroid Build Coastguard Worker     } else {
3181*77c1e3ccSAndroid Build Coastguard Worker       assert(!(w % 16));
3182*77c1e3ccSAndroid Build Coastguard Worker       // TODO([email protected]): Reuse the old SIMD code here. Need to
3183*77c1e3ccSAndroid Build Coastguard Worker       // rewrite this for better performance later.
3184*77c1e3ccSAndroid Build Coastguard Worker       __m256i filt_256[2];
3185*77c1e3ccSAndroid Build Coastguard Worker       prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_256);
3186*77c1e3ccSAndroid Build Coastguard Worker       filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3187*77c1e3ccSAndroid Build Coastguard Worker       filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3188*77c1e3ccSAndroid Build Coastguard Worker 
3189*77c1e3ccSAndroid Build Coastguard Worker       for (int i = 0; i < h; ++i) {
3190*77c1e3ccSAndroid Build Coastguard Worker         for (int j = 0; j < w; j += 16) {
3191*77c1e3ccSAndroid Build Coastguard Worker           // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
3192*77c1e3ccSAndroid Build Coastguard Worker           // 18 19 20 21 22 23
3193*77c1e3ccSAndroid Build Coastguard Worker           const __m256i data = _mm256_inserti128_si256(
3194*77c1e3ccSAndroid Build Coastguard Worker               _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
3195*77c1e3ccSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
3196*77c1e3ccSAndroid Build Coastguard Worker               1);
3197*77c1e3ccSAndroid Build Coastguard Worker 
3198*77c1e3ccSAndroid Build Coastguard Worker           __m256i res_16b =
3199*77c1e3ccSAndroid Build Coastguard Worker               convolve_lowbd_x_4tap(data, coeffs_256 + 1, filt_256);
3200*77c1e3ccSAndroid Build Coastguard Worker           res_16b = sr_x_round_avx2(res_16b);
3201*77c1e3ccSAndroid Build Coastguard Worker 
3202*77c1e3ccSAndroid Build Coastguard Worker           /* rounding code */
3203*77c1e3ccSAndroid Build Coastguard Worker           // 8 bit conversion and saturation to uint8
3204*77c1e3ccSAndroid Build Coastguard Worker           __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
3205*77c1e3ccSAndroid Build Coastguard Worker 
3206*77c1e3ccSAndroid Build Coastguard Worker           // Store values into the destination buffer
3207*77c1e3ccSAndroid Build Coastguard Worker           // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
3208*77c1e3ccSAndroid Build Coastguard Worker           res_8b = _mm256_permute4x64_epi64(res_8b, 216);
3209*77c1e3ccSAndroid Build Coastguard Worker           __m128i res = _mm256_castsi256_si128(res_8b);
3210*77c1e3ccSAndroid Build Coastguard Worker           _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
3211*77c1e3ccSAndroid Build Coastguard Worker         }
3212*77c1e3ccSAndroid Build Coastguard Worker       }
3213*77c1e3ccSAndroid Build Coastguard Worker     }
3214*77c1e3ccSAndroid Build Coastguard Worker   } else {
3215*77c1e3ccSAndroid Build Coastguard Worker     __m256i filt_256[4];
3216*77c1e3ccSAndroid Build Coastguard Worker 
3217*77c1e3ccSAndroid Build Coastguard Worker     filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx2);
3218*77c1e3ccSAndroid Build Coastguard Worker     filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx2);
3219*77c1e3ccSAndroid Build Coastguard Worker     filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx2);
3220*77c1e3ccSAndroid Build Coastguard Worker 
3221*77c1e3ccSAndroid Build Coastguard Worker     if (horz_tap == 6) {
3222*77c1e3ccSAndroid Build Coastguard Worker       // horz_filt as 6 tap
3223*77c1e3ccSAndroid Build Coastguard Worker       const uint8_t *src_ptr = src - 2;
3224*77c1e3ccSAndroid Build Coastguard Worker 
3225*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3226*77c1e3ccSAndroid Build Coastguard Worker 
3227*77c1e3ccSAndroid Build Coastguard Worker       if (w == 8) {
3228*77c1e3ccSAndroid Build Coastguard Worker         do {
3229*77c1e3ccSAndroid Build Coastguard Worker           const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride,
3230*77c1e3ccSAndroid Build Coastguard Worker                                                        coeffs_256, filt_256);
3231*77c1e3ccSAndroid Build Coastguard Worker           sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3232*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3233*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3234*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3235*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3236*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 16) {
3237*77c1e3ccSAndroid Build Coastguard Worker         do {
3238*77c1e3ccSAndroid Build Coastguard Worker           __m256i r[2];
3239*77c1e3ccSAndroid Build Coastguard Worker 
3240*77c1e3ccSAndroid Build Coastguard Worker           x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3241*77c1e3ccSAndroid Build Coastguard Worker                                     r);
3242*77c1e3ccSAndroid Build Coastguard Worker           sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3243*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3244*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3245*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3246*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3247*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 32) {
3248*77c1e3ccSAndroid Build Coastguard Worker         do {
3249*77c1e3ccSAndroid Build Coastguard Worker           sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3250*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3251*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3252*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3253*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 64) {
3254*77c1e3ccSAndroid Build Coastguard Worker         do {
3255*77c1e3ccSAndroid Build Coastguard Worker           sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3256*77c1e3ccSAndroid Build Coastguard Worker           sr_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3257*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3258*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3259*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3260*77c1e3ccSAndroid Build Coastguard Worker       } else {
3261*77c1e3ccSAndroid Build Coastguard Worker         assert(w == 128);
3262*77c1e3ccSAndroid Build Coastguard Worker 
3263*77c1e3ccSAndroid Build Coastguard Worker         do {
3264*77c1e3ccSAndroid Build Coastguard Worker           sr_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3265*77c1e3ccSAndroid Build Coastguard Worker           sr_x_6tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3266*77c1e3ccSAndroid Build Coastguard Worker                             dst + 1 * 32);
3267*77c1e3ccSAndroid Build Coastguard Worker           sr_x_6tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3268*77c1e3ccSAndroid Build Coastguard Worker                             dst + 2 * 32);
3269*77c1e3ccSAndroid Build Coastguard Worker           sr_x_6tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3270*77c1e3ccSAndroid Build Coastguard Worker                             dst + 3 * 32);
3271*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3272*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3273*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3274*77c1e3ccSAndroid Build Coastguard Worker       }
3275*77c1e3ccSAndroid Build Coastguard Worker     } else if (horz_tap == 8) {
3276*77c1e3ccSAndroid Build Coastguard Worker       // horz_filt as 8 tap
3277*77c1e3ccSAndroid Build Coastguard Worker       const uint8_t *src_ptr = src - 3;
3278*77c1e3ccSAndroid Build Coastguard Worker 
3279*77c1e3ccSAndroid Build Coastguard Worker       filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx2);
3280*77c1e3ccSAndroid Build Coastguard Worker 
3281*77c1e3ccSAndroid Build Coastguard Worker       prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3282*77c1e3ccSAndroid Build Coastguard Worker 
3283*77c1e3ccSAndroid Build Coastguard Worker       if (w == 8) {
3284*77c1e3ccSAndroid Build Coastguard Worker         do {
3285*77c1e3ccSAndroid Build Coastguard Worker           const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride,
3286*77c1e3ccSAndroid Build Coastguard Worker                                                        coeffs_256, filt_256);
3287*77c1e3ccSAndroid Build Coastguard Worker           sr_x_round_store_8x2_avx2(res, dst, dst_stride);
3288*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3289*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3290*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3291*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3292*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 16) {
3293*77c1e3ccSAndroid Build Coastguard Worker         do {
3294*77c1e3ccSAndroid Build Coastguard Worker           __m256i r[2];
3295*77c1e3ccSAndroid Build Coastguard Worker 
3296*77c1e3ccSAndroid Build Coastguard Worker           x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256,
3297*77c1e3ccSAndroid Build Coastguard Worker                                     r);
3298*77c1e3ccSAndroid Build Coastguard Worker           sr_x_round_store_16x2_avx2(r, dst, dst_stride);
3299*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 2 * src_stride;
3300*77c1e3ccSAndroid Build Coastguard Worker           dst += 2 * dst_stride;
3301*77c1e3ccSAndroid Build Coastguard Worker           y -= 2;
3302*77c1e3ccSAndroid Build Coastguard Worker         } while (y);
3303*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 32) {
3304*77c1e3ccSAndroid Build Coastguard Worker         do {
3305*77c1e3ccSAndroid Build Coastguard Worker           sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3306*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3307*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3308*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3309*77c1e3ccSAndroid Build Coastguard Worker       } else if (w == 64) {
3310*77c1e3ccSAndroid Build Coastguard Worker         do {
3311*77c1e3ccSAndroid Build Coastguard Worker           sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3312*77c1e3ccSAndroid Build Coastguard Worker           sr_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, dst + 32);
3313*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3314*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3315*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3316*77c1e3ccSAndroid Build Coastguard Worker       } else {
3317*77c1e3ccSAndroid Build Coastguard Worker         assert(w == 128);
3318*77c1e3ccSAndroid Build Coastguard Worker 
3319*77c1e3ccSAndroid Build Coastguard Worker         do {
3320*77c1e3ccSAndroid Build Coastguard Worker           sr_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, dst);
3321*77c1e3ccSAndroid Build Coastguard Worker           sr_x_8tap_32_avx2(src_ptr + 1 * 32, coeffs_256, filt_256,
3322*77c1e3ccSAndroid Build Coastguard Worker                             dst + 1 * 32);
3323*77c1e3ccSAndroid Build Coastguard Worker           sr_x_8tap_32_avx2(src_ptr + 2 * 32, coeffs_256, filt_256,
3324*77c1e3ccSAndroid Build Coastguard Worker                             dst + 2 * 32);
3325*77c1e3ccSAndroid Build Coastguard Worker           sr_x_8tap_32_avx2(src_ptr + 3 * 32, coeffs_256, filt_256,
3326*77c1e3ccSAndroid Build Coastguard Worker                             dst + 3 * 32);
3327*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += src_stride;
3328*77c1e3ccSAndroid Build Coastguard Worker           dst += dst_stride;
3329*77c1e3ccSAndroid Build Coastguard Worker         } while (--y);
3330*77c1e3ccSAndroid Build Coastguard Worker       }
3331*77c1e3ccSAndroid Build Coastguard Worker     }
3332*77c1e3ccSAndroid Build Coastguard Worker   }
3333*77c1e3ccSAndroid Build Coastguard Worker }
3334*77c1e3ccSAndroid Build Coastguard Worker 
3335*77c1e3ccSAndroid Build Coastguard Worker #endif  // THIRD_PARTY_SVT_AV1_CONVOLVE_AVX2_H_
3336