xref: /aosp_15_r20/external/libgav1/src/dsp/x86/convolve_sse4.inc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1*09537850SAkhilesh Sanikop// Copyright 2020 The libgav1 Authors
2*09537850SAkhilesh Sanikop//
3*09537850SAkhilesh Sanikop// Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop// you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop// You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop//
7*09537850SAkhilesh Sanikop//      http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop//
9*09537850SAkhilesh Sanikop// Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop// distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop// See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop// limitations under the License.
14*09537850SAkhilesh Sanikop
// Common 128-bit functions shared by the SSE4 and AVX2 convolve
// implementations. This file is meant to be included inside an anonymous
// namespace by each source file that needs these functions.
18*09537850SAkhilesh Sanikop
19*09537850SAkhilesh Sanikop#include "src/dsp/convolve.inc"
20*09537850SAkhilesh Sanikop
// Returns the number of filter taps that are actually used for the given
// |filter_index|. This variant additionally inspects |filter_id| to handle the
// special cases that arise when |filter_index| == 1.
int GetNumTapsInFilter(const int filter_index, const int filter_id) {
  switch (filter_index) {
    case 0:
      // kInterpolationFilterEightTap
      // kInterpolationFilterEightTapSmooth
      // Despite the names these only use 6 taps.
      return 6;
    case 1:
      switch (filter_id) {
        case 1:
        case 7:
        case 8:
        case 9:
        case 15:
          // Only these |filter_id| values use 6 taps.
          return 6;
        default:
          // Every other |filter_id| maps to a 4 tap filter.
          return 4;
      }
    case 2:
      // kInterpolationFilterEightTapSharp
      return 8;
    case 3:
      // kInterpolationFilterBilinear
      return 2;
    default:
      break;
  }

  assert(filter_index > 3);
  // For small sizes (width/height <= 4) the large filters are replaced with 4
  // tap options.
  // If the original filters were |kInterpolationFilterEightTap| or
  // |kInterpolationFilterEightTapSharp| then it becomes
  // |kInterpolationFilterSwitchable|.
  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
  // tap filter.
  return 4;
}
63*09537850SAkhilesh Sanikop
64*09537850SAkhilesh Sanikop// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
65*09537850SAkhilesh Sanikop// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
66*09537850SAkhilesh Sanikop// sum from outranging int16_t.
67*09537850SAkhilesh Sanikoptemplate <int num_taps>
68*09537850SAkhilesh Sanikop__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
69*09537850SAkhilesh Sanikop  __m128i sum;
70*09537850SAkhilesh Sanikop  if (num_taps == 6) {
71*09537850SAkhilesh Sanikop    // 6 taps.
72*09537850SAkhilesh Sanikop    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
73*09537850SAkhilesh Sanikop    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
74*09537850SAkhilesh Sanikop    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
75*09537850SAkhilesh Sanikop    sum = _mm_add_epi16(v_madd_21, v_madd_43);
76*09537850SAkhilesh Sanikop    sum = _mm_add_epi16(sum, v_madd_65);
77*09537850SAkhilesh Sanikop  } else if (num_taps == 8) {
78*09537850SAkhilesh Sanikop    // 8 taps.
79*09537850SAkhilesh Sanikop    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
80*09537850SAkhilesh Sanikop    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
81*09537850SAkhilesh Sanikop    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
82*09537850SAkhilesh Sanikop    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
83*09537850SAkhilesh Sanikop    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
84*09537850SAkhilesh Sanikop    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
85*09537850SAkhilesh Sanikop    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
86*09537850SAkhilesh Sanikop  } else if (num_taps == 2) {
87*09537850SAkhilesh Sanikop    // 2 taps.
88*09537850SAkhilesh Sanikop    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
89*09537850SAkhilesh Sanikop  } else {
90*09537850SAkhilesh Sanikop    // 4 taps.
91*09537850SAkhilesh Sanikop    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
92*09537850SAkhilesh Sanikop    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
93*09537850SAkhilesh Sanikop    sum = _mm_add_epi16(v_madd_32, v_madd_54);
94*09537850SAkhilesh Sanikop  }
95*09537850SAkhilesh Sanikop  return sum;
96*09537850SAkhilesh Sanikop}
97*09537850SAkhilesh Sanikop
98*09537850SAkhilesh Sanikoptemplate <int num_taps>
99*09537850SAkhilesh Sanikop__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
100*09537850SAkhilesh Sanikop                             const __m128i* const v_tap) {
101*09537850SAkhilesh Sanikop  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
102*09537850SAkhilesh Sanikop  const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
103*09537850SAkhilesh Sanikop
104*09537850SAkhilesh Sanikop  if (num_taps == 2) {
105*09537850SAkhilesh Sanikop    // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
106*09537850SAkhilesh Sanikop    const __m128i v_src_43 = _mm_shuffle_epi8(
107*09537850SAkhilesh Sanikop        v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
108*09537850SAkhilesh Sanikop    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
109*09537850SAkhilesh Sanikop    return v_sum_43;
110*09537850SAkhilesh Sanikop  }
111*09537850SAkhilesh Sanikop
112*09537850SAkhilesh Sanikop  // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
113*09537850SAkhilesh Sanikop  const __m128i v_src_32 = _mm_shuffle_epi8(
114*09537850SAkhilesh Sanikop      v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
115*09537850SAkhilesh Sanikop  // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
116*09537850SAkhilesh Sanikop  const __m128i v_src_54 = _mm_shuffle_epi8(
117*09537850SAkhilesh Sanikop      v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
118*09537850SAkhilesh Sanikop                           static_cast<int>(0x80070706), 0x06050504));
119*09537850SAkhilesh Sanikop  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
120*09537850SAkhilesh Sanikop  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
121*09537850SAkhilesh Sanikop  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
122*09537850SAkhilesh Sanikop  return v_sum_5432;
123*09537850SAkhilesh Sanikop}
124*09537850SAkhilesh Sanikop
125*09537850SAkhilesh Sanikoptemplate <int num_taps>
126*09537850SAkhilesh Sanikop__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
127*09537850SAkhilesh Sanikop                                const __m128i* const v_tap) {
128*09537850SAkhilesh Sanikop  __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
129*09537850SAkhilesh Sanikop
130*09537850SAkhilesh Sanikop  // Normally the Horizontal pass does the downshift in two passes:
131*09537850SAkhilesh Sanikop  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
132*09537850SAkhilesh Sanikop  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
133*09537850SAkhilesh Sanikop  // requires adding the rounding offset from the skipped shift.
134*09537850SAkhilesh Sanikop  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
135*09537850SAkhilesh Sanikop
136*09537850SAkhilesh Sanikop  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
137*09537850SAkhilesh Sanikop  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
138*09537850SAkhilesh Sanikop  return _mm_packus_epi16(sum, sum);
139*09537850SAkhilesh Sanikop}
140*09537850SAkhilesh Sanikop
141*09537850SAkhilesh Sanikoptemplate <int num_taps>
142*09537850SAkhilesh Sanikop__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
143*09537850SAkhilesh Sanikop                                const __m128i* const v_tap) {
144*09537850SAkhilesh Sanikop  const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
145*09537850SAkhilesh Sanikop
146*09537850SAkhilesh Sanikop  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
147*09537850SAkhilesh Sanikop}
148*09537850SAkhilesh Sanikop
149*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_2d_vertical = false>
150*09537850SAkhilesh SanikopLIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
151*09537850SAkhilesh Sanikop                                     __m128i* v_tap) {
152*09537850SAkhilesh Sanikop  if (num_taps == 8) {
153*09537850SAkhilesh Sanikop    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
154*09537850SAkhilesh Sanikop    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
155*09537850SAkhilesh Sanikop    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
156*09537850SAkhilesh Sanikop    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
157*09537850SAkhilesh Sanikop    if (is_2d_vertical) {
158*09537850SAkhilesh Sanikop      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
159*09537850SAkhilesh Sanikop      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
160*09537850SAkhilesh Sanikop      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
161*09537850SAkhilesh Sanikop      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
162*09537850SAkhilesh Sanikop    } else {
163*09537850SAkhilesh Sanikop      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
164*09537850SAkhilesh Sanikop      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
165*09537850SAkhilesh Sanikop      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
166*09537850SAkhilesh Sanikop      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
167*09537850SAkhilesh Sanikop    }
168*09537850SAkhilesh Sanikop  } else if (num_taps == 6) {
169*09537850SAkhilesh Sanikop    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
170*09537850SAkhilesh Sanikop    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
171*09537850SAkhilesh Sanikop    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
172*09537850SAkhilesh Sanikop    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
173*09537850SAkhilesh Sanikop    if (is_2d_vertical) {
174*09537850SAkhilesh Sanikop      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
175*09537850SAkhilesh Sanikop      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
176*09537850SAkhilesh Sanikop      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
177*09537850SAkhilesh Sanikop    } else {
178*09537850SAkhilesh Sanikop      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
179*09537850SAkhilesh Sanikop      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
180*09537850SAkhilesh Sanikop      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
181*09537850SAkhilesh Sanikop    }
182*09537850SAkhilesh Sanikop  } else if (num_taps == 4) {
183*09537850SAkhilesh Sanikop    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
184*09537850SAkhilesh Sanikop    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
185*09537850SAkhilesh Sanikop    if (is_2d_vertical) {
186*09537850SAkhilesh Sanikop      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
187*09537850SAkhilesh Sanikop      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
188*09537850SAkhilesh Sanikop    } else {
189*09537850SAkhilesh Sanikop      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
190*09537850SAkhilesh Sanikop      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
191*09537850SAkhilesh Sanikop    }
192*09537850SAkhilesh Sanikop  } else {  // num_taps == 2
193*09537850SAkhilesh Sanikop    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
194*09537850SAkhilesh Sanikop    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
195*09537850SAkhilesh Sanikop    if (is_2d_vertical) {
196*09537850SAkhilesh Sanikop      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
197*09537850SAkhilesh Sanikop    } else {
198*09537850SAkhilesh Sanikop      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
199*09537850SAkhilesh Sanikop    }
200*09537850SAkhilesh Sanikop  }
201*09537850SAkhilesh Sanikop}
202*09537850SAkhilesh Sanikop
203*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound>
204*09537850SAkhilesh Sanikop__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
205*09537850SAkhilesh Sanikop                                const __m128i* const taps) {
206*09537850SAkhilesh Sanikop  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
207*09537850SAkhilesh Sanikop  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
208*09537850SAkhilesh Sanikop  if (num_taps >= 4) {
209*09537850SAkhilesh Sanikop    __m128i madd_lo =
210*09537850SAkhilesh Sanikop        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
211*09537850SAkhilesh Sanikop    __m128i madd_hi =
212*09537850SAkhilesh Sanikop        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
213*09537850SAkhilesh Sanikop    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
214*09537850SAkhilesh Sanikop    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
215*09537850SAkhilesh Sanikop    if (num_taps >= 6) {
216*09537850SAkhilesh Sanikop      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
217*09537850SAkhilesh Sanikop      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
218*09537850SAkhilesh Sanikop      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
219*09537850SAkhilesh Sanikop      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
220*09537850SAkhilesh Sanikop      if (num_taps == 8) {
221*09537850SAkhilesh Sanikop        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
222*09537850SAkhilesh Sanikop        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
223*09537850SAkhilesh Sanikop        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
224*09537850SAkhilesh Sanikop        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
225*09537850SAkhilesh Sanikop      }
226*09537850SAkhilesh Sanikop    }
227*09537850SAkhilesh Sanikop  }
228*09537850SAkhilesh Sanikop
229*09537850SAkhilesh Sanikop  if (is_compound) {
230*09537850SAkhilesh Sanikop    return _mm_packs_epi32(
231*09537850SAkhilesh Sanikop        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
232*09537850SAkhilesh Sanikop        RightShiftWithRounding_S32(sum_hi,
233*09537850SAkhilesh Sanikop                                   kInterRoundBitsCompoundVertical - 1));
234*09537850SAkhilesh Sanikop  }
235*09537850SAkhilesh Sanikop
236*09537850SAkhilesh Sanikop  return _mm_packs_epi32(
237*09537850SAkhilesh Sanikop      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
238*09537850SAkhilesh Sanikop      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
239*09537850SAkhilesh Sanikop}
240*09537850SAkhilesh Sanikop
241*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound = false>
242*09537850SAkhilesh Sanikopvoid Filter2DVertical(const uint16_t* src, void* const dst,
243*09537850SAkhilesh Sanikop                      const ptrdiff_t dst_stride, const int width,
244*09537850SAkhilesh Sanikop                      const int height, const __m128i* const taps) {
245*09537850SAkhilesh Sanikop  assert(width >= 8);
246*09537850SAkhilesh Sanikop  constexpr int next_row = num_taps - 1;
247*09537850SAkhilesh Sanikop  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
248*09537850SAkhilesh Sanikop  const ptrdiff_t src_stride = width;
249*09537850SAkhilesh Sanikop
250*09537850SAkhilesh Sanikop  auto* dst8 = static_cast<uint8_t*>(dst);
251*09537850SAkhilesh Sanikop  auto* dst16 = static_cast<uint16_t*>(dst);
252*09537850SAkhilesh Sanikop
253*09537850SAkhilesh Sanikop  int x = 0;
254*09537850SAkhilesh Sanikop  do {
255*09537850SAkhilesh Sanikop    __m128i srcs[8];
256*09537850SAkhilesh Sanikop    const uint16_t* src_x = src + x;
257*09537850SAkhilesh Sanikop    srcs[0] = LoadAligned16(src_x);
258*09537850SAkhilesh Sanikop    src_x += src_stride;
259*09537850SAkhilesh Sanikop    if (num_taps >= 4) {
260*09537850SAkhilesh Sanikop      srcs[1] = LoadAligned16(src_x);
261*09537850SAkhilesh Sanikop      src_x += src_stride;
262*09537850SAkhilesh Sanikop      srcs[2] = LoadAligned16(src_x);
263*09537850SAkhilesh Sanikop      src_x += src_stride;
264*09537850SAkhilesh Sanikop      if (num_taps >= 6) {
265*09537850SAkhilesh Sanikop        srcs[3] = LoadAligned16(src_x);
266*09537850SAkhilesh Sanikop        src_x += src_stride;
267*09537850SAkhilesh Sanikop        srcs[4] = LoadAligned16(src_x);
268*09537850SAkhilesh Sanikop        src_x += src_stride;
269*09537850SAkhilesh Sanikop        if (num_taps == 8) {
270*09537850SAkhilesh Sanikop          srcs[5] = LoadAligned16(src_x);
271*09537850SAkhilesh Sanikop          src_x += src_stride;
272*09537850SAkhilesh Sanikop          srcs[6] = LoadAligned16(src_x);
273*09537850SAkhilesh Sanikop          src_x += src_stride;
274*09537850SAkhilesh Sanikop        }
275*09537850SAkhilesh Sanikop      }
276*09537850SAkhilesh Sanikop    }
277*09537850SAkhilesh Sanikop
278*09537850SAkhilesh Sanikop    auto* dst8_x = dst8 + x;
279*09537850SAkhilesh Sanikop    auto* dst16_x = dst16 + x;
280*09537850SAkhilesh Sanikop    int y = height;
281*09537850SAkhilesh Sanikop    do {
282*09537850SAkhilesh Sanikop      srcs[next_row] = LoadAligned16(src_x);
283*09537850SAkhilesh Sanikop      src_x += src_stride;
284*09537850SAkhilesh Sanikop
285*09537850SAkhilesh Sanikop      const __m128i sum =
286*09537850SAkhilesh Sanikop          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
287*09537850SAkhilesh Sanikop      if (is_compound) {
288*09537850SAkhilesh Sanikop        StoreUnaligned16(dst16_x, sum);
289*09537850SAkhilesh Sanikop        dst16_x += dst_stride;
290*09537850SAkhilesh Sanikop      } else {
291*09537850SAkhilesh Sanikop        StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
292*09537850SAkhilesh Sanikop        dst8_x += dst_stride;
293*09537850SAkhilesh Sanikop      }
294*09537850SAkhilesh Sanikop
295*09537850SAkhilesh Sanikop      srcs[0] = srcs[1];
296*09537850SAkhilesh Sanikop      if (num_taps >= 4) {
297*09537850SAkhilesh Sanikop        srcs[1] = srcs[2];
298*09537850SAkhilesh Sanikop        srcs[2] = srcs[3];
299*09537850SAkhilesh Sanikop        if (num_taps >= 6) {
300*09537850SAkhilesh Sanikop          srcs[3] = srcs[4];
301*09537850SAkhilesh Sanikop          srcs[4] = srcs[5];
302*09537850SAkhilesh Sanikop          if (num_taps == 8) {
303*09537850SAkhilesh Sanikop            srcs[5] = srcs[6];
304*09537850SAkhilesh Sanikop            srcs[6] = srcs[7];
305*09537850SAkhilesh Sanikop          }
306*09537850SAkhilesh Sanikop        }
307*09537850SAkhilesh Sanikop      }
308*09537850SAkhilesh Sanikop    } while (--y != 0);
309*09537850SAkhilesh Sanikop    x += 8;
310*09537850SAkhilesh Sanikop  } while (x < width);
311*09537850SAkhilesh Sanikop}
312*09537850SAkhilesh Sanikop
313*09537850SAkhilesh Sanikop// Take advantage of |src_stride| == |width| to process two rows at a time.
314*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound = false>
315*09537850SAkhilesh Sanikopvoid Filter2DVertical4xH(const uint16_t* src, void* const dst,
316*09537850SAkhilesh Sanikop                         const ptrdiff_t dst_stride, const int height,
317*09537850SAkhilesh Sanikop                         const __m128i* const taps) {
318*09537850SAkhilesh Sanikop  auto* dst8 = static_cast<uint8_t*>(dst);
319*09537850SAkhilesh Sanikop  auto* dst16 = static_cast<uint16_t*>(dst);
320*09537850SAkhilesh Sanikop
321*09537850SAkhilesh Sanikop  __m128i srcs[9];
322*09537850SAkhilesh Sanikop  srcs[0] = LoadAligned16(src);
323*09537850SAkhilesh Sanikop  src += 8;
324*09537850SAkhilesh Sanikop  if (num_taps >= 4) {
325*09537850SAkhilesh Sanikop    srcs[2] = LoadAligned16(src);
326*09537850SAkhilesh Sanikop    src += 8;
327*09537850SAkhilesh Sanikop    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
328*09537850SAkhilesh Sanikop    if (num_taps >= 6) {
329*09537850SAkhilesh Sanikop      srcs[4] = LoadAligned16(src);
330*09537850SAkhilesh Sanikop      src += 8;
331*09537850SAkhilesh Sanikop      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
332*09537850SAkhilesh Sanikop      if (num_taps == 8) {
333*09537850SAkhilesh Sanikop        srcs[6] = LoadAligned16(src);
334*09537850SAkhilesh Sanikop        src += 8;
335*09537850SAkhilesh Sanikop        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
336*09537850SAkhilesh Sanikop      }
337*09537850SAkhilesh Sanikop    }
338*09537850SAkhilesh Sanikop  }
339*09537850SAkhilesh Sanikop
340*09537850SAkhilesh Sanikop  int y = height;
341*09537850SAkhilesh Sanikop  do {
342*09537850SAkhilesh Sanikop    srcs[num_taps] = LoadAligned16(src);
343*09537850SAkhilesh Sanikop    src += 8;
344*09537850SAkhilesh Sanikop    srcs[num_taps - 1] = _mm_unpacklo_epi64(
345*09537850SAkhilesh Sanikop        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
346*09537850SAkhilesh Sanikop
347*09537850SAkhilesh Sanikop    const __m128i sum =
348*09537850SAkhilesh Sanikop        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
349*09537850SAkhilesh Sanikop    if (is_compound) {
350*09537850SAkhilesh Sanikop      StoreUnaligned16(dst16, sum);
351*09537850SAkhilesh Sanikop      dst16 += 4 << 1;
352*09537850SAkhilesh Sanikop    } else {
353*09537850SAkhilesh Sanikop      const __m128i results = _mm_packus_epi16(sum, sum);
354*09537850SAkhilesh Sanikop      Store4(dst8, results);
355*09537850SAkhilesh Sanikop      dst8 += dst_stride;
356*09537850SAkhilesh Sanikop      Store4(dst8, _mm_srli_si128(results, 4));
357*09537850SAkhilesh Sanikop      dst8 += dst_stride;
358*09537850SAkhilesh Sanikop    }
359*09537850SAkhilesh Sanikop
360*09537850SAkhilesh Sanikop    srcs[0] = srcs[2];
361*09537850SAkhilesh Sanikop    if (num_taps >= 4) {
362*09537850SAkhilesh Sanikop      srcs[1] = srcs[3];
363*09537850SAkhilesh Sanikop      srcs[2] = srcs[4];
364*09537850SAkhilesh Sanikop      if (num_taps >= 6) {
365*09537850SAkhilesh Sanikop        srcs[3] = srcs[5];
366*09537850SAkhilesh Sanikop        srcs[4] = srcs[6];
367*09537850SAkhilesh Sanikop        if (num_taps == 8) {
368*09537850SAkhilesh Sanikop          srcs[5] = srcs[7];
369*09537850SAkhilesh Sanikop          srcs[6] = srcs[8];
370*09537850SAkhilesh Sanikop        }
371*09537850SAkhilesh Sanikop      }
372*09537850SAkhilesh Sanikop    }
373*09537850SAkhilesh Sanikop    y -= 2;
374*09537850SAkhilesh Sanikop  } while (y != 0);
375*09537850SAkhilesh Sanikop}
376*09537850SAkhilesh Sanikop
377*09537850SAkhilesh Sanikop// Take advantage of |src_stride| == |width| to process four rows at a time.
378*09537850SAkhilesh Sanikoptemplate <int num_taps>
379*09537850SAkhilesh Sanikopvoid Filter2DVertical2xH(const uint16_t* src, void* const dst,
380*09537850SAkhilesh Sanikop                         const ptrdiff_t dst_stride, const int height,
381*09537850SAkhilesh Sanikop                         const __m128i* const taps) {
382*09537850SAkhilesh Sanikop  constexpr int next_row = (num_taps < 6) ? 4 : 8;
383*09537850SAkhilesh Sanikop
384*09537850SAkhilesh Sanikop  auto* dst8 = static_cast<uint8_t*>(dst);
385*09537850SAkhilesh Sanikop
386*09537850SAkhilesh Sanikop  __m128i srcs[9];
387*09537850SAkhilesh Sanikop  srcs[0] = LoadAligned16(src);
388*09537850SAkhilesh Sanikop  src += 8;
389*09537850SAkhilesh Sanikop  if (num_taps >= 6) {
390*09537850SAkhilesh Sanikop    srcs[4] = LoadAligned16(src);
391*09537850SAkhilesh Sanikop    src += 8;
392*09537850SAkhilesh Sanikop    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
393*09537850SAkhilesh Sanikop    if (num_taps == 8) {
394*09537850SAkhilesh Sanikop      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
395*09537850SAkhilesh Sanikop      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
396*09537850SAkhilesh Sanikop    }
397*09537850SAkhilesh Sanikop  }
398*09537850SAkhilesh Sanikop
399*09537850SAkhilesh Sanikop  int y = height;
400*09537850SAkhilesh Sanikop  do {
401*09537850SAkhilesh Sanikop    srcs[next_row] = LoadAligned16(src);
402*09537850SAkhilesh Sanikop    src += 8;
403*09537850SAkhilesh Sanikop    if (num_taps == 2) {
404*09537850SAkhilesh Sanikop      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
405*09537850SAkhilesh Sanikop    } else if (num_taps == 4) {
406*09537850SAkhilesh Sanikop      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
407*09537850SAkhilesh Sanikop      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
408*09537850SAkhilesh Sanikop      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
409*09537850SAkhilesh Sanikop    } else if (num_taps == 6) {
410*09537850SAkhilesh Sanikop      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
411*09537850SAkhilesh Sanikop      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
412*09537850SAkhilesh Sanikop      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
413*09537850SAkhilesh Sanikop    } else if (num_taps == 8) {
414*09537850SAkhilesh Sanikop      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
415*09537850SAkhilesh Sanikop      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
416*09537850SAkhilesh Sanikop      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
417*09537850SAkhilesh Sanikop    }
418*09537850SAkhilesh Sanikop
419*09537850SAkhilesh Sanikop    const __m128i sum =
420*09537850SAkhilesh Sanikop        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
421*09537850SAkhilesh Sanikop    const __m128i results = _mm_packus_epi16(sum, sum);
422*09537850SAkhilesh Sanikop
423*09537850SAkhilesh Sanikop    Store2(dst8, results);
424*09537850SAkhilesh Sanikop    dst8 += dst_stride;
425*09537850SAkhilesh Sanikop    Store2(dst8, _mm_srli_si128(results, 2));
426*09537850SAkhilesh Sanikop    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
427*09537850SAkhilesh Sanikop    // Therefore we don't need to check this condition when |height| > 4.
428*09537850SAkhilesh Sanikop    if (num_taps <= 4 && height == 2) return;
429*09537850SAkhilesh Sanikop    dst8 += dst_stride;
430*09537850SAkhilesh Sanikop    Store2(dst8, _mm_srli_si128(results, 4));
431*09537850SAkhilesh Sanikop    dst8 += dst_stride;
432*09537850SAkhilesh Sanikop    Store2(dst8, _mm_srli_si128(results, 6));
433*09537850SAkhilesh Sanikop    dst8 += dst_stride;
434*09537850SAkhilesh Sanikop
435*09537850SAkhilesh Sanikop    srcs[0] = srcs[4];
436*09537850SAkhilesh Sanikop    if (num_taps == 6) {
437*09537850SAkhilesh Sanikop      srcs[1] = srcs[5];
438*09537850SAkhilesh Sanikop      srcs[4] = srcs[8];
439*09537850SAkhilesh Sanikop    } else if (num_taps == 8) {
440*09537850SAkhilesh Sanikop      srcs[1] = srcs[5];
441*09537850SAkhilesh Sanikop      srcs[2] = srcs[6];
442*09537850SAkhilesh Sanikop      srcs[3] = srcs[7];
443*09537850SAkhilesh Sanikop      srcs[4] = srcs[8];
444*09537850SAkhilesh Sanikop    }
445*09537850SAkhilesh Sanikop
446*09537850SAkhilesh Sanikop    y -= 4;
447*09537850SAkhilesh Sanikop  } while (y != 0);
448*09537850SAkhilesh Sanikop}
449*09537850SAkhilesh Sanikop
450*09537850SAkhilesh Sanikop// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
451*09537850SAkhilesh Sanikop// Vertical calculations.
452*09537850SAkhilesh Sanikop__m128i Compound1DShift(const __m128i sum) {
453*09537850SAkhilesh Sanikop  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
454*09537850SAkhilesh Sanikop}
455*09537850SAkhilesh Sanikop
456*09537850SAkhilesh Sanikoptemplate <int num_taps>
457*09537850SAkhilesh Sanikop__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
458*09537850SAkhilesh Sanikop  __m128i v_src[4];
459*09537850SAkhilesh Sanikop
460*09537850SAkhilesh Sanikop  if (num_taps == 6) {
461*09537850SAkhilesh Sanikop    // 6 taps.
462*09537850SAkhilesh Sanikop    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
463*09537850SAkhilesh Sanikop    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
464*09537850SAkhilesh Sanikop    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
465*09537850SAkhilesh Sanikop  } else if (num_taps == 8) {
466*09537850SAkhilesh Sanikop    // 8 taps.
467*09537850SAkhilesh Sanikop    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
468*09537850SAkhilesh Sanikop    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
469*09537850SAkhilesh Sanikop    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
470*09537850SAkhilesh Sanikop    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
471*09537850SAkhilesh Sanikop  } else if (num_taps == 2) {
472*09537850SAkhilesh Sanikop    // 2 taps.
473*09537850SAkhilesh Sanikop    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
474*09537850SAkhilesh Sanikop  } else {
475*09537850SAkhilesh Sanikop    // 4 taps.
476*09537850SAkhilesh Sanikop    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
477*09537850SAkhilesh Sanikop    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
478*09537850SAkhilesh Sanikop  }
479*09537850SAkhilesh Sanikop  const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
480*09537850SAkhilesh Sanikop  return sum;
481*09537850SAkhilesh Sanikop}
482*09537850SAkhilesh Sanikop
483*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound = false>
484*09537850SAkhilesh Sanikopvoid FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
485*09537850SAkhilesh Sanikop                       void* const dst, const ptrdiff_t dst_stride,
486*09537850SAkhilesh Sanikop                       const int height, const __m128i* const v_tap) {
487*09537850SAkhilesh Sanikop  auto* dst8 = static_cast<uint8_t*>(dst);
488*09537850SAkhilesh Sanikop  auto* dst16 = static_cast<uint16_t*>(dst);
489*09537850SAkhilesh Sanikop
490*09537850SAkhilesh Sanikop  __m128i srcs[9];
491*09537850SAkhilesh Sanikop
492*09537850SAkhilesh Sanikop  if (num_taps == 2) {
493*09537850SAkhilesh Sanikop    srcs[2] = _mm_setzero_si128();
494*09537850SAkhilesh Sanikop    // 00 01 02 03
495*09537850SAkhilesh Sanikop    srcs[0] = Load4(src);
496*09537850SAkhilesh Sanikop    src += src_stride;
497*09537850SAkhilesh Sanikop
498*09537850SAkhilesh Sanikop    int y = height;
499*09537850SAkhilesh Sanikop    do {
500*09537850SAkhilesh Sanikop      // 10 11 12 13
501*09537850SAkhilesh Sanikop      const __m128i a = Load4(src);
502*09537850SAkhilesh Sanikop      // 00 01 02 03 10 11 12 13
503*09537850SAkhilesh Sanikop      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
504*09537850SAkhilesh Sanikop      src += src_stride;
505*09537850SAkhilesh Sanikop      // 20 21 22 23
506*09537850SAkhilesh Sanikop      srcs[2] = Load4(src);
507*09537850SAkhilesh Sanikop      src += src_stride;
508*09537850SAkhilesh Sanikop      // 10 11 12 13 20 21 22 23
509*09537850SAkhilesh Sanikop      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
510*09537850SAkhilesh Sanikop
511*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
512*09537850SAkhilesh Sanikop      if (is_compound) {
513*09537850SAkhilesh Sanikop        const __m128i results = Compound1DShift(sums);
514*09537850SAkhilesh Sanikop        StoreUnaligned16(dst16, results);
515*09537850SAkhilesh Sanikop        dst16 += 4 << 1;
516*09537850SAkhilesh Sanikop      } else {
517*09537850SAkhilesh Sanikop        const __m128i results_16 =
518*09537850SAkhilesh Sanikop            RightShiftWithRounding_S16(sums, kFilterBits - 1);
519*09537850SAkhilesh Sanikop        const __m128i results = _mm_packus_epi16(results_16, results_16);
520*09537850SAkhilesh Sanikop        Store4(dst8, results);
521*09537850SAkhilesh Sanikop        dst8 += dst_stride;
522*09537850SAkhilesh Sanikop        Store4(dst8, _mm_srli_si128(results, 4));
523*09537850SAkhilesh Sanikop        dst8 += dst_stride;
524*09537850SAkhilesh Sanikop      }
525*09537850SAkhilesh Sanikop
526*09537850SAkhilesh Sanikop      srcs[0] = srcs[2];
527*09537850SAkhilesh Sanikop      y -= 2;
528*09537850SAkhilesh Sanikop    } while (y != 0);
529*09537850SAkhilesh Sanikop  } else if (num_taps == 4) {
530*09537850SAkhilesh Sanikop    srcs[4] = _mm_setzero_si128();
531*09537850SAkhilesh Sanikop    // 00 01 02 03
532*09537850SAkhilesh Sanikop    srcs[0] = Load4(src);
533*09537850SAkhilesh Sanikop    src += src_stride;
534*09537850SAkhilesh Sanikop    // 10 11 12 13
535*09537850SAkhilesh Sanikop    const __m128i a = Load4(src);
536*09537850SAkhilesh Sanikop    // 00 01 02 03 10 11 12 13
537*09537850SAkhilesh Sanikop    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
538*09537850SAkhilesh Sanikop    src += src_stride;
539*09537850SAkhilesh Sanikop    // 20 21 22 23
540*09537850SAkhilesh Sanikop    srcs[2] = Load4(src);
541*09537850SAkhilesh Sanikop    src += src_stride;
542*09537850SAkhilesh Sanikop    // 10 11 12 13 20 21 22 23
543*09537850SAkhilesh Sanikop    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
544*09537850SAkhilesh Sanikop
545*09537850SAkhilesh Sanikop    int y = height;
546*09537850SAkhilesh Sanikop    do {
547*09537850SAkhilesh Sanikop      // 30 31 32 33
548*09537850SAkhilesh Sanikop      const __m128i b = Load4(src);
549*09537850SAkhilesh Sanikop      // 20 21 22 23 30 31 32 33
550*09537850SAkhilesh Sanikop      srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
551*09537850SAkhilesh Sanikop      src += src_stride;
552*09537850SAkhilesh Sanikop      // 40 41 42 43
553*09537850SAkhilesh Sanikop      srcs[4] = Load4(src);
554*09537850SAkhilesh Sanikop      src += src_stride;
555*09537850SAkhilesh Sanikop      // 30 31 32 33 40 41 42 43
556*09537850SAkhilesh Sanikop      srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
557*09537850SAkhilesh Sanikop
558*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
559*09537850SAkhilesh Sanikop      if (is_compound) {
560*09537850SAkhilesh Sanikop        const __m128i results = Compound1DShift(sums);
561*09537850SAkhilesh Sanikop        StoreUnaligned16(dst16, results);
562*09537850SAkhilesh Sanikop        dst16 += 4 << 1;
563*09537850SAkhilesh Sanikop      } else {
564*09537850SAkhilesh Sanikop        const __m128i results_16 =
565*09537850SAkhilesh Sanikop            RightShiftWithRounding_S16(sums, kFilterBits - 1);
566*09537850SAkhilesh Sanikop        const __m128i results = _mm_packus_epi16(results_16, results_16);
567*09537850SAkhilesh Sanikop        Store4(dst8, results);
568*09537850SAkhilesh Sanikop        dst8 += dst_stride;
569*09537850SAkhilesh Sanikop        Store4(dst8, _mm_srli_si128(results, 4));
570*09537850SAkhilesh Sanikop        dst8 += dst_stride;
571*09537850SAkhilesh Sanikop      }
572*09537850SAkhilesh Sanikop
573*09537850SAkhilesh Sanikop      srcs[0] = srcs[2];
574*09537850SAkhilesh Sanikop      srcs[1] = srcs[3];
575*09537850SAkhilesh Sanikop      srcs[2] = srcs[4];
576*09537850SAkhilesh Sanikop      y -= 2;
577*09537850SAkhilesh Sanikop    } while (y != 0);
578*09537850SAkhilesh Sanikop  } else if (num_taps == 6) {
579*09537850SAkhilesh Sanikop    srcs[6] = _mm_setzero_si128();
580*09537850SAkhilesh Sanikop    // 00 01 02 03
581*09537850SAkhilesh Sanikop    srcs[0] = Load4(src);
582*09537850SAkhilesh Sanikop    src += src_stride;
583*09537850SAkhilesh Sanikop    // 10 11 12 13
584*09537850SAkhilesh Sanikop    const __m128i a = Load4(src);
585*09537850SAkhilesh Sanikop    // 00 01 02 03 10 11 12 13
586*09537850SAkhilesh Sanikop    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
587*09537850SAkhilesh Sanikop    src += src_stride;
588*09537850SAkhilesh Sanikop    // 20 21 22 23
589*09537850SAkhilesh Sanikop    srcs[2] = Load4(src);
590*09537850SAkhilesh Sanikop    src += src_stride;
591*09537850SAkhilesh Sanikop    // 10 11 12 13 20 21 22 23
592*09537850SAkhilesh Sanikop    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
593*09537850SAkhilesh Sanikop    // 30 31 32 33
594*09537850SAkhilesh Sanikop    const __m128i b = Load4(src);
595*09537850SAkhilesh Sanikop    // 20 21 22 23 30 31 32 33
596*09537850SAkhilesh Sanikop    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
597*09537850SAkhilesh Sanikop    src += src_stride;
598*09537850SAkhilesh Sanikop    // 40 41 42 43
599*09537850SAkhilesh Sanikop    srcs[4] = Load4(src);
600*09537850SAkhilesh Sanikop    src += src_stride;
601*09537850SAkhilesh Sanikop    // 30 31 32 33 40 41 42 43
602*09537850SAkhilesh Sanikop    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
603*09537850SAkhilesh Sanikop
604*09537850SAkhilesh Sanikop    int y = height;
605*09537850SAkhilesh Sanikop    do {
606*09537850SAkhilesh Sanikop      // 50 51 52 53
607*09537850SAkhilesh Sanikop      const __m128i c = Load4(src);
608*09537850SAkhilesh Sanikop      // 40 41 42 43 50 51 52 53
609*09537850SAkhilesh Sanikop      srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
610*09537850SAkhilesh Sanikop      src += src_stride;
611*09537850SAkhilesh Sanikop      // 60 61 62 63
612*09537850SAkhilesh Sanikop      srcs[6] = Load4(src);
613*09537850SAkhilesh Sanikop      src += src_stride;
614*09537850SAkhilesh Sanikop      // 50 51 52 53 60 61 62 63
615*09537850SAkhilesh Sanikop      srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
616*09537850SAkhilesh Sanikop
617*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
618*09537850SAkhilesh Sanikop      if (is_compound) {
619*09537850SAkhilesh Sanikop        const __m128i results = Compound1DShift(sums);
620*09537850SAkhilesh Sanikop        StoreUnaligned16(dst16, results);
621*09537850SAkhilesh Sanikop        dst16 += 4 << 1;
622*09537850SAkhilesh Sanikop      } else {
623*09537850SAkhilesh Sanikop        const __m128i results_16 =
624*09537850SAkhilesh Sanikop            RightShiftWithRounding_S16(sums, kFilterBits - 1);
625*09537850SAkhilesh Sanikop        const __m128i results = _mm_packus_epi16(results_16, results_16);
626*09537850SAkhilesh Sanikop        Store4(dst8, results);
627*09537850SAkhilesh Sanikop        dst8 += dst_stride;
628*09537850SAkhilesh Sanikop        Store4(dst8, _mm_srli_si128(results, 4));
629*09537850SAkhilesh Sanikop        dst8 += dst_stride;
630*09537850SAkhilesh Sanikop      }
631*09537850SAkhilesh Sanikop
632*09537850SAkhilesh Sanikop      srcs[0] = srcs[2];
633*09537850SAkhilesh Sanikop      srcs[1] = srcs[3];
634*09537850SAkhilesh Sanikop      srcs[2] = srcs[4];
635*09537850SAkhilesh Sanikop      srcs[3] = srcs[5];
636*09537850SAkhilesh Sanikop      srcs[4] = srcs[6];
637*09537850SAkhilesh Sanikop      y -= 2;
638*09537850SAkhilesh Sanikop    } while (y != 0);
639*09537850SAkhilesh Sanikop  } else if (num_taps == 8) {
640*09537850SAkhilesh Sanikop    srcs[8] = _mm_setzero_si128();
641*09537850SAkhilesh Sanikop    // 00 01 02 03
642*09537850SAkhilesh Sanikop    srcs[0] = Load4(src);
643*09537850SAkhilesh Sanikop    src += src_stride;
644*09537850SAkhilesh Sanikop    // 10 11 12 13
645*09537850SAkhilesh Sanikop    const __m128i a = Load4(src);
646*09537850SAkhilesh Sanikop    // 00 01 02 03 10 11 12 13
647*09537850SAkhilesh Sanikop    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
648*09537850SAkhilesh Sanikop    src += src_stride;
649*09537850SAkhilesh Sanikop    // 20 21 22 23
650*09537850SAkhilesh Sanikop    srcs[2] = Load4(src);
651*09537850SAkhilesh Sanikop    src += src_stride;
652*09537850SAkhilesh Sanikop    // 10 11 12 13 20 21 22 23
653*09537850SAkhilesh Sanikop    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
654*09537850SAkhilesh Sanikop    // 30 31 32 33
655*09537850SAkhilesh Sanikop    const __m128i b = Load4(src);
656*09537850SAkhilesh Sanikop    // 20 21 22 23 30 31 32 33
657*09537850SAkhilesh Sanikop    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
658*09537850SAkhilesh Sanikop    src += src_stride;
659*09537850SAkhilesh Sanikop    // 40 41 42 43
660*09537850SAkhilesh Sanikop    srcs[4] = Load4(src);
661*09537850SAkhilesh Sanikop    src += src_stride;
662*09537850SAkhilesh Sanikop    // 30 31 32 33 40 41 42 43
663*09537850SAkhilesh Sanikop    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
664*09537850SAkhilesh Sanikop    // 50 51 52 53
665*09537850SAkhilesh Sanikop    const __m128i c = Load4(src);
666*09537850SAkhilesh Sanikop    // 40 41 42 43 50 51 52 53
667*09537850SAkhilesh Sanikop    srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
668*09537850SAkhilesh Sanikop    src += src_stride;
669*09537850SAkhilesh Sanikop    // 60 61 62 63
670*09537850SAkhilesh Sanikop    srcs[6] = Load4(src);
671*09537850SAkhilesh Sanikop    src += src_stride;
672*09537850SAkhilesh Sanikop    // 50 51 52 53 60 61 62 63
673*09537850SAkhilesh Sanikop    srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
674*09537850SAkhilesh Sanikop
675*09537850SAkhilesh Sanikop    int y = height;
676*09537850SAkhilesh Sanikop    do {
677*09537850SAkhilesh Sanikop      // 70 71 72 73
678*09537850SAkhilesh Sanikop      const __m128i d = Load4(src);
679*09537850SAkhilesh Sanikop      // 60 61 62 63 70 71 72 73
680*09537850SAkhilesh Sanikop      srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
681*09537850SAkhilesh Sanikop      src += src_stride;
682*09537850SAkhilesh Sanikop      // 80 81 82 83
683*09537850SAkhilesh Sanikop      srcs[8] = Load4(src);
684*09537850SAkhilesh Sanikop      src += src_stride;
685*09537850SAkhilesh Sanikop      // 70 71 72 73 80 81 82 83
686*09537850SAkhilesh Sanikop      srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
687*09537850SAkhilesh Sanikop
688*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
689*09537850SAkhilesh Sanikop      if (is_compound) {
690*09537850SAkhilesh Sanikop        const __m128i results = Compound1DShift(sums);
691*09537850SAkhilesh Sanikop        StoreUnaligned16(dst16, results);
692*09537850SAkhilesh Sanikop        dst16 += 4 << 1;
693*09537850SAkhilesh Sanikop      } else {
694*09537850SAkhilesh Sanikop        const __m128i results_16 =
695*09537850SAkhilesh Sanikop            RightShiftWithRounding_S16(sums, kFilterBits - 1);
696*09537850SAkhilesh Sanikop        const __m128i results = _mm_packus_epi16(results_16, results_16);
697*09537850SAkhilesh Sanikop        Store4(dst8, results);
698*09537850SAkhilesh Sanikop        dst8 += dst_stride;
699*09537850SAkhilesh Sanikop        Store4(dst8, _mm_srli_si128(results, 4));
700*09537850SAkhilesh Sanikop        dst8 += dst_stride;
701*09537850SAkhilesh Sanikop      }
702*09537850SAkhilesh Sanikop
703*09537850SAkhilesh Sanikop      srcs[0] = srcs[2];
704*09537850SAkhilesh Sanikop      srcs[1] = srcs[3];
705*09537850SAkhilesh Sanikop      srcs[2] = srcs[4];
706*09537850SAkhilesh Sanikop      srcs[3] = srcs[5];
707*09537850SAkhilesh Sanikop      srcs[4] = srcs[6];
708*09537850SAkhilesh Sanikop      srcs[5] = srcs[7];
709*09537850SAkhilesh Sanikop      srcs[6] = srcs[8];
710*09537850SAkhilesh Sanikop      y -= 2;
711*09537850SAkhilesh Sanikop    } while (y != 0);
712*09537850SAkhilesh Sanikop  }
713*09537850SAkhilesh Sanikop}
714*09537850SAkhilesh Sanikop
715*09537850SAkhilesh Sanikoptemplate <int num_taps, bool negative_outside_taps = false>
716*09537850SAkhilesh Sanikopvoid FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
717*09537850SAkhilesh Sanikop                       void* const dst, const ptrdiff_t dst_stride,
718*09537850SAkhilesh Sanikop                       const int height, const __m128i* const v_tap) {
719*09537850SAkhilesh Sanikop  auto* dst8 = static_cast<uint8_t*>(dst);
720*09537850SAkhilesh Sanikop
721*09537850SAkhilesh Sanikop  __m128i srcs[9];
722*09537850SAkhilesh Sanikop
723*09537850SAkhilesh Sanikop  if (num_taps == 2) {
724*09537850SAkhilesh Sanikop    srcs[2] = _mm_setzero_si128();
725*09537850SAkhilesh Sanikop    // 00 01
726*09537850SAkhilesh Sanikop    srcs[0] = Load2(src);
727*09537850SAkhilesh Sanikop    src += src_stride;
728*09537850SAkhilesh Sanikop
729*09537850SAkhilesh Sanikop    int y = height;
730*09537850SAkhilesh Sanikop    do {
731*09537850SAkhilesh Sanikop      // 00 01 10 11
732*09537850SAkhilesh Sanikop      srcs[0] = Load2<1>(src, srcs[0]);
733*09537850SAkhilesh Sanikop      src += src_stride;
734*09537850SAkhilesh Sanikop      // 00 01 10 11 20 21
735*09537850SAkhilesh Sanikop      srcs[0] = Load2<2>(src, srcs[0]);
736*09537850SAkhilesh Sanikop      src += src_stride;
737*09537850SAkhilesh Sanikop      // 00 01 10 11 20 21 30 31
738*09537850SAkhilesh Sanikop      srcs[0] = Load2<3>(src, srcs[0]);
739*09537850SAkhilesh Sanikop      src += src_stride;
740*09537850SAkhilesh Sanikop      // 40 41
741*09537850SAkhilesh Sanikop      srcs[2] = Load2<0>(src, srcs[2]);
742*09537850SAkhilesh Sanikop      src += src_stride;
743*09537850SAkhilesh Sanikop      // 00 01 10 11 20 21 30 31 40 41
744*09537850SAkhilesh Sanikop      const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
745*09537850SAkhilesh Sanikop      // 10 11 20 21 30 31 40 41
746*09537850SAkhilesh Sanikop      srcs[1] = _mm_srli_si128(srcs_0_2, 2);
747*09537850SAkhilesh Sanikop      // This uses srcs[0]..srcs[1].
748*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
749*09537850SAkhilesh Sanikop      const __m128i results_16 =
750*09537850SAkhilesh Sanikop          RightShiftWithRounding_S16(sums, kFilterBits - 1);
751*09537850SAkhilesh Sanikop      const __m128i results = _mm_packus_epi16(results_16, results_16);
752*09537850SAkhilesh Sanikop
753*09537850SAkhilesh Sanikop      Store2(dst8, results);
754*09537850SAkhilesh Sanikop      dst8 += dst_stride;
755*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 2));
756*09537850SAkhilesh Sanikop      if (height == 2) return;
757*09537850SAkhilesh Sanikop      dst8 += dst_stride;
758*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 4));
759*09537850SAkhilesh Sanikop      dst8 += dst_stride;
760*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 6));
761*09537850SAkhilesh Sanikop      dst8 += dst_stride;
762*09537850SAkhilesh Sanikop
763*09537850SAkhilesh Sanikop      srcs[0] = srcs[2];
764*09537850SAkhilesh Sanikop      y -= 4;
765*09537850SAkhilesh Sanikop    } while (y != 0);
766*09537850SAkhilesh Sanikop  } else if (num_taps == 4) {
767*09537850SAkhilesh Sanikop    srcs[4] = _mm_setzero_si128();
768*09537850SAkhilesh Sanikop
769*09537850SAkhilesh Sanikop    // 00 01
770*09537850SAkhilesh Sanikop    srcs[0] = Load2(src);
771*09537850SAkhilesh Sanikop    src += src_stride;
772*09537850SAkhilesh Sanikop    // 00 01 10 11
773*09537850SAkhilesh Sanikop    srcs[0] = Load2<1>(src, srcs[0]);
774*09537850SAkhilesh Sanikop    src += src_stride;
775*09537850SAkhilesh Sanikop    // 00 01 10 11 20 21
776*09537850SAkhilesh Sanikop    srcs[0] = Load2<2>(src, srcs[0]);
777*09537850SAkhilesh Sanikop    src += src_stride;
778*09537850SAkhilesh Sanikop
779*09537850SAkhilesh Sanikop    int y = height;
780*09537850SAkhilesh Sanikop    do {
781*09537850SAkhilesh Sanikop      // 00 01 10 11 20 21 30 31
782*09537850SAkhilesh Sanikop      srcs[0] = Load2<3>(src, srcs[0]);
783*09537850SAkhilesh Sanikop      src += src_stride;
784*09537850SAkhilesh Sanikop      // 40 41
785*09537850SAkhilesh Sanikop      srcs[4] = Load2<0>(src, srcs[4]);
786*09537850SAkhilesh Sanikop      src += src_stride;
787*09537850SAkhilesh Sanikop      // 40 41 50 51
788*09537850SAkhilesh Sanikop      srcs[4] = Load2<1>(src, srcs[4]);
789*09537850SAkhilesh Sanikop      src += src_stride;
790*09537850SAkhilesh Sanikop      // 40 41 50 51 60 61
791*09537850SAkhilesh Sanikop      srcs[4] = Load2<2>(src, srcs[4]);
792*09537850SAkhilesh Sanikop      src += src_stride;
793*09537850SAkhilesh Sanikop      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
794*09537850SAkhilesh Sanikop      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
795*09537850SAkhilesh Sanikop      // 10 11 20 21 30 31 40 41
796*09537850SAkhilesh Sanikop      srcs[1] = _mm_srli_si128(srcs_0_4, 2);
797*09537850SAkhilesh Sanikop      // 20 21 30 31 40 41 50 51
798*09537850SAkhilesh Sanikop      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
799*09537850SAkhilesh Sanikop      // 30 31 40 41 50 51 60 61
800*09537850SAkhilesh Sanikop      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
801*09537850SAkhilesh Sanikop
802*09537850SAkhilesh Sanikop      // This uses srcs[0]..srcs[3].
803*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
804*09537850SAkhilesh Sanikop      const __m128i results_16 =
805*09537850SAkhilesh Sanikop          RightShiftWithRounding_S16(sums, kFilterBits - 1);
806*09537850SAkhilesh Sanikop      const __m128i results = _mm_packus_epi16(results_16, results_16);
807*09537850SAkhilesh Sanikop
808*09537850SAkhilesh Sanikop      Store2(dst8, results);
809*09537850SAkhilesh Sanikop      dst8 += dst_stride;
810*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 2));
811*09537850SAkhilesh Sanikop      if (height == 2) return;
812*09537850SAkhilesh Sanikop      dst8 += dst_stride;
813*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 4));
814*09537850SAkhilesh Sanikop      dst8 += dst_stride;
815*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 6));
816*09537850SAkhilesh Sanikop      dst8 += dst_stride;
817*09537850SAkhilesh Sanikop
818*09537850SAkhilesh Sanikop      srcs[0] = srcs[4];
819*09537850SAkhilesh Sanikop      y -= 4;
820*09537850SAkhilesh Sanikop    } while (y != 0);
821*09537850SAkhilesh Sanikop  } else if (num_taps == 6) {
822*09537850SAkhilesh Sanikop    // During the vertical pass the number of taps is restricted when
823*09537850SAkhilesh Sanikop    // |height| <= 4.
824*09537850SAkhilesh Sanikop    assert(height > 4);
825*09537850SAkhilesh Sanikop    srcs[8] = _mm_setzero_si128();
826*09537850SAkhilesh Sanikop
827*09537850SAkhilesh Sanikop    // 00 01
828*09537850SAkhilesh Sanikop    srcs[0] = Load2(src);
829*09537850SAkhilesh Sanikop    src += src_stride;
830*09537850SAkhilesh Sanikop    // 00 01 10 11
831*09537850SAkhilesh Sanikop    srcs[0] = Load2<1>(src, srcs[0]);
832*09537850SAkhilesh Sanikop    src += src_stride;
833*09537850SAkhilesh Sanikop    // 00 01 10 11 20 21
834*09537850SAkhilesh Sanikop    srcs[0] = Load2<2>(src, srcs[0]);
835*09537850SAkhilesh Sanikop    src += src_stride;
836*09537850SAkhilesh Sanikop    // 00 01 10 11 20 21 30 31
837*09537850SAkhilesh Sanikop    srcs[0] = Load2<3>(src, srcs[0]);
838*09537850SAkhilesh Sanikop    src += src_stride;
839*09537850SAkhilesh Sanikop    // 40 41
840*09537850SAkhilesh Sanikop    srcs[4] = Load2(src);
841*09537850SAkhilesh Sanikop    src += src_stride;
842*09537850SAkhilesh Sanikop    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
843*09537850SAkhilesh Sanikop    const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
844*09537850SAkhilesh Sanikop    // 10 11 20 21 30 31 40 41
845*09537850SAkhilesh Sanikop    srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
846*09537850SAkhilesh Sanikop
847*09537850SAkhilesh Sanikop    int y = height;
848*09537850SAkhilesh Sanikop    do {
849*09537850SAkhilesh Sanikop      // 40 41 50 51
850*09537850SAkhilesh Sanikop      srcs[4] = Load2<1>(src, srcs[4]);
851*09537850SAkhilesh Sanikop      src += src_stride;
852*09537850SAkhilesh Sanikop      // 40 41 50 51 60 61
853*09537850SAkhilesh Sanikop      srcs[4] = Load2<2>(src, srcs[4]);
854*09537850SAkhilesh Sanikop      src += src_stride;
855*09537850SAkhilesh Sanikop      // 40 41 50 51 60 61 70 71
856*09537850SAkhilesh Sanikop      srcs[4] = Load2<3>(src, srcs[4]);
857*09537850SAkhilesh Sanikop      src += src_stride;
858*09537850SAkhilesh Sanikop      // 80 81
859*09537850SAkhilesh Sanikop      srcs[8] = Load2<0>(src, srcs[8]);
860*09537850SAkhilesh Sanikop      src += src_stride;
861*09537850SAkhilesh Sanikop      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
862*09537850SAkhilesh Sanikop      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
863*09537850SAkhilesh Sanikop      // 20 21 30 31 40 41 50 51
864*09537850SAkhilesh Sanikop      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
865*09537850SAkhilesh Sanikop      // 30 31 40 41 50 51 60 61
866*09537850SAkhilesh Sanikop      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
867*09537850SAkhilesh Sanikop      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
868*09537850SAkhilesh Sanikop      // 50 51 60 61 70 71 80 81
869*09537850SAkhilesh Sanikop      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
870*09537850SAkhilesh Sanikop
871*09537850SAkhilesh Sanikop      // This uses srcs[0]..srcs[5].
872*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
873*09537850SAkhilesh Sanikop      const __m128i results_16 =
874*09537850SAkhilesh Sanikop          RightShiftWithRounding_S16(sums, kFilterBits - 1);
875*09537850SAkhilesh Sanikop      const __m128i results = _mm_packus_epi16(results_16, results_16);
876*09537850SAkhilesh Sanikop
877*09537850SAkhilesh Sanikop      Store2(dst8, results);
878*09537850SAkhilesh Sanikop      dst8 += dst_stride;
879*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 2));
880*09537850SAkhilesh Sanikop      dst8 += dst_stride;
881*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 4));
882*09537850SAkhilesh Sanikop      dst8 += dst_stride;
883*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 6));
884*09537850SAkhilesh Sanikop      dst8 += dst_stride;
885*09537850SAkhilesh Sanikop
886*09537850SAkhilesh Sanikop      srcs[0] = srcs[4];
887*09537850SAkhilesh Sanikop      srcs[1] = srcs[5];
888*09537850SAkhilesh Sanikop      srcs[4] = srcs[8];
889*09537850SAkhilesh Sanikop      y -= 4;
890*09537850SAkhilesh Sanikop    } while (y != 0);
891*09537850SAkhilesh Sanikop  } else if (num_taps == 8) {
892*09537850SAkhilesh Sanikop    // During the vertical pass the number of taps is restricted when
893*09537850SAkhilesh Sanikop    // |height| <= 4.
894*09537850SAkhilesh Sanikop    assert(height > 4);
895*09537850SAkhilesh Sanikop    srcs[8] = _mm_setzero_si128();
896*09537850SAkhilesh Sanikop    // 00 01
897*09537850SAkhilesh Sanikop    srcs[0] = Load2(src);
898*09537850SAkhilesh Sanikop    src += src_stride;
899*09537850SAkhilesh Sanikop    // 00 01 10 11
900*09537850SAkhilesh Sanikop    srcs[0] = Load2<1>(src, srcs[0]);
901*09537850SAkhilesh Sanikop    src += src_stride;
902*09537850SAkhilesh Sanikop    // 00 01 10 11 20 21
903*09537850SAkhilesh Sanikop    srcs[0] = Load2<2>(src, srcs[0]);
904*09537850SAkhilesh Sanikop    src += src_stride;
905*09537850SAkhilesh Sanikop    // 00 01 10 11 20 21 30 31
906*09537850SAkhilesh Sanikop    srcs[0] = Load2<3>(src, srcs[0]);
907*09537850SAkhilesh Sanikop    src += src_stride;
908*09537850SAkhilesh Sanikop    // 40 41
909*09537850SAkhilesh Sanikop    srcs[4] = Load2(src);
910*09537850SAkhilesh Sanikop    src += src_stride;
911*09537850SAkhilesh Sanikop    // 40 41 50 51
912*09537850SAkhilesh Sanikop    srcs[4] = Load2<1>(src, srcs[4]);
913*09537850SAkhilesh Sanikop    src += src_stride;
914*09537850SAkhilesh Sanikop    // 40 41 50 51 60 61
915*09537850SAkhilesh Sanikop    srcs[4] = Load2<2>(src, srcs[4]);
916*09537850SAkhilesh Sanikop    src += src_stride;
917*09537850SAkhilesh Sanikop
918*09537850SAkhilesh Sanikop    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
919*09537850SAkhilesh Sanikop    const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
920*09537850SAkhilesh Sanikop    // 10 11 20 21 30 31 40 41
921*09537850SAkhilesh Sanikop    srcs[1] = _mm_srli_si128(srcs_0_4, 2);
922*09537850SAkhilesh Sanikop    // 20 21 30 31 40 41 50 51
923*09537850SAkhilesh Sanikop    srcs[2] = _mm_srli_si128(srcs_0_4, 4);
924*09537850SAkhilesh Sanikop    // 30 31 40 41 50 51 60 61
925*09537850SAkhilesh Sanikop    srcs[3] = _mm_srli_si128(srcs_0_4, 6);
926*09537850SAkhilesh Sanikop
927*09537850SAkhilesh Sanikop    int y = height;
928*09537850SAkhilesh Sanikop    do {
929*09537850SAkhilesh Sanikop      // 40 41 50 51 60 61 70 71
930*09537850SAkhilesh Sanikop      srcs[4] = Load2<3>(src, srcs[4]);
931*09537850SAkhilesh Sanikop      src += src_stride;
932*09537850SAkhilesh Sanikop      // 80 81
933*09537850SAkhilesh Sanikop      srcs[8] = Load2<0>(src, srcs[8]);
934*09537850SAkhilesh Sanikop      src += src_stride;
935*09537850SAkhilesh Sanikop      // 80 81 90 91
936*09537850SAkhilesh Sanikop      srcs[8] = Load2<1>(src, srcs[8]);
937*09537850SAkhilesh Sanikop      src += src_stride;
938*09537850SAkhilesh Sanikop      // 80 81 90 91 a0 a1
939*09537850SAkhilesh Sanikop      srcs[8] = Load2<2>(src, srcs[8]);
940*09537850SAkhilesh Sanikop      src += src_stride;
941*09537850SAkhilesh Sanikop
942*09537850SAkhilesh Sanikop      // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
943*09537850SAkhilesh Sanikop      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
944*09537850SAkhilesh Sanikop      // 50 51 60 61 70 71 80 81
945*09537850SAkhilesh Sanikop      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
946*09537850SAkhilesh Sanikop      // 60 61 70 71 80 81 90 91
947*09537850SAkhilesh Sanikop      srcs[6] = _mm_srli_si128(srcs_4_8, 4);
948*09537850SAkhilesh Sanikop      // 70 71 80 81 90 91 a0 a1
949*09537850SAkhilesh Sanikop      srcs[7] = _mm_srli_si128(srcs_4_8, 6);
950*09537850SAkhilesh Sanikop
951*09537850SAkhilesh Sanikop      // This uses srcs[0]..srcs[7].
952*09537850SAkhilesh Sanikop      const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
953*09537850SAkhilesh Sanikop      const __m128i results_16 =
954*09537850SAkhilesh Sanikop          RightShiftWithRounding_S16(sums, kFilterBits - 1);
955*09537850SAkhilesh Sanikop      const __m128i results = _mm_packus_epi16(results_16, results_16);
956*09537850SAkhilesh Sanikop
957*09537850SAkhilesh Sanikop      Store2(dst8, results);
958*09537850SAkhilesh Sanikop      dst8 += dst_stride;
959*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 2));
960*09537850SAkhilesh Sanikop      dst8 += dst_stride;
961*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 4));
962*09537850SAkhilesh Sanikop      dst8 += dst_stride;
963*09537850SAkhilesh Sanikop      Store2(dst8, _mm_srli_si128(results, 6));
964*09537850SAkhilesh Sanikop      dst8 += dst_stride;
965*09537850SAkhilesh Sanikop
966*09537850SAkhilesh Sanikop      srcs[0] = srcs[4];
967*09537850SAkhilesh Sanikop      srcs[1] = srcs[5];
968*09537850SAkhilesh Sanikop      srcs[2] = srcs[6];
969*09537850SAkhilesh Sanikop      srcs[3] = srcs[7];
970*09537850SAkhilesh Sanikop      srcs[4] = srcs[8];
971*09537850SAkhilesh Sanikop      y -= 4;
972*09537850SAkhilesh Sanikop    } while (y != 0);
973*09537850SAkhilesh Sanikop  }
974*09537850SAkhilesh Sanikop}
975