1*09537850SAkhilesh Sanikop// Copyright 2020 The libgav1 Authors 2*09537850SAkhilesh Sanikop// 3*09537850SAkhilesh Sanikop// Licensed under the Apache License, Version 2.0 (the "License"); 4*09537850SAkhilesh Sanikop// you may not use this file except in compliance with the License. 5*09537850SAkhilesh Sanikop// You may obtain a copy of the License at 6*09537850SAkhilesh Sanikop// 7*09537850SAkhilesh Sanikop// http://www.apache.org/licenses/LICENSE-2.0 8*09537850SAkhilesh Sanikop// 9*09537850SAkhilesh Sanikop// Unless required by applicable law or agreed to in writing, software 10*09537850SAkhilesh Sanikop// distributed under the License is distributed on an "AS IS" BASIS, 11*09537850SAkhilesh Sanikop// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12*09537850SAkhilesh Sanikop// See the License for the specific language governing permissions and 13*09537850SAkhilesh Sanikop// limitations under the License. 14*09537850SAkhilesh Sanikop 15*09537850SAkhilesh Sanikop// Common 128 bit functions used for sse4/avx2 convolve implementations. 16*09537850SAkhilesh Sanikop// This will be included inside an anonymous namespace on files where these are 17*09537850SAkhilesh Sanikop// necessary. 18*09537850SAkhilesh Sanikop 19*09537850SAkhilesh Sanikop#include "src/dsp/convolve.inc" 20*09537850SAkhilesh Sanikop 21*09537850SAkhilesh Sanikop// This version checks for the special cases when filter_index == 1. 22*09537850SAkhilesh Sanikopint GetNumTapsInFilter(const int filter_index, const int filter_id) { 23*09537850SAkhilesh Sanikop if (filter_index == 0) { 24*09537850SAkhilesh Sanikop // Despite the names these only use 6 taps. 25*09537850SAkhilesh Sanikop // kInterpolationFilterEightTap 26*09537850SAkhilesh Sanikop // kInterpolationFilterEightTapSmooth 27*09537850SAkhilesh Sanikop return 6; 28*09537850SAkhilesh Sanikop } 29*09537850SAkhilesh Sanikop 30*09537850SAkhilesh Sanikop if (filter_index == 1) { 31*09537850SAkhilesh Sanikop // Despite the names these only use 6 taps. 32*09537850SAkhilesh Sanikop // kInterpolationFilterEightTap 33*09537850SAkhilesh Sanikop // kInterpolationFilterEightTapSmooth 34*09537850SAkhilesh Sanikop if (((filter_id == 1) | (filter_id == 15) | (filter_id == 7) | 35*09537850SAkhilesh Sanikop (filter_id == 8) | (filter_id == 9)) != 0) { 36*09537850SAkhilesh Sanikop return 6; 37*09537850SAkhilesh Sanikop } 38*09537850SAkhilesh Sanikop // When |filter_index| == 1, the |filter_id| values not listed above map to 39*09537850SAkhilesh Sanikop // 4 tap filters. 40*09537850SAkhilesh Sanikop return 4; 41*09537850SAkhilesh Sanikop } 42*09537850SAkhilesh Sanikop 43*09537850SAkhilesh Sanikop if (filter_index == 2) { 44*09537850SAkhilesh Sanikop // kInterpolationFilterEightTapSharp 45*09537850SAkhilesh Sanikop return 8; 46*09537850SAkhilesh Sanikop } 47*09537850SAkhilesh Sanikop 48*09537850SAkhilesh Sanikop if (filter_index == 3) { 49*09537850SAkhilesh Sanikop // kInterpolationFilterBilinear 50*09537850SAkhilesh Sanikop return 2; 51*09537850SAkhilesh Sanikop } 52*09537850SAkhilesh Sanikop 53*09537850SAkhilesh Sanikop assert(filter_index > 3); 54*09537850SAkhilesh Sanikop // For small sizes (width/height <= 4) the large filters are replaced with 4 55*09537850SAkhilesh Sanikop // tap options. 56*09537850SAkhilesh Sanikop // If the original filters were |kInterpolationFilterEightTap| or 57*09537850SAkhilesh Sanikop // |kInterpolationFilterEightTapSharp| then it becomes 58*09537850SAkhilesh Sanikop // |kInterpolationFilterSwitchable|. 59*09537850SAkhilesh Sanikop // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4 60*09537850SAkhilesh Sanikop // tap filter. 61*09537850SAkhilesh Sanikop return 4; 62*09537850SAkhilesh Sanikop} 63*09537850SAkhilesh Sanikop 64*09537850SAkhilesh Sanikop// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and 65*09537850SAkhilesh Sanikop// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final 66*09537850SAkhilesh Sanikop// sum from outranging int16_t. 67*09537850SAkhilesh Sanikoptemplate <int num_taps> 68*09537850SAkhilesh Sanikop__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) { 69*09537850SAkhilesh Sanikop __m128i sum; 70*09537850SAkhilesh Sanikop if (num_taps == 6) { 71*09537850SAkhilesh Sanikop // 6 taps. 72*09537850SAkhilesh Sanikop const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1 73*09537850SAkhilesh Sanikop const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3 74*09537850SAkhilesh Sanikop const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5 75*09537850SAkhilesh Sanikop sum = _mm_add_epi16(v_madd_21, v_madd_43); 76*09537850SAkhilesh Sanikop sum = _mm_add_epi16(sum, v_madd_65); 77*09537850SAkhilesh Sanikop } else if (num_taps == 8) { 78*09537850SAkhilesh Sanikop // 8 taps. 79*09537850SAkhilesh Sanikop const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0 80*09537850SAkhilesh Sanikop const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2 81*09537850SAkhilesh Sanikop const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4 82*09537850SAkhilesh Sanikop const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6 83*09537850SAkhilesh Sanikop const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32); 84*09537850SAkhilesh Sanikop const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76); 85*09537850SAkhilesh Sanikop sum = _mm_add_epi16(v_sum_7654, v_sum_3210); 86*09537850SAkhilesh Sanikop } else if (num_taps == 2) { 87*09537850SAkhilesh Sanikop // 2 taps. 88*09537850SAkhilesh Sanikop sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3 89*09537850SAkhilesh Sanikop } else { 90*09537850SAkhilesh Sanikop // 4 taps. 91*09537850SAkhilesh Sanikop const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2 92*09537850SAkhilesh Sanikop const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4 93*09537850SAkhilesh Sanikop sum = _mm_add_epi16(v_madd_32, v_madd_54); 94*09537850SAkhilesh Sanikop } 95*09537850SAkhilesh Sanikop return sum; 96*09537850SAkhilesh Sanikop} 97*09537850SAkhilesh Sanikop 98*09537850SAkhilesh Sanikoptemplate <int num_taps> 99*09537850SAkhilesh Sanikop__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, 100*09537850SAkhilesh Sanikop const __m128i* const v_tap) { 101*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 102*09537850SAkhilesh Sanikop const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]); 103*09537850SAkhilesh Sanikop 104*09537850SAkhilesh Sanikop if (num_taps == 2) { 105*09537850SAkhilesh Sanikop // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17 106*09537850SAkhilesh Sanikop const __m128i v_src_43 = _mm_shuffle_epi8( 107*09537850SAkhilesh Sanikop v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403)); 108*09537850SAkhilesh Sanikop const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3 109*09537850SAkhilesh Sanikop return v_sum_43; 110*09537850SAkhilesh Sanikop } 111*09537850SAkhilesh Sanikop 112*09537850SAkhilesh Sanikop // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 113*09537850SAkhilesh Sanikop const __m128i v_src_32 = _mm_shuffle_epi8( 114*09537850SAkhilesh Sanikop v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302)); 115*09537850SAkhilesh Sanikop // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx 116*09537850SAkhilesh Sanikop const __m128i v_src_54 = _mm_shuffle_epi8( 117*09537850SAkhilesh Sanikop v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c, 118*09537850SAkhilesh Sanikop static_cast<int>(0x80070706), 0x06050504)); 119*09537850SAkhilesh Sanikop const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2 120*09537850SAkhilesh Sanikop const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4 121*09537850SAkhilesh Sanikop const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32); 122*09537850SAkhilesh Sanikop return v_sum_5432; 123*09537850SAkhilesh Sanikop} 124*09537850SAkhilesh Sanikop 125*09537850SAkhilesh Sanikoptemplate <int num_taps> 126*09537850SAkhilesh Sanikop__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride, 127*09537850SAkhilesh Sanikop const __m128i* const v_tap) { 128*09537850SAkhilesh Sanikop __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); 129*09537850SAkhilesh Sanikop 130*09537850SAkhilesh Sanikop // Normally the Horizontal pass does the downshift in two passes: 131*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal - 1 and then (kFilterBits - 132*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them 133*09537850SAkhilesh Sanikop // requires adding the rounding offset from the skipped shift. 134*09537850SAkhilesh Sanikop constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2); 135*09537850SAkhilesh Sanikop 136*09537850SAkhilesh Sanikop sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit)); 137*09537850SAkhilesh Sanikop sum = RightShiftWithRounding_S16(sum, kFilterBits - 1); 138*09537850SAkhilesh Sanikop return _mm_packus_epi16(sum, sum); 139*09537850SAkhilesh Sanikop} 140*09537850SAkhilesh Sanikop 141*09537850SAkhilesh Sanikoptemplate <int num_taps> 142*09537850SAkhilesh Sanikop__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride, 143*09537850SAkhilesh Sanikop const __m128i* const v_tap) { 144*09537850SAkhilesh Sanikop const __m128i sum = SumHorizontalTaps2x2<num_taps>(src, src_stride, v_tap); 145*09537850SAkhilesh Sanikop 146*09537850SAkhilesh Sanikop return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); 147*09537850SAkhilesh Sanikop} 148*09537850SAkhilesh Sanikop 149*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_2d_vertical = false> 150*09537850SAkhilesh SanikopLIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter, 151*09537850SAkhilesh Sanikop __m128i* v_tap) { 152*09537850SAkhilesh Sanikop if (num_taps == 8) { 153*09537850SAkhilesh Sanikop v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0 154*09537850SAkhilesh Sanikop v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 155*09537850SAkhilesh Sanikop v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 156*09537850SAkhilesh Sanikop v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6 157*09537850SAkhilesh Sanikop if (is_2d_vertical) { 158*09537850SAkhilesh Sanikop v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); 159*09537850SAkhilesh Sanikop v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); 160*09537850SAkhilesh Sanikop v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); 161*09537850SAkhilesh Sanikop v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]); 162*09537850SAkhilesh Sanikop } else { 163*09537850SAkhilesh Sanikop v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); 164*09537850SAkhilesh Sanikop v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); 165*09537850SAkhilesh Sanikop v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); 166*09537850SAkhilesh Sanikop v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]); 167*09537850SAkhilesh Sanikop } 168*09537850SAkhilesh Sanikop } else if (num_taps == 6) { 169*09537850SAkhilesh Sanikop const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); 170*09537850SAkhilesh Sanikop v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1 171*09537850SAkhilesh Sanikop v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 172*09537850SAkhilesh Sanikop v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5 173*09537850SAkhilesh Sanikop if (is_2d_vertical) { 174*09537850SAkhilesh Sanikop v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); 175*09537850SAkhilesh Sanikop v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); 176*09537850SAkhilesh Sanikop v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]); 177*09537850SAkhilesh Sanikop } else { 178*09537850SAkhilesh Sanikop v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); 179*09537850SAkhilesh Sanikop v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); 180*09537850SAkhilesh Sanikop v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]); 181*09537850SAkhilesh Sanikop } 182*09537850SAkhilesh Sanikop } else if (num_taps == 4) { 183*09537850SAkhilesh Sanikop v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2 184*09537850SAkhilesh Sanikop v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4 185*09537850SAkhilesh Sanikop if (is_2d_vertical) { 186*09537850SAkhilesh Sanikop v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); 187*09537850SAkhilesh Sanikop v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]); 188*09537850SAkhilesh Sanikop } else { 189*09537850SAkhilesh Sanikop v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); 190*09537850SAkhilesh Sanikop v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]); 191*09537850SAkhilesh Sanikop } 192*09537850SAkhilesh Sanikop } else { // num_taps == 2 193*09537850SAkhilesh Sanikop const __m128i adjusted_filter = _mm_srli_si128(*filter, 1); 194*09537850SAkhilesh Sanikop v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3 195*09537850SAkhilesh Sanikop if (is_2d_vertical) { 196*09537850SAkhilesh Sanikop v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]); 197*09537850SAkhilesh Sanikop } else { 198*09537850SAkhilesh Sanikop v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]); 199*09537850SAkhilesh Sanikop } 200*09537850SAkhilesh Sanikop } 201*09537850SAkhilesh Sanikop} 202*09537850SAkhilesh Sanikop 203*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound> 204*09537850SAkhilesh Sanikop__m128i SimpleSum2DVerticalTaps(const __m128i* const src, 205*09537850SAkhilesh Sanikop const __m128i* const taps) { 206*09537850SAkhilesh Sanikop __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]); 207*09537850SAkhilesh Sanikop __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]); 208*09537850SAkhilesh Sanikop if (num_taps >= 4) { 209*09537850SAkhilesh Sanikop __m128i madd_lo = 210*09537850SAkhilesh Sanikop _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]); 211*09537850SAkhilesh Sanikop __m128i madd_hi = 212*09537850SAkhilesh Sanikop _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]); 213*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, madd_lo); 214*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, madd_hi); 215*09537850SAkhilesh Sanikop if (num_taps >= 6) { 216*09537850SAkhilesh Sanikop madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]); 217*09537850SAkhilesh Sanikop madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]); 218*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, madd_lo); 219*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, madd_hi); 220*09537850SAkhilesh Sanikop if (num_taps == 8) { 221*09537850SAkhilesh Sanikop madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]); 222*09537850SAkhilesh Sanikop madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]); 223*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, madd_lo); 224*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, madd_hi); 225*09537850SAkhilesh Sanikop } 226*09537850SAkhilesh Sanikop } 227*09537850SAkhilesh Sanikop } 228*09537850SAkhilesh Sanikop 229*09537850SAkhilesh Sanikop if (is_compound) { 230*09537850SAkhilesh Sanikop return _mm_packs_epi32( 231*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1), 232*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi, 233*09537850SAkhilesh Sanikop kInterRoundBitsCompoundVertical - 1)); 234*09537850SAkhilesh Sanikop } 235*09537850SAkhilesh Sanikop 236*09537850SAkhilesh Sanikop return _mm_packs_epi32( 237*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1), 238*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1)); 239*09537850SAkhilesh Sanikop} 240*09537850SAkhilesh Sanikop 241*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound = false> 242*09537850SAkhilesh Sanikopvoid Filter2DVertical(const uint16_t* src, void* const dst, 243*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int width, 244*09537850SAkhilesh Sanikop const int height, const __m128i* const taps) { 245*09537850SAkhilesh Sanikop assert(width >= 8); 246*09537850SAkhilesh Sanikop constexpr int next_row = num_taps - 1; 247*09537850SAkhilesh Sanikop // The Horizontal pass uses |width| as |stride| for the intermediate buffer. 248*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = width; 249*09537850SAkhilesh Sanikop 250*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst); 251*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst); 252*09537850SAkhilesh Sanikop 253*09537850SAkhilesh Sanikop int x = 0; 254*09537850SAkhilesh Sanikop do { 255*09537850SAkhilesh Sanikop __m128i srcs[8]; 256*09537850SAkhilesh Sanikop const uint16_t* src_x = src + x; 257*09537850SAkhilesh Sanikop srcs[0] = LoadAligned16(src_x); 258*09537850SAkhilesh Sanikop src_x += src_stride; 259*09537850SAkhilesh Sanikop if (num_taps >= 4) { 260*09537850SAkhilesh Sanikop srcs[1] = LoadAligned16(src_x); 261*09537850SAkhilesh Sanikop src_x += src_stride; 262*09537850SAkhilesh Sanikop srcs[2] = LoadAligned16(src_x); 263*09537850SAkhilesh Sanikop src_x += src_stride; 264*09537850SAkhilesh Sanikop if (num_taps >= 6) { 265*09537850SAkhilesh Sanikop srcs[3] = LoadAligned16(src_x); 266*09537850SAkhilesh Sanikop src_x += src_stride; 267*09537850SAkhilesh Sanikop srcs[4] = LoadAligned16(src_x); 268*09537850SAkhilesh Sanikop src_x += src_stride; 269*09537850SAkhilesh Sanikop if (num_taps == 8) { 270*09537850SAkhilesh Sanikop srcs[5] = LoadAligned16(src_x); 271*09537850SAkhilesh Sanikop src_x += src_stride; 272*09537850SAkhilesh Sanikop srcs[6] = LoadAligned16(src_x); 273*09537850SAkhilesh Sanikop src_x += src_stride; 274*09537850SAkhilesh Sanikop } 275*09537850SAkhilesh Sanikop } 276*09537850SAkhilesh Sanikop } 277*09537850SAkhilesh Sanikop 278*09537850SAkhilesh Sanikop auto* dst8_x = dst8 + x; 279*09537850SAkhilesh Sanikop auto* dst16_x = dst16 + x; 280*09537850SAkhilesh Sanikop int y = height; 281*09537850SAkhilesh Sanikop do { 282*09537850SAkhilesh Sanikop srcs[next_row] = LoadAligned16(src_x); 283*09537850SAkhilesh Sanikop src_x += src_stride; 284*09537850SAkhilesh Sanikop 285*09537850SAkhilesh Sanikop const __m128i sum = 286*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); 287*09537850SAkhilesh Sanikop if (is_compound) { 288*09537850SAkhilesh Sanikop StoreUnaligned16(dst16_x, sum); 289*09537850SAkhilesh Sanikop dst16_x += dst_stride; 290*09537850SAkhilesh Sanikop } else { 291*09537850SAkhilesh Sanikop StoreLo8(dst8_x, _mm_packus_epi16(sum, sum)); 292*09537850SAkhilesh Sanikop dst8_x += dst_stride; 293*09537850SAkhilesh Sanikop } 294*09537850SAkhilesh Sanikop 295*09537850SAkhilesh Sanikop srcs[0] = srcs[1]; 296*09537850SAkhilesh Sanikop if (num_taps >= 4) { 297*09537850SAkhilesh Sanikop srcs[1] = srcs[2]; 298*09537850SAkhilesh Sanikop srcs[2] = srcs[3]; 299*09537850SAkhilesh Sanikop if (num_taps >= 6) { 300*09537850SAkhilesh Sanikop srcs[3] = srcs[4]; 301*09537850SAkhilesh Sanikop srcs[4] = srcs[5]; 302*09537850SAkhilesh Sanikop if (num_taps == 8) { 303*09537850SAkhilesh Sanikop srcs[5] = srcs[6]; 304*09537850SAkhilesh Sanikop srcs[6] = srcs[7]; 305*09537850SAkhilesh Sanikop } 306*09537850SAkhilesh Sanikop } 307*09537850SAkhilesh Sanikop } 308*09537850SAkhilesh Sanikop } while (--y != 0); 309*09537850SAkhilesh Sanikop x += 8; 310*09537850SAkhilesh Sanikop } while (x < width); 311*09537850SAkhilesh Sanikop} 312*09537850SAkhilesh Sanikop 313*09537850SAkhilesh Sanikop// Take advantage of |src_stride| == |width| to process two rows at a time. 314*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound = false> 315*09537850SAkhilesh Sanikopvoid Filter2DVertical4xH(const uint16_t* src, void* const dst, 316*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int height, 317*09537850SAkhilesh Sanikop const __m128i* const taps) { 318*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst); 319*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst); 320*09537850SAkhilesh Sanikop 321*09537850SAkhilesh Sanikop __m128i srcs[9]; 322*09537850SAkhilesh Sanikop srcs[0] = LoadAligned16(src); 323*09537850SAkhilesh Sanikop src += 8; 324*09537850SAkhilesh Sanikop if (num_taps >= 4) { 325*09537850SAkhilesh Sanikop srcs[2] = LoadAligned16(src); 326*09537850SAkhilesh Sanikop src += 8; 327*09537850SAkhilesh Sanikop srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]); 328*09537850SAkhilesh Sanikop if (num_taps >= 6) { 329*09537850SAkhilesh Sanikop srcs[4] = LoadAligned16(src); 330*09537850SAkhilesh Sanikop src += 8; 331*09537850SAkhilesh Sanikop srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]); 332*09537850SAkhilesh Sanikop if (num_taps == 8) { 333*09537850SAkhilesh Sanikop srcs[6] = LoadAligned16(src); 334*09537850SAkhilesh Sanikop src += 8; 335*09537850SAkhilesh Sanikop srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]); 336*09537850SAkhilesh Sanikop } 337*09537850SAkhilesh Sanikop } 338*09537850SAkhilesh Sanikop } 339*09537850SAkhilesh Sanikop 340*09537850SAkhilesh Sanikop int y = height; 341*09537850SAkhilesh Sanikop do { 342*09537850SAkhilesh Sanikop srcs[num_taps] = LoadAligned16(src); 343*09537850SAkhilesh Sanikop src += 8; 344*09537850SAkhilesh Sanikop srcs[num_taps - 1] = _mm_unpacklo_epi64( 345*09537850SAkhilesh Sanikop _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]); 346*09537850SAkhilesh Sanikop 347*09537850SAkhilesh Sanikop const __m128i sum = 348*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps); 349*09537850SAkhilesh Sanikop if (is_compound) { 350*09537850SAkhilesh Sanikop StoreUnaligned16(dst16, sum); 351*09537850SAkhilesh Sanikop dst16 += 4 << 1; 352*09537850SAkhilesh Sanikop } else { 353*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(sum, sum); 354*09537850SAkhilesh Sanikop Store4(dst8, results); 355*09537850SAkhilesh Sanikop dst8 += dst_stride; 356*09537850SAkhilesh Sanikop Store4(dst8, _mm_srli_si128(results, 4)); 357*09537850SAkhilesh Sanikop dst8 += dst_stride; 358*09537850SAkhilesh Sanikop } 359*09537850SAkhilesh Sanikop 360*09537850SAkhilesh Sanikop srcs[0] = srcs[2]; 361*09537850SAkhilesh Sanikop if (num_taps >= 4) { 362*09537850SAkhilesh Sanikop srcs[1] = srcs[3]; 363*09537850SAkhilesh Sanikop srcs[2] = srcs[4]; 364*09537850SAkhilesh Sanikop if (num_taps >= 6) { 365*09537850SAkhilesh Sanikop srcs[3] = srcs[5]; 366*09537850SAkhilesh Sanikop srcs[4] = srcs[6]; 367*09537850SAkhilesh Sanikop if (num_taps == 8) { 368*09537850SAkhilesh Sanikop srcs[5] = srcs[7]; 369*09537850SAkhilesh Sanikop srcs[6] = srcs[8]; 370*09537850SAkhilesh Sanikop } 371*09537850SAkhilesh Sanikop } 372*09537850SAkhilesh Sanikop } 373*09537850SAkhilesh Sanikop y -= 2; 374*09537850SAkhilesh Sanikop } while (y != 0); 375*09537850SAkhilesh Sanikop} 376*09537850SAkhilesh Sanikop 377*09537850SAkhilesh Sanikop// Take advantage of |src_stride| == |width| to process four rows at a time. 378*09537850SAkhilesh Sanikoptemplate <int num_taps> 379*09537850SAkhilesh Sanikopvoid Filter2DVertical2xH(const uint16_t* src, void* const dst, 380*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int height, 381*09537850SAkhilesh Sanikop const __m128i* const taps) { 382*09537850SAkhilesh Sanikop constexpr int next_row = (num_taps < 6) ? 4 : 8; 383*09537850SAkhilesh Sanikop 384*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst); 385*09537850SAkhilesh Sanikop 386*09537850SAkhilesh Sanikop __m128i srcs[9]; 387*09537850SAkhilesh Sanikop srcs[0] = LoadAligned16(src); 388*09537850SAkhilesh Sanikop src += 8; 389*09537850SAkhilesh Sanikop if (num_taps >= 6) { 390*09537850SAkhilesh Sanikop srcs[4] = LoadAligned16(src); 391*09537850SAkhilesh Sanikop src += 8; 392*09537850SAkhilesh Sanikop srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); 393*09537850SAkhilesh Sanikop if (num_taps == 8) { 394*09537850SAkhilesh Sanikop srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); 395*09537850SAkhilesh Sanikop srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); 396*09537850SAkhilesh Sanikop } 397*09537850SAkhilesh Sanikop } 398*09537850SAkhilesh Sanikop 399*09537850SAkhilesh Sanikop int y = height; 400*09537850SAkhilesh Sanikop do { 401*09537850SAkhilesh Sanikop srcs[next_row] = LoadAligned16(src); 402*09537850SAkhilesh Sanikop src += 8; 403*09537850SAkhilesh Sanikop if (num_taps == 2) { 404*09537850SAkhilesh Sanikop srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); 405*09537850SAkhilesh Sanikop } else if (num_taps == 4) { 406*09537850SAkhilesh Sanikop srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4); 407*09537850SAkhilesh Sanikop srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); 408*09537850SAkhilesh Sanikop srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); 409*09537850SAkhilesh Sanikop } else if (num_taps == 6) { 410*09537850SAkhilesh Sanikop srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8); 411*09537850SAkhilesh Sanikop srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12); 412*09537850SAkhilesh Sanikop srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); 413*09537850SAkhilesh Sanikop } else if (num_taps == 8) { 414*09537850SAkhilesh Sanikop srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4); 415*09537850SAkhilesh Sanikop srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8); 416*09537850SAkhilesh Sanikop srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12); 417*09537850SAkhilesh Sanikop } 418*09537850SAkhilesh Sanikop 419*09537850SAkhilesh Sanikop const __m128i sum = 420*09537850SAkhilesh Sanikop SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps); 421*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(sum, sum); 422*09537850SAkhilesh Sanikop 423*09537850SAkhilesh Sanikop Store2(dst8, results); 424*09537850SAkhilesh Sanikop dst8 += dst_stride; 425*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 2)); 426*09537850SAkhilesh Sanikop // When |height| <= 4 the taps are restricted to 2 and 4 tap variants. 427*09537850SAkhilesh Sanikop // Therefore we don't need to check this condition when |height| > 4. 428*09537850SAkhilesh Sanikop if (num_taps <= 4 && height == 2) return; 429*09537850SAkhilesh Sanikop dst8 += dst_stride; 430*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 4)); 431*09537850SAkhilesh Sanikop dst8 += dst_stride; 432*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 6)); 433*09537850SAkhilesh Sanikop dst8 += dst_stride; 434*09537850SAkhilesh Sanikop 435*09537850SAkhilesh Sanikop srcs[0] = srcs[4]; 436*09537850SAkhilesh Sanikop if (num_taps == 6) { 437*09537850SAkhilesh Sanikop srcs[1] = srcs[5]; 438*09537850SAkhilesh Sanikop srcs[4] = srcs[8]; 439*09537850SAkhilesh Sanikop } else if (num_taps == 8) { 440*09537850SAkhilesh Sanikop srcs[1] = srcs[5]; 441*09537850SAkhilesh Sanikop srcs[2] = srcs[6]; 442*09537850SAkhilesh Sanikop srcs[3] = srcs[7]; 443*09537850SAkhilesh Sanikop srcs[4] = srcs[8]; 444*09537850SAkhilesh Sanikop } 445*09537850SAkhilesh Sanikop 446*09537850SAkhilesh Sanikop y -= 4; 447*09537850SAkhilesh Sanikop } while (y != 0); 448*09537850SAkhilesh Sanikop} 449*09537850SAkhilesh Sanikop 450*09537850SAkhilesh Sanikop// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D 451*09537850SAkhilesh Sanikop// Vertical calculations. 452*09537850SAkhilesh Sanikop__m128i Compound1DShift(const __m128i sum) { 453*09537850SAkhilesh Sanikop return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1); 454*09537850SAkhilesh Sanikop} 455*09537850SAkhilesh Sanikop 456*09537850SAkhilesh Sanikoptemplate <int num_taps> 457*09537850SAkhilesh Sanikop__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) { 458*09537850SAkhilesh Sanikop __m128i v_src[4]; 459*09537850SAkhilesh Sanikop 460*09537850SAkhilesh Sanikop if (num_taps == 6) { 461*09537850SAkhilesh Sanikop // 6 taps. 462*09537850SAkhilesh Sanikop v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); 463*09537850SAkhilesh Sanikop v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); 464*09537850SAkhilesh Sanikop v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); 465*09537850SAkhilesh Sanikop } else if (num_taps == 8) { 466*09537850SAkhilesh Sanikop // 8 taps. 467*09537850SAkhilesh Sanikop v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); 468*09537850SAkhilesh Sanikop v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); 469*09537850SAkhilesh Sanikop v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]); 470*09537850SAkhilesh Sanikop v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]); 471*09537850SAkhilesh Sanikop } else if (num_taps == 2) { 472*09537850SAkhilesh Sanikop // 2 taps. 473*09537850SAkhilesh Sanikop v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); 474*09537850SAkhilesh Sanikop } else { 475*09537850SAkhilesh Sanikop // 4 taps. 476*09537850SAkhilesh Sanikop v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]); 477*09537850SAkhilesh Sanikop v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]); 478*09537850SAkhilesh Sanikop } 479*09537850SAkhilesh Sanikop const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap); 480*09537850SAkhilesh Sanikop return sum; 481*09537850SAkhilesh Sanikop} 482*09537850SAkhilesh Sanikop 483*09537850SAkhilesh Sanikoptemplate <int num_taps, bool is_compound = false> 484*09537850SAkhilesh Sanikopvoid FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride, 485*09537850SAkhilesh Sanikop void* const dst, const ptrdiff_t dst_stride, 486*09537850SAkhilesh Sanikop const int height, const __m128i* const v_tap) { 487*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst); 488*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst); 489*09537850SAkhilesh Sanikop 490*09537850SAkhilesh Sanikop __m128i srcs[9]; 491*09537850SAkhilesh Sanikop 492*09537850SAkhilesh Sanikop if (num_taps == 2) { 493*09537850SAkhilesh Sanikop srcs[2] = _mm_setzero_si128(); 494*09537850SAkhilesh Sanikop // 00 01 02 03 495*09537850SAkhilesh Sanikop srcs[0] = Load4(src); 496*09537850SAkhilesh Sanikop src += src_stride; 497*09537850SAkhilesh Sanikop 498*09537850SAkhilesh Sanikop int y = height; 499*09537850SAkhilesh Sanikop do { 500*09537850SAkhilesh Sanikop // 10 11 12 13 501*09537850SAkhilesh Sanikop const __m128i a = Load4(src); 502*09537850SAkhilesh Sanikop // 00 01 02 03 10 11 12 13 503*09537850SAkhilesh Sanikop srcs[0] = _mm_unpacklo_epi32(srcs[0], a); 504*09537850SAkhilesh Sanikop src += src_stride; 505*09537850SAkhilesh Sanikop // 20 21 22 23 506*09537850SAkhilesh Sanikop srcs[2] = Load4(src); 507*09537850SAkhilesh Sanikop src += src_stride; 508*09537850SAkhilesh Sanikop // 10 11 12 13 20 21 22 23 509*09537850SAkhilesh Sanikop srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); 510*09537850SAkhilesh Sanikop 511*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 512*09537850SAkhilesh Sanikop if (is_compound) { 513*09537850SAkhilesh Sanikop const __m128i results = Compound1DShift(sums); 514*09537850SAkhilesh Sanikop StoreUnaligned16(dst16, results); 515*09537850SAkhilesh Sanikop dst16 += 4 << 1; 516*09537850SAkhilesh Sanikop } else { 517*09537850SAkhilesh Sanikop const __m128i results_16 = 518*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 519*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 520*09537850SAkhilesh Sanikop Store4(dst8, results); 521*09537850SAkhilesh Sanikop dst8 += dst_stride; 522*09537850SAkhilesh Sanikop Store4(dst8, _mm_srli_si128(results, 4)); 523*09537850SAkhilesh Sanikop dst8 += dst_stride; 524*09537850SAkhilesh Sanikop } 525*09537850SAkhilesh Sanikop 526*09537850SAkhilesh Sanikop srcs[0] = srcs[2]; 527*09537850SAkhilesh Sanikop y -= 2; 528*09537850SAkhilesh Sanikop } while (y != 0); 529*09537850SAkhilesh Sanikop } else if (num_taps == 4) { 530*09537850SAkhilesh Sanikop srcs[4] = _mm_setzero_si128(); 531*09537850SAkhilesh Sanikop // 00 01 02 03 532*09537850SAkhilesh Sanikop srcs[0] = Load4(src); 533*09537850SAkhilesh Sanikop src += src_stride; 534*09537850SAkhilesh Sanikop // 10 11 12 13 535*09537850SAkhilesh Sanikop const __m128i a = Load4(src); 536*09537850SAkhilesh Sanikop // 00 01 02 03 10 11 12 13 537*09537850SAkhilesh Sanikop srcs[0] = _mm_unpacklo_epi32(srcs[0], a); 538*09537850SAkhilesh Sanikop src += src_stride; 539*09537850SAkhilesh Sanikop // 20 21 22 23 540*09537850SAkhilesh Sanikop srcs[2] = Load4(src); 541*09537850SAkhilesh Sanikop src += src_stride; 542*09537850SAkhilesh Sanikop // 10 11 12 13 20 21 22 23 543*09537850SAkhilesh Sanikop srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); 544*09537850SAkhilesh Sanikop 545*09537850SAkhilesh Sanikop int y = height; 546*09537850SAkhilesh Sanikop do { 547*09537850SAkhilesh Sanikop // 30 31 32 33 548*09537850SAkhilesh Sanikop const __m128i b = Load4(src); 549*09537850SAkhilesh Sanikop // 20 21 22 23 30 31 32 33 550*09537850SAkhilesh Sanikop srcs[2] = _mm_unpacklo_epi32(srcs[2], b); 551*09537850SAkhilesh Sanikop src += src_stride; 552*09537850SAkhilesh Sanikop // 40 41 42 43 553*09537850SAkhilesh Sanikop srcs[4] = Load4(src); 554*09537850SAkhilesh Sanikop src += src_stride; 555*09537850SAkhilesh Sanikop // 30 31 32 33 40 41 42 43 556*09537850SAkhilesh Sanikop srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); 557*09537850SAkhilesh Sanikop 558*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 559*09537850SAkhilesh Sanikop if (is_compound) { 560*09537850SAkhilesh Sanikop const __m128i results = Compound1DShift(sums); 561*09537850SAkhilesh Sanikop StoreUnaligned16(dst16, results); 562*09537850SAkhilesh Sanikop dst16 += 4 << 1; 563*09537850SAkhilesh Sanikop } else { 564*09537850SAkhilesh Sanikop const __m128i results_16 = 565*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 566*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 567*09537850SAkhilesh Sanikop Store4(dst8, results); 568*09537850SAkhilesh Sanikop dst8 += dst_stride; 569*09537850SAkhilesh Sanikop Store4(dst8, _mm_srli_si128(results, 4)); 570*09537850SAkhilesh Sanikop dst8 += dst_stride; 571*09537850SAkhilesh Sanikop } 572*09537850SAkhilesh Sanikop 573*09537850SAkhilesh Sanikop srcs[0] = srcs[2]; 574*09537850SAkhilesh Sanikop srcs[1] = srcs[3]; 575*09537850SAkhilesh Sanikop srcs[2] = srcs[4]; 576*09537850SAkhilesh Sanikop y -= 2; 577*09537850SAkhilesh Sanikop } while (y != 0); 578*09537850SAkhilesh Sanikop } else if (num_taps == 6) { 579*09537850SAkhilesh Sanikop srcs[6] = _mm_setzero_si128(); 580*09537850SAkhilesh Sanikop // 00 01 02 03 581*09537850SAkhilesh Sanikop srcs[0] = Load4(src); 582*09537850SAkhilesh Sanikop src += src_stride; 583*09537850SAkhilesh Sanikop // 10 11 12 13 584*09537850SAkhilesh Sanikop const __m128i a = Load4(src); 585*09537850SAkhilesh Sanikop // 00 01 02 03 10 11 12 13 586*09537850SAkhilesh Sanikop srcs[0] = _mm_unpacklo_epi32(srcs[0], a); 587*09537850SAkhilesh Sanikop src += src_stride; 588*09537850SAkhilesh Sanikop // 20 21 22 23 589*09537850SAkhilesh Sanikop srcs[2] = Load4(src); 590*09537850SAkhilesh Sanikop src += src_stride; 591*09537850SAkhilesh Sanikop // 10 11 12 13 20 21 22 23 592*09537850SAkhilesh Sanikop srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); 593*09537850SAkhilesh Sanikop // 30 31 32 33 594*09537850SAkhilesh Sanikop const __m128i b = Load4(src); 595*09537850SAkhilesh Sanikop // 20 21 22 23 30 31 32 33 596*09537850SAkhilesh Sanikop srcs[2] = _mm_unpacklo_epi32(srcs[2], b); 597*09537850SAkhilesh Sanikop src += src_stride; 598*09537850SAkhilesh Sanikop // 40 41 42 43 599*09537850SAkhilesh Sanikop srcs[4] = Load4(src); 600*09537850SAkhilesh Sanikop src += src_stride; 601*09537850SAkhilesh Sanikop // 30 31 32 33 40 41 42 43 602*09537850SAkhilesh Sanikop srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); 603*09537850SAkhilesh Sanikop 604*09537850SAkhilesh Sanikop int y = height; 605*09537850SAkhilesh Sanikop do { 606*09537850SAkhilesh Sanikop // 50 51 52 53 607*09537850SAkhilesh Sanikop const __m128i c = Load4(src); 608*09537850SAkhilesh Sanikop // 40 41 42 43 50 51 52 53 609*09537850SAkhilesh Sanikop srcs[4] = _mm_unpacklo_epi32(srcs[4], c); 610*09537850SAkhilesh Sanikop src += src_stride; 611*09537850SAkhilesh Sanikop // 60 61 62 63 612*09537850SAkhilesh Sanikop srcs[6] = Load4(src); 613*09537850SAkhilesh Sanikop src += src_stride; 614*09537850SAkhilesh Sanikop // 50 51 52 53 60 61 62 63 615*09537850SAkhilesh Sanikop srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); 616*09537850SAkhilesh Sanikop 617*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 618*09537850SAkhilesh Sanikop if (is_compound) { 619*09537850SAkhilesh Sanikop const __m128i results = Compound1DShift(sums); 620*09537850SAkhilesh Sanikop StoreUnaligned16(dst16, results); 621*09537850SAkhilesh Sanikop dst16 += 4 << 1; 622*09537850SAkhilesh Sanikop } else { 623*09537850SAkhilesh Sanikop const __m128i results_16 = 624*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 625*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 626*09537850SAkhilesh Sanikop Store4(dst8, results); 627*09537850SAkhilesh Sanikop dst8 += dst_stride; 628*09537850SAkhilesh Sanikop Store4(dst8, _mm_srli_si128(results, 4)); 629*09537850SAkhilesh Sanikop dst8 += dst_stride; 630*09537850SAkhilesh Sanikop } 631*09537850SAkhilesh Sanikop 632*09537850SAkhilesh Sanikop srcs[0] = srcs[2]; 633*09537850SAkhilesh Sanikop srcs[1] = srcs[3]; 634*09537850SAkhilesh Sanikop srcs[2] = srcs[4]; 635*09537850SAkhilesh Sanikop srcs[3] = srcs[5]; 636*09537850SAkhilesh Sanikop srcs[4] = srcs[6]; 637*09537850SAkhilesh Sanikop y -= 2; 638*09537850SAkhilesh Sanikop } while (y != 0); 639*09537850SAkhilesh Sanikop } else if (num_taps == 8) { 640*09537850SAkhilesh Sanikop srcs[8] = _mm_setzero_si128(); 641*09537850SAkhilesh Sanikop // 00 01 02 03 642*09537850SAkhilesh Sanikop srcs[0] = Load4(src); 643*09537850SAkhilesh Sanikop src += src_stride; 644*09537850SAkhilesh Sanikop // 10 11 12 13 645*09537850SAkhilesh Sanikop const __m128i a = Load4(src); 646*09537850SAkhilesh Sanikop // 00 01 02 03 10 11 12 13 647*09537850SAkhilesh Sanikop srcs[0] = _mm_unpacklo_epi32(srcs[0], a); 648*09537850SAkhilesh Sanikop src += src_stride; 649*09537850SAkhilesh Sanikop // 20 21 22 23 650*09537850SAkhilesh Sanikop srcs[2] = Load4(src); 651*09537850SAkhilesh Sanikop src += src_stride; 652*09537850SAkhilesh Sanikop // 10 11 12 13 20 21 22 23 653*09537850SAkhilesh Sanikop srcs[1] = _mm_unpacklo_epi32(a, srcs[2]); 654*09537850SAkhilesh Sanikop // 30 31 32 33 655*09537850SAkhilesh Sanikop const __m128i b = Load4(src); 656*09537850SAkhilesh Sanikop // 20 21 22 23 30 31 32 33 657*09537850SAkhilesh Sanikop srcs[2] = _mm_unpacklo_epi32(srcs[2], b); 658*09537850SAkhilesh Sanikop src += src_stride; 659*09537850SAkhilesh Sanikop // 40 41 42 43 660*09537850SAkhilesh Sanikop srcs[4] = Load4(src); 661*09537850SAkhilesh Sanikop src += src_stride; 662*09537850SAkhilesh Sanikop // 30 31 32 33 40 41 42 43 663*09537850SAkhilesh Sanikop srcs[3] = _mm_unpacklo_epi32(b, srcs[4]); 664*09537850SAkhilesh Sanikop // 50 51 52 53 665*09537850SAkhilesh Sanikop const __m128i c = Load4(src); 666*09537850SAkhilesh Sanikop // 40 41 42 43 50 51 52 53 667*09537850SAkhilesh Sanikop srcs[4] = _mm_unpacklo_epi32(srcs[4], c); 668*09537850SAkhilesh Sanikop src += src_stride; 669*09537850SAkhilesh Sanikop // 60 61 62 63 670*09537850SAkhilesh Sanikop srcs[6] = Load4(src); 671*09537850SAkhilesh Sanikop src += src_stride; 672*09537850SAkhilesh Sanikop // 50 51 52 53 60 61 62 63 673*09537850SAkhilesh Sanikop srcs[5] = _mm_unpacklo_epi32(c, srcs[6]); 674*09537850SAkhilesh Sanikop 675*09537850SAkhilesh Sanikop int y = height; 676*09537850SAkhilesh Sanikop do { 677*09537850SAkhilesh Sanikop // 70 71 72 73 678*09537850SAkhilesh Sanikop const __m128i d = Load4(src); 679*09537850SAkhilesh Sanikop // 60 61 62 63 70 71 72 73 680*09537850SAkhilesh Sanikop srcs[6] = _mm_unpacklo_epi32(srcs[6], d); 681*09537850SAkhilesh Sanikop src += src_stride; 682*09537850SAkhilesh Sanikop // 80 81 82 83 683*09537850SAkhilesh Sanikop srcs[8] = Load4(src); 684*09537850SAkhilesh Sanikop src += src_stride; 685*09537850SAkhilesh Sanikop // 70 71 72 73 80 81 82 83 686*09537850SAkhilesh Sanikop srcs[7] = _mm_unpacklo_epi32(d, srcs[8]); 687*09537850SAkhilesh Sanikop 688*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 689*09537850SAkhilesh Sanikop if (is_compound) { 690*09537850SAkhilesh Sanikop const __m128i results = Compound1DShift(sums); 691*09537850SAkhilesh Sanikop StoreUnaligned16(dst16, results); 692*09537850SAkhilesh Sanikop dst16 += 4 << 1; 693*09537850SAkhilesh Sanikop } else { 694*09537850SAkhilesh Sanikop const __m128i results_16 = 695*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 696*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 697*09537850SAkhilesh Sanikop Store4(dst8, results); 698*09537850SAkhilesh Sanikop dst8 += dst_stride; 699*09537850SAkhilesh Sanikop Store4(dst8, _mm_srli_si128(results, 4)); 700*09537850SAkhilesh Sanikop dst8 += dst_stride; 701*09537850SAkhilesh Sanikop } 702*09537850SAkhilesh Sanikop 703*09537850SAkhilesh Sanikop srcs[0] = srcs[2]; 704*09537850SAkhilesh Sanikop srcs[1] = srcs[3]; 705*09537850SAkhilesh Sanikop srcs[2] = srcs[4]; 706*09537850SAkhilesh Sanikop srcs[3] = srcs[5]; 707*09537850SAkhilesh Sanikop srcs[4] = srcs[6]; 708*09537850SAkhilesh Sanikop srcs[5] = srcs[7]; 709*09537850SAkhilesh Sanikop srcs[6] = srcs[8]; 710*09537850SAkhilesh Sanikop y -= 2; 711*09537850SAkhilesh Sanikop } while (y != 0); 712*09537850SAkhilesh Sanikop } 713*09537850SAkhilesh Sanikop} 714*09537850SAkhilesh Sanikop 715*09537850SAkhilesh Sanikoptemplate <int num_taps, bool negative_outside_taps = false> 716*09537850SAkhilesh Sanikopvoid FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride, 717*09537850SAkhilesh Sanikop void* const dst, const ptrdiff_t dst_stride, 718*09537850SAkhilesh Sanikop const int height, const __m128i* const v_tap) { 719*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst); 720*09537850SAkhilesh Sanikop 721*09537850SAkhilesh Sanikop __m128i srcs[9]; 722*09537850SAkhilesh Sanikop 723*09537850SAkhilesh Sanikop if (num_taps == 2) { 724*09537850SAkhilesh Sanikop srcs[2] = _mm_setzero_si128(); 725*09537850SAkhilesh Sanikop // 00 01 726*09537850SAkhilesh Sanikop srcs[0] = Load2(src); 727*09537850SAkhilesh Sanikop src += src_stride; 728*09537850SAkhilesh Sanikop 729*09537850SAkhilesh Sanikop int y = height; 730*09537850SAkhilesh Sanikop do { 731*09537850SAkhilesh Sanikop // 00 01 10 11 732*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]); 733*09537850SAkhilesh Sanikop src += src_stride; 734*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 735*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]); 736*09537850SAkhilesh Sanikop src += src_stride; 737*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 738*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]); 739*09537850SAkhilesh Sanikop src += src_stride; 740*09537850SAkhilesh Sanikop // 40 41 741*09537850SAkhilesh Sanikop srcs[2] = Load2<0>(src, srcs[2]); 742*09537850SAkhilesh Sanikop src += src_stride; 743*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 40 41 744*09537850SAkhilesh Sanikop const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]); 745*09537850SAkhilesh Sanikop // 10 11 20 21 30 31 40 41 746*09537850SAkhilesh Sanikop srcs[1] = _mm_srli_si128(srcs_0_2, 2); 747*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[1]. 748*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 749*09537850SAkhilesh Sanikop const __m128i results_16 = 750*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 751*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 752*09537850SAkhilesh Sanikop 753*09537850SAkhilesh Sanikop Store2(dst8, results); 754*09537850SAkhilesh Sanikop dst8 += dst_stride; 755*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 2)); 756*09537850SAkhilesh Sanikop if (height == 2) return; 757*09537850SAkhilesh Sanikop dst8 += dst_stride; 758*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 4)); 759*09537850SAkhilesh Sanikop dst8 += dst_stride; 760*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 6)); 761*09537850SAkhilesh Sanikop dst8 += dst_stride; 762*09537850SAkhilesh Sanikop 763*09537850SAkhilesh Sanikop srcs[0] = srcs[2]; 764*09537850SAkhilesh Sanikop y -= 4; 765*09537850SAkhilesh Sanikop } while (y != 0); 766*09537850SAkhilesh Sanikop } else if (num_taps == 4) { 767*09537850SAkhilesh Sanikop srcs[4] = _mm_setzero_si128(); 768*09537850SAkhilesh Sanikop 769*09537850SAkhilesh Sanikop // 00 01 770*09537850SAkhilesh Sanikop srcs[0] = Load2(src); 771*09537850SAkhilesh Sanikop src += src_stride; 772*09537850SAkhilesh Sanikop // 00 01 10 11 773*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]); 774*09537850SAkhilesh Sanikop src += src_stride; 775*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 776*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]); 777*09537850SAkhilesh Sanikop src += src_stride; 778*09537850SAkhilesh Sanikop 779*09537850SAkhilesh Sanikop int y = height; 780*09537850SAkhilesh Sanikop do { 781*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 782*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]); 783*09537850SAkhilesh Sanikop src += src_stride; 784*09537850SAkhilesh Sanikop // 40 41 785*09537850SAkhilesh Sanikop srcs[4] = Load2<0>(src, srcs[4]); 786*09537850SAkhilesh Sanikop src += src_stride; 787*09537850SAkhilesh Sanikop // 40 41 50 51 788*09537850SAkhilesh Sanikop srcs[4] = Load2<1>(src, srcs[4]); 789*09537850SAkhilesh Sanikop src += src_stride; 790*09537850SAkhilesh Sanikop // 40 41 50 51 60 61 791*09537850SAkhilesh Sanikop srcs[4] = Load2<2>(src, srcs[4]); 792*09537850SAkhilesh Sanikop src += src_stride; 793*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 794*09537850SAkhilesh Sanikop const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); 795*09537850SAkhilesh Sanikop // 10 11 20 21 30 31 40 41 796*09537850SAkhilesh Sanikop srcs[1] = _mm_srli_si128(srcs_0_4, 2); 797*09537850SAkhilesh Sanikop // 20 21 30 31 40 41 50 51 798*09537850SAkhilesh Sanikop srcs[2] = _mm_srli_si128(srcs_0_4, 4); 799*09537850SAkhilesh Sanikop // 30 31 40 41 50 51 60 61 800*09537850SAkhilesh Sanikop srcs[3] = _mm_srli_si128(srcs_0_4, 6); 801*09537850SAkhilesh Sanikop 802*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[3]. 803*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 804*09537850SAkhilesh Sanikop const __m128i results_16 = 805*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 806*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 807*09537850SAkhilesh Sanikop 808*09537850SAkhilesh Sanikop Store2(dst8, results); 809*09537850SAkhilesh Sanikop dst8 += dst_stride; 810*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 2)); 811*09537850SAkhilesh Sanikop if (height == 2) return; 812*09537850SAkhilesh Sanikop dst8 += dst_stride; 813*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 4)); 814*09537850SAkhilesh Sanikop dst8 += dst_stride; 815*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 6)); 816*09537850SAkhilesh Sanikop dst8 += dst_stride; 817*09537850SAkhilesh Sanikop 818*09537850SAkhilesh Sanikop srcs[0] = srcs[4]; 819*09537850SAkhilesh Sanikop y -= 4; 820*09537850SAkhilesh Sanikop } while (y != 0); 821*09537850SAkhilesh Sanikop } else if (num_taps == 6) { 822*09537850SAkhilesh Sanikop // During the vertical pass the number of taps is restricted when 823*09537850SAkhilesh Sanikop // |height| <= 4. 824*09537850SAkhilesh Sanikop assert(height > 4); 825*09537850SAkhilesh Sanikop srcs[8] = _mm_setzero_si128(); 826*09537850SAkhilesh Sanikop 827*09537850SAkhilesh Sanikop // 00 01 828*09537850SAkhilesh Sanikop srcs[0] = Load2(src); 829*09537850SAkhilesh Sanikop src += src_stride; 830*09537850SAkhilesh Sanikop // 00 01 10 11 831*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]); 832*09537850SAkhilesh Sanikop src += src_stride; 833*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 834*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]); 835*09537850SAkhilesh Sanikop src += src_stride; 836*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 837*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]); 838*09537850SAkhilesh Sanikop src += src_stride; 839*09537850SAkhilesh Sanikop // 40 41 840*09537850SAkhilesh Sanikop srcs[4] = Load2(src); 841*09537850SAkhilesh Sanikop src += src_stride; 842*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 843*09537850SAkhilesh Sanikop const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]); 844*09537850SAkhilesh Sanikop // 10 11 20 21 30 31 40 41 845*09537850SAkhilesh Sanikop srcs[1] = _mm_srli_si128(srcs_0_4x, 2); 846*09537850SAkhilesh Sanikop 847*09537850SAkhilesh Sanikop int y = height; 848*09537850SAkhilesh Sanikop do { 849*09537850SAkhilesh Sanikop // 40 41 50 51 850*09537850SAkhilesh Sanikop srcs[4] = Load2<1>(src, srcs[4]); 851*09537850SAkhilesh Sanikop src += src_stride; 852*09537850SAkhilesh Sanikop // 40 41 50 51 60 61 853*09537850SAkhilesh Sanikop srcs[4] = Load2<2>(src, srcs[4]); 854*09537850SAkhilesh Sanikop src += src_stride; 855*09537850SAkhilesh Sanikop // 40 41 50 51 60 61 70 71 856*09537850SAkhilesh Sanikop srcs[4] = Load2<3>(src, srcs[4]); 857*09537850SAkhilesh Sanikop src += src_stride; 858*09537850SAkhilesh Sanikop // 80 81 859*09537850SAkhilesh Sanikop srcs[8] = Load2<0>(src, srcs[8]); 860*09537850SAkhilesh Sanikop src += src_stride; 861*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 862*09537850SAkhilesh Sanikop const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); 863*09537850SAkhilesh Sanikop // 20 21 30 31 40 41 50 51 864*09537850SAkhilesh Sanikop srcs[2] = _mm_srli_si128(srcs_0_4, 4); 865*09537850SAkhilesh Sanikop // 30 31 40 41 50 51 60 61 866*09537850SAkhilesh Sanikop srcs[3] = _mm_srli_si128(srcs_0_4, 6); 867*09537850SAkhilesh Sanikop const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); 868*09537850SAkhilesh Sanikop // 50 51 60 61 70 71 80 81 869*09537850SAkhilesh Sanikop srcs[5] = _mm_srli_si128(srcs_4_8, 2); 870*09537850SAkhilesh Sanikop 871*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[5]. 872*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 873*09537850SAkhilesh Sanikop const __m128i results_16 = 874*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 875*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 876*09537850SAkhilesh Sanikop 877*09537850SAkhilesh Sanikop Store2(dst8, results); 878*09537850SAkhilesh Sanikop dst8 += dst_stride; 879*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 2)); 880*09537850SAkhilesh Sanikop dst8 += dst_stride; 881*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 4)); 882*09537850SAkhilesh Sanikop dst8 += dst_stride; 883*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 6)); 884*09537850SAkhilesh Sanikop dst8 += dst_stride; 885*09537850SAkhilesh Sanikop 886*09537850SAkhilesh Sanikop srcs[0] = srcs[4]; 887*09537850SAkhilesh Sanikop srcs[1] = srcs[5]; 888*09537850SAkhilesh Sanikop srcs[4] = srcs[8]; 889*09537850SAkhilesh Sanikop y -= 4; 890*09537850SAkhilesh Sanikop } while (y != 0); 891*09537850SAkhilesh Sanikop } else if (num_taps == 8) { 892*09537850SAkhilesh Sanikop // During the vertical pass the number of taps is restricted when 893*09537850SAkhilesh Sanikop // |height| <= 4. 894*09537850SAkhilesh Sanikop assert(height > 4); 895*09537850SAkhilesh Sanikop srcs[8] = _mm_setzero_si128(); 896*09537850SAkhilesh Sanikop // 00 01 897*09537850SAkhilesh Sanikop srcs[0] = Load2(src); 898*09537850SAkhilesh Sanikop src += src_stride; 899*09537850SAkhilesh Sanikop // 00 01 10 11 900*09537850SAkhilesh Sanikop srcs[0] = Load2<1>(src, srcs[0]); 901*09537850SAkhilesh Sanikop src += src_stride; 902*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 903*09537850SAkhilesh Sanikop srcs[0] = Load2<2>(src, srcs[0]); 904*09537850SAkhilesh Sanikop src += src_stride; 905*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 906*09537850SAkhilesh Sanikop srcs[0] = Load2<3>(src, srcs[0]); 907*09537850SAkhilesh Sanikop src += src_stride; 908*09537850SAkhilesh Sanikop // 40 41 909*09537850SAkhilesh Sanikop srcs[4] = Load2(src); 910*09537850SAkhilesh Sanikop src += src_stride; 911*09537850SAkhilesh Sanikop // 40 41 50 51 912*09537850SAkhilesh Sanikop srcs[4] = Load2<1>(src, srcs[4]); 913*09537850SAkhilesh Sanikop src += src_stride; 914*09537850SAkhilesh Sanikop // 40 41 50 51 60 61 915*09537850SAkhilesh Sanikop srcs[4] = Load2<2>(src, srcs[4]); 916*09537850SAkhilesh Sanikop src += src_stride; 917*09537850SAkhilesh Sanikop 918*09537850SAkhilesh Sanikop // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 919*09537850SAkhilesh Sanikop const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]); 920*09537850SAkhilesh Sanikop // 10 11 20 21 30 31 40 41 921*09537850SAkhilesh Sanikop srcs[1] = _mm_srli_si128(srcs_0_4, 2); 922*09537850SAkhilesh Sanikop // 20 21 30 31 40 41 50 51 923*09537850SAkhilesh Sanikop srcs[2] = _mm_srli_si128(srcs_0_4, 4); 924*09537850SAkhilesh Sanikop // 30 31 40 41 50 51 60 61 925*09537850SAkhilesh Sanikop srcs[3] = _mm_srli_si128(srcs_0_4, 6); 926*09537850SAkhilesh Sanikop 927*09537850SAkhilesh Sanikop int y = height; 928*09537850SAkhilesh Sanikop do { 929*09537850SAkhilesh Sanikop // 40 41 50 51 60 61 70 71 930*09537850SAkhilesh Sanikop srcs[4] = Load2<3>(src, srcs[4]); 931*09537850SAkhilesh Sanikop src += src_stride; 932*09537850SAkhilesh Sanikop // 80 81 933*09537850SAkhilesh Sanikop srcs[8] = Load2<0>(src, srcs[8]); 934*09537850SAkhilesh Sanikop src += src_stride; 935*09537850SAkhilesh Sanikop // 80 81 90 91 936*09537850SAkhilesh Sanikop srcs[8] = Load2<1>(src, srcs[8]); 937*09537850SAkhilesh Sanikop src += src_stride; 938*09537850SAkhilesh Sanikop // 80 81 90 91 a0 a1 939*09537850SAkhilesh Sanikop srcs[8] = Load2<2>(src, srcs[8]); 940*09537850SAkhilesh Sanikop src += src_stride; 941*09537850SAkhilesh Sanikop 942*09537850SAkhilesh Sanikop // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1 943*09537850SAkhilesh Sanikop const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]); 944*09537850SAkhilesh Sanikop // 50 51 60 61 70 71 80 81 945*09537850SAkhilesh Sanikop srcs[5] = _mm_srli_si128(srcs_4_8, 2); 946*09537850SAkhilesh Sanikop // 60 61 70 71 80 81 90 91 947*09537850SAkhilesh Sanikop srcs[6] = _mm_srli_si128(srcs_4_8, 4); 948*09537850SAkhilesh Sanikop // 70 71 80 81 90 91 a0 a1 949*09537850SAkhilesh Sanikop srcs[7] = _mm_srli_si128(srcs_4_8, 6); 950*09537850SAkhilesh Sanikop 951*09537850SAkhilesh Sanikop // This uses srcs[0]..srcs[7]. 952*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap); 953*09537850SAkhilesh Sanikop const __m128i results_16 = 954*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1); 955*09537850SAkhilesh Sanikop const __m128i results = _mm_packus_epi16(results_16, results_16); 956*09537850SAkhilesh Sanikop 957*09537850SAkhilesh Sanikop Store2(dst8, results); 958*09537850SAkhilesh Sanikop dst8 += dst_stride; 959*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 2)); 960*09537850SAkhilesh Sanikop dst8 += dst_stride; 961*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 4)); 962*09537850SAkhilesh Sanikop dst8 += dst_stride; 963*09537850SAkhilesh Sanikop Store2(dst8, _mm_srli_si128(results, 6)); 964*09537850SAkhilesh Sanikop dst8 += dst_stride; 965*09537850SAkhilesh Sanikop 966*09537850SAkhilesh Sanikop srcs[0] = srcs[4]; 967*09537850SAkhilesh Sanikop srcs[1] = srcs[5]; 968*09537850SAkhilesh Sanikop srcs[2] = srcs[6]; 969*09537850SAkhilesh Sanikop srcs[3] = srcs[7]; 970*09537850SAkhilesh Sanikop srcs[4] = srcs[8]; 971*09537850SAkhilesh Sanikop y -= 4; 972*09537850SAkhilesh Sanikop } while (y != 0); 973*09537850SAkhilesh Sanikop } 974*09537850SAkhilesh Sanikop} 975