// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/obmc.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop
18*09537850SAkhilesh Sanikop #if LIBGAV1_TARGETING_SSE4_1
19*09537850SAkhilesh Sanikop
20*09537850SAkhilesh Sanikop #include <xmmintrin.h>
21*09537850SAkhilesh Sanikop
22*09537850SAkhilesh Sanikop #include <cassert>
23*09537850SAkhilesh Sanikop #include <cstddef>
24*09537850SAkhilesh Sanikop #include <cstdint>
25*09537850SAkhilesh Sanikop
26*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
27*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
28*09537850SAkhilesh Sanikop #include "src/dsp/x86/common_sse4.h"
29*09537850SAkhilesh Sanikop #include "src/utils/common.h"
30*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
31*09537850SAkhilesh Sanikop
32*09537850SAkhilesh Sanikop namespace libgav1 {
33*09537850SAkhilesh Sanikop namespace dsp {
34*09537850SAkhilesh Sanikop namespace low_bitdepth {
35*09537850SAkhilesh Sanikop namespace {
36*09537850SAkhilesh Sanikop
37*09537850SAkhilesh Sanikop #include "src/dsp/obmc.inc"
38*09537850SAkhilesh Sanikop
OverlapBlendFromLeft2xH_SSE4_1(uint8_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT const obmc_prediction)39*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft2xH_SSE4_1(
40*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const prediction,
41*09537850SAkhilesh Sanikop const ptrdiff_t prediction_stride, const int height,
42*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
43*09537850SAkhilesh Sanikop constexpr int obmc_prediction_stride = 2;
44*09537850SAkhilesh Sanikop uint8_t* pred = prediction;
45*09537850SAkhilesh Sanikop const uint8_t* obmc_pred = obmc_prediction;
46*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
47*09537850SAkhilesh Sanikop const __m128i mask_val = _mm_shufflelo_epi16(Load4(kObmcMask), 0);
48*09537850SAkhilesh Sanikop // 64 - mask
49*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
50*09537850SAkhilesh Sanikop const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
51*09537850SAkhilesh Sanikop int y = height;
52*09537850SAkhilesh Sanikop do {
53*09537850SAkhilesh Sanikop const __m128i pred_val = Load2x2(pred, pred + prediction_stride);
54*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = Load4(obmc_pred);
55*09537850SAkhilesh Sanikop
56*09537850SAkhilesh Sanikop const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
57*09537850SAkhilesh Sanikop const __m128i result =
58*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
59*09537850SAkhilesh Sanikop const __m128i packed_result = _mm_packus_epi16(result, result);
60*09537850SAkhilesh Sanikop Store2(pred, packed_result);
61*09537850SAkhilesh Sanikop pred += prediction_stride;
62*09537850SAkhilesh Sanikop const int16_t second_row_result = _mm_extract_epi16(packed_result, 1);
63*09537850SAkhilesh Sanikop memcpy(pred, &second_row_result, sizeof(second_row_result));
64*09537850SAkhilesh Sanikop pred += prediction_stride;
65*09537850SAkhilesh Sanikop obmc_pred += obmc_prediction_stride << 1;
66*09537850SAkhilesh Sanikop y -= 2;
67*09537850SAkhilesh Sanikop } while (y != 0);
68*09537850SAkhilesh Sanikop }
69*09537850SAkhilesh Sanikop
OverlapBlendFromLeft4xH_SSE4_1(uint8_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT const obmc_prediction)70*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft4xH_SSE4_1(
71*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const prediction,
72*09537850SAkhilesh Sanikop const ptrdiff_t prediction_stride, const int height,
73*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
74*09537850SAkhilesh Sanikop constexpr int obmc_prediction_stride = 4;
75*09537850SAkhilesh Sanikop uint8_t* pred = prediction;
76*09537850SAkhilesh Sanikop const uint8_t* obmc_pred = obmc_prediction;
77*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
78*09537850SAkhilesh Sanikop const __m128i mask_val = Load4(kObmcMask + 2);
79*09537850SAkhilesh Sanikop // 64 - mask
80*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
81*09537850SAkhilesh Sanikop // Duplicate first half of vector.
82*09537850SAkhilesh Sanikop const __m128i masks =
83*09537850SAkhilesh Sanikop _mm_shuffle_epi32(_mm_unpacklo_epi8(mask_val, obmc_mask_val), 0x44);
84*09537850SAkhilesh Sanikop int y = height;
85*09537850SAkhilesh Sanikop do {
86*09537850SAkhilesh Sanikop const __m128i pred_val0 = Load4(pred);
87*09537850SAkhilesh Sanikop pred += prediction_stride;
88*09537850SAkhilesh Sanikop
89*09537850SAkhilesh Sanikop // Place the second row of each source in the second four bytes.
90*09537850SAkhilesh Sanikop const __m128i pred_val =
91*09537850SAkhilesh Sanikop _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
92*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadLo8(obmc_pred);
93*09537850SAkhilesh Sanikop const __m128i terms = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
94*09537850SAkhilesh Sanikop const __m128i result =
95*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
96*09537850SAkhilesh Sanikop const __m128i packed_result = _mm_packus_epi16(result, result);
97*09537850SAkhilesh Sanikop Store4(pred - prediction_stride, packed_result);
98*09537850SAkhilesh Sanikop const int second_row_result = _mm_extract_epi32(packed_result, 1);
99*09537850SAkhilesh Sanikop memcpy(pred, &second_row_result, sizeof(second_row_result));
100*09537850SAkhilesh Sanikop pred += prediction_stride;
101*09537850SAkhilesh Sanikop obmc_pred += obmc_prediction_stride << 1;
102*09537850SAkhilesh Sanikop y -= 2;
103*09537850SAkhilesh Sanikop } while (y != 0);
104*09537850SAkhilesh Sanikop }
105*09537850SAkhilesh Sanikop
OverlapBlendFromLeft8xH_SSE4_1(uint8_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT const obmc_prediction)106*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft8xH_SSE4_1(
107*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const prediction,
108*09537850SAkhilesh Sanikop const ptrdiff_t prediction_stride, const int height,
109*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
110*09537850SAkhilesh Sanikop constexpr int obmc_prediction_stride = 8;
111*09537850SAkhilesh Sanikop uint8_t* pred = prediction;
112*09537850SAkhilesh Sanikop const uint8_t* obmc_pred = obmc_prediction;
113*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi8(64);
114*09537850SAkhilesh Sanikop const __m128i mask_val = LoadLo8(kObmcMask + 6);
115*09537850SAkhilesh Sanikop // 64 - mask
116*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
117*09537850SAkhilesh Sanikop const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
118*09537850SAkhilesh Sanikop int y = height;
119*09537850SAkhilesh Sanikop do {
120*09537850SAkhilesh Sanikop const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
121*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
122*09537850SAkhilesh Sanikop
123*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
124*09537850SAkhilesh Sanikop const __m128i result_lo =
125*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
126*09537850SAkhilesh Sanikop
127*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
128*09537850SAkhilesh Sanikop const __m128i result_hi =
129*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
130*09537850SAkhilesh Sanikop
131*09537850SAkhilesh Sanikop const __m128i result = _mm_packus_epi16(result_lo, result_hi);
132*09537850SAkhilesh Sanikop StoreLo8(pred, result);
133*09537850SAkhilesh Sanikop pred += prediction_stride;
134*09537850SAkhilesh Sanikop StoreHi8(pred, result);
135*09537850SAkhilesh Sanikop pred += prediction_stride;
136*09537850SAkhilesh Sanikop obmc_pred += obmc_prediction_stride << 1;
137*09537850SAkhilesh Sanikop y -= 2;
138*09537850SAkhilesh Sanikop } while (y != 0);
139*09537850SAkhilesh Sanikop }
140*09537850SAkhilesh Sanikop
OverlapBlendFromLeft_SSE4_1(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)141*09537850SAkhilesh Sanikop void OverlapBlendFromLeft_SSE4_1(
142*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
143*09537850SAkhilesh Sanikop const int width, const int height,
144*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const obmc_prediction,
145*09537850SAkhilesh Sanikop const ptrdiff_t obmc_prediction_stride) {
146*09537850SAkhilesh Sanikop auto* pred = static_cast<uint8_t*>(prediction);
147*09537850SAkhilesh Sanikop const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
148*09537850SAkhilesh Sanikop assert(width >= 2);
149*09537850SAkhilesh Sanikop assert(height >= 4);
150*09537850SAkhilesh Sanikop
151*09537850SAkhilesh Sanikop if (width == 2) {
152*09537850SAkhilesh Sanikop OverlapBlendFromLeft2xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
153*09537850SAkhilesh Sanikop return;
154*09537850SAkhilesh Sanikop }
155*09537850SAkhilesh Sanikop if (width == 4) {
156*09537850SAkhilesh Sanikop OverlapBlendFromLeft4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
157*09537850SAkhilesh Sanikop return;
158*09537850SAkhilesh Sanikop }
159*09537850SAkhilesh Sanikop if (width == 8) {
160*09537850SAkhilesh Sanikop OverlapBlendFromLeft8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
161*09537850SAkhilesh Sanikop return;
162*09537850SAkhilesh Sanikop }
163*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi8(64);
164*09537850SAkhilesh Sanikop const uint8_t* mask = kObmcMask + width - 2;
165*09537850SAkhilesh Sanikop int x = 0;
166*09537850SAkhilesh Sanikop do {
167*09537850SAkhilesh Sanikop pred = static_cast<uint8_t*>(prediction) + x;
168*09537850SAkhilesh Sanikop obmc_pred = static_cast<const uint8_t*>(obmc_prediction) + x;
169*09537850SAkhilesh Sanikop const __m128i mask_val = LoadUnaligned16(mask + x);
170*09537850SAkhilesh Sanikop // 64 - mask
171*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
172*09537850SAkhilesh Sanikop const __m128i masks_lo = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
173*09537850SAkhilesh Sanikop const __m128i masks_hi = _mm_unpackhi_epi8(mask_val, obmc_mask_val);
174*09537850SAkhilesh Sanikop
175*09537850SAkhilesh Sanikop int y = 0;
176*09537850SAkhilesh Sanikop do {
177*09537850SAkhilesh Sanikop const __m128i pred_val = LoadUnaligned16(pred);
178*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
179*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
180*09537850SAkhilesh Sanikop const __m128i result_lo =
181*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks_lo), 6);
182*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
183*09537850SAkhilesh Sanikop const __m128i result_hi =
184*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks_hi), 6);
185*09537850SAkhilesh Sanikop StoreUnaligned16(pred, _mm_packus_epi16(result_lo, result_hi));
186*09537850SAkhilesh Sanikop
187*09537850SAkhilesh Sanikop pred += prediction_stride;
188*09537850SAkhilesh Sanikop obmc_pred += obmc_prediction_stride;
189*09537850SAkhilesh Sanikop } while (++y < height);
190*09537850SAkhilesh Sanikop x += 16;
191*09537850SAkhilesh Sanikop } while (x < width);
192*09537850SAkhilesh Sanikop }
193*09537850SAkhilesh Sanikop
OverlapBlendFromTop4xH_SSE4_1(uint8_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT const obmc_prediction)194*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop4xH_SSE4_1(
195*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const prediction,
196*09537850SAkhilesh Sanikop const ptrdiff_t prediction_stride, const int height,
197*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
198*09537850SAkhilesh Sanikop constexpr int obmc_prediction_stride = 4;
199*09537850SAkhilesh Sanikop uint8_t* pred = prediction;
200*09537850SAkhilesh Sanikop const uint8_t* obmc_pred = obmc_prediction;
201*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi16(64);
202*09537850SAkhilesh Sanikop const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
203*09537850SAkhilesh Sanikop const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
204*09537850SAkhilesh Sanikop
205*09537850SAkhilesh Sanikop const uint8_t* mask = kObmcMask + height - 2;
206*09537850SAkhilesh Sanikop const int compute_height = height - (height >> 2);
207*09537850SAkhilesh Sanikop int y = 0;
208*09537850SAkhilesh Sanikop do {
209*09537850SAkhilesh Sanikop // First mask in the first half, second mask in the second half.
210*09537850SAkhilesh Sanikop const __m128i mask_val = _mm_shuffle_epi8(
211*09537850SAkhilesh Sanikop _mm_cvtsi32_si128(*reinterpret_cast<const uint16_t*>(mask + y)),
212*09537850SAkhilesh Sanikop mask_shuffler);
213*09537850SAkhilesh Sanikop const __m128i masks =
214*09537850SAkhilesh Sanikop _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
215*09537850SAkhilesh Sanikop const __m128i pred_val0 = Load4(pred);
216*09537850SAkhilesh Sanikop
217*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadLo8(obmc_pred);
218*09537850SAkhilesh Sanikop pred += prediction_stride;
219*09537850SAkhilesh Sanikop const __m128i pred_val =
220*09537850SAkhilesh Sanikop _mm_alignr_epi8(Load4(pred), _mm_slli_si128(pred_val0, 12), 12);
221*09537850SAkhilesh Sanikop const __m128i terms = _mm_unpacklo_epi8(obmc_pred_val, pred_val);
222*09537850SAkhilesh Sanikop const __m128i result =
223*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms, masks), 6);
224*09537850SAkhilesh Sanikop
225*09537850SAkhilesh Sanikop const __m128i packed_result = _mm_packus_epi16(result, result);
226*09537850SAkhilesh Sanikop Store4(pred - prediction_stride, packed_result);
227*09537850SAkhilesh Sanikop Store4(pred, _mm_srli_si128(packed_result, 4));
228*09537850SAkhilesh Sanikop pred += prediction_stride;
229*09537850SAkhilesh Sanikop obmc_pred += obmc_prediction_stride << 1;
230*09537850SAkhilesh Sanikop y += 2;
231*09537850SAkhilesh Sanikop } while (y < compute_height);
232*09537850SAkhilesh Sanikop }
233*09537850SAkhilesh Sanikop
OverlapBlendFromTop8xH_SSE4_1(uint8_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int height,const uint8_t * LIBGAV1_RESTRICT const obmc_prediction)234*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop8xH_SSE4_1(
235*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const prediction,
236*09537850SAkhilesh Sanikop const ptrdiff_t prediction_stride, const int height,
237*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const obmc_prediction) {
238*09537850SAkhilesh Sanikop constexpr int obmc_prediction_stride = 8;
239*09537850SAkhilesh Sanikop uint8_t* pred = prediction;
240*09537850SAkhilesh Sanikop const uint8_t* obmc_pred = obmc_prediction;
241*09537850SAkhilesh Sanikop const uint8_t* mask = kObmcMask + height - 2;
242*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi8(64);
243*09537850SAkhilesh Sanikop const int compute_height = height - (height >> 2);
244*09537850SAkhilesh Sanikop int y = compute_height;
245*09537850SAkhilesh Sanikop do {
246*09537850SAkhilesh Sanikop const __m128i mask_val0 = _mm_set1_epi8(mask[compute_height - y]);
247*09537850SAkhilesh Sanikop // 64 - mask
248*09537850SAkhilesh Sanikop const __m128i obmc_mask_val0 = _mm_sub_epi8(mask_inverter, mask_val0);
249*09537850SAkhilesh Sanikop const __m128i masks0 = _mm_unpacklo_epi8(mask_val0, obmc_mask_val0);
250*09537850SAkhilesh Sanikop
251*09537850SAkhilesh Sanikop const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + prediction_stride);
252*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
253*09537850SAkhilesh Sanikop
254*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
255*09537850SAkhilesh Sanikop const __m128i result_lo =
256*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks0), 6);
257*09537850SAkhilesh Sanikop
258*09537850SAkhilesh Sanikop --y;
259*09537850SAkhilesh Sanikop const __m128i mask_val1 = _mm_set1_epi8(mask[compute_height - y]);
260*09537850SAkhilesh Sanikop // 64 - mask
261*09537850SAkhilesh Sanikop const __m128i obmc_mask_val1 = _mm_sub_epi8(mask_inverter, mask_val1);
262*09537850SAkhilesh Sanikop const __m128i masks1 = _mm_unpacklo_epi8(mask_val1, obmc_mask_val1);
263*09537850SAkhilesh Sanikop
264*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
265*09537850SAkhilesh Sanikop const __m128i result_hi =
266*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks1), 6);
267*09537850SAkhilesh Sanikop
268*09537850SAkhilesh Sanikop const __m128i result = _mm_packus_epi16(result_lo, result_hi);
269*09537850SAkhilesh Sanikop StoreLo8(pred, result);
270*09537850SAkhilesh Sanikop pred += prediction_stride;
271*09537850SAkhilesh Sanikop StoreHi8(pred, result);
272*09537850SAkhilesh Sanikop pred += prediction_stride;
273*09537850SAkhilesh Sanikop obmc_pred += obmc_prediction_stride << 1;
274*09537850SAkhilesh Sanikop } while (--y > 0);
275*09537850SAkhilesh Sanikop }
276*09537850SAkhilesh Sanikop
OverlapBlendFromTop_SSE4_1(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)277*09537850SAkhilesh Sanikop void OverlapBlendFromTop_SSE4_1(
278*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
279*09537850SAkhilesh Sanikop const int width, const int height,
280*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const obmc_prediction,
281*09537850SAkhilesh Sanikop const ptrdiff_t obmc_prediction_stride) {
282*09537850SAkhilesh Sanikop auto* pred = static_cast<uint8_t*>(prediction);
283*09537850SAkhilesh Sanikop const auto* obmc_pred = static_cast<const uint8_t*>(obmc_prediction);
284*09537850SAkhilesh Sanikop assert(width >= 4);
285*09537850SAkhilesh Sanikop assert(height >= 2);
286*09537850SAkhilesh Sanikop
287*09537850SAkhilesh Sanikop if (width == 4) {
288*09537850SAkhilesh Sanikop OverlapBlendFromTop4xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
289*09537850SAkhilesh Sanikop return;
290*09537850SAkhilesh Sanikop }
291*09537850SAkhilesh Sanikop if (width == 8) {
292*09537850SAkhilesh Sanikop OverlapBlendFromTop8xH_SSE4_1(pred, prediction_stride, height, obmc_pred);
293*09537850SAkhilesh Sanikop return;
294*09537850SAkhilesh Sanikop }
295*09537850SAkhilesh Sanikop
296*09537850SAkhilesh Sanikop // Stop when mask value becomes 64.
297*09537850SAkhilesh Sanikop const int compute_height = height - (height >> 2);
298*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi8(64);
299*09537850SAkhilesh Sanikop int y = 0;
300*09537850SAkhilesh Sanikop const uint8_t* mask = kObmcMask + height - 2;
301*09537850SAkhilesh Sanikop do {
302*09537850SAkhilesh Sanikop const __m128i mask_val = _mm_set1_epi8(mask[y]);
303*09537850SAkhilesh Sanikop // 64 - mask
304*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
305*09537850SAkhilesh Sanikop const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
306*09537850SAkhilesh Sanikop int x = 0;
307*09537850SAkhilesh Sanikop do {
308*09537850SAkhilesh Sanikop const __m128i pred_val = LoadUnaligned16(pred + x);
309*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
310*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi8(pred_val, obmc_pred_val);
311*09537850SAkhilesh Sanikop const __m128i result_lo =
312*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_lo, masks), 6);
313*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi8(pred_val, obmc_pred_val);
314*09537850SAkhilesh Sanikop const __m128i result_hi =
315*09537850SAkhilesh Sanikop RightShiftWithRounding_U16(_mm_maddubs_epi16(terms_hi, masks), 6);
316*09537850SAkhilesh Sanikop StoreUnaligned16(pred + x, _mm_packus_epi16(result_lo, result_hi));
317*09537850SAkhilesh Sanikop x += 16;
318*09537850SAkhilesh Sanikop } while (x < width);
319*09537850SAkhilesh Sanikop pred += prediction_stride;
320*09537850SAkhilesh Sanikop obmc_pred += obmc_prediction_stride;
321*09537850SAkhilesh Sanikop } while (++y < compute_height);
322*09537850SAkhilesh Sanikop }
323*09537850SAkhilesh Sanikop
Init8bpp()324*09537850SAkhilesh Sanikop void Init8bpp() {
325*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
326*09537850SAkhilesh Sanikop assert(dsp != nullptr);
327*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(ObmcVertical)
328*09537850SAkhilesh Sanikop dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop_SSE4_1;
329*09537850SAkhilesh Sanikop #endif
330*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(ObmcHorizontal)
331*09537850SAkhilesh Sanikop dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft_SSE4_1;
332*09537850SAkhilesh Sanikop #endif
333*09537850SAkhilesh Sanikop }
334*09537850SAkhilesh Sanikop
335*09537850SAkhilesh Sanikop } // namespace
336*09537850SAkhilesh Sanikop } // namespace low_bitdepth
337*09537850SAkhilesh Sanikop
338*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
339*09537850SAkhilesh Sanikop namespace high_bitdepth {
340*09537850SAkhilesh Sanikop namespace {
341*09537850SAkhilesh Sanikop
342*09537850SAkhilesh Sanikop #include "src/dsp/obmc.inc"
343*09537850SAkhilesh Sanikop
// Shift applied (with rounding) to blended sums in the high bitdepth
// functions. The two weights of every pixel pair add up to 64 == 1 << 6.
constexpr int kRoundBitsObmcBlend = 6;
345*09537850SAkhilesh Sanikop
OverlapBlendFromLeft2xH_SSE4_1(uint16_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride,const int height,const uint16_t * LIBGAV1_RESTRICT const obmc_prediction)346*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft2xH_SSE4_1(
347*09537850SAkhilesh Sanikop uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
348*09537850SAkhilesh Sanikop const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
349*09537850SAkhilesh Sanikop constexpr int obmc_pred_stride = 2;
350*09537850SAkhilesh Sanikop uint16_t* pred = prediction;
351*09537850SAkhilesh Sanikop const uint16_t* obmc_pred = obmc_prediction;
352*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride2 = pred_stride << 1;
353*09537850SAkhilesh Sanikop const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
354*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
355*09537850SAkhilesh Sanikop const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00);
356*09537850SAkhilesh Sanikop // 64 - mask.
357*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
358*09537850SAkhilesh Sanikop const __m128i masks =
359*09537850SAkhilesh Sanikop _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
360*09537850SAkhilesh Sanikop int y = height;
361*09537850SAkhilesh Sanikop do {
362*09537850SAkhilesh Sanikop const __m128i pred_val = Load4x2(pred, pred + pred_stride);
363*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadLo8(obmc_pred);
364*09537850SAkhilesh Sanikop const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
365*09537850SAkhilesh Sanikop const __m128i result = RightShiftWithRounding_U32(
366*09537850SAkhilesh Sanikop _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
367*09537850SAkhilesh Sanikop const __m128i packed_result = _mm_packus_epi32(result, result);
368*09537850SAkhilesh Sanikop Store4(pred, packed_result);
369*09537850SAkhilesh Sanikop Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4));
370*09537850SAkhilesh Sanikop pred += pred_stride2;
371*09537850SAkhilesh Sanikop obmc_pred += obmc_pred_stride2;
372*09537850SAkhilesh Sanikop y -= 2;
373*09537850SAkhilesh Sanikop } while (y != 0);
374*09537850SAkhilesh Sanikop }
375*09537850SAkhilesh Sanikop
OverlapBlendFromLeft4xH_SSE4_1(uint16_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride,const int height,const uint16_t * LIBGAV1_RESTRICT const obmc_prediction)376*09537850SAkhilesh Sanikop inline void OverlapBlendFromLeft4xH_SSE4_1(
377*09537850SAkhilesh Sanikop uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
378*09537850SAkhilesh Sanikop const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
379*09537850SAkhilesh Sanikop constexpr int obmc_pred_stride = 4;
380*09537850SAkhilesh Sanikop uint16_t* pred = prediction;
381*09537850SAkhilesh Sanikop const uint16_t* obmc_pred = obmc_prediction;
382*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride2 = pred_stride << 1;
383*09537850SAkhilesh Sanikop const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
384*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
385*09537850SAkhilesh Sanikop const __m128i mask_val = Load4(kObmcMask + 2);
386*09537850SAkhilesh Sanikop // 64 - mask.
387*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
388*09537850SAkhilesh Sanikop const __m128i masks =
389*09537850SAkhilesh Sanikop _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
390*09537850SAkhilesh Sanikop int y = height;
391*09537850SAkhilesh Sanikop do {
392*09537850SAkhilesh Sanikop const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
393*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
394*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
395*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
396*09537850SAkhilesh Sanikop const __m128i result_lo = RightShiftWithRounding_U32(
397*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend);
398*09537850SAkhilesh Sanikop const __m128i result_hi = RightShiftWithRounding_U32(
399*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend);
400*09537850SAkhilesh Sanikop const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
401*09537850SAkhilesh Sanikop StoreLo8(pred, packed_result);
402*09537850SAkhilesh Sanikop StoreHi8(pred + pred_stride, packed_result);
403*09537850SAkhilesh Sanikop pred += pred_stride2;
404*09537850SAkhilesh Sanikop obmc_pred += obmc_pred_stride2;
405*09537850SAkhilesh Sanikop y -= 2;
406*09537850SAkhilesh Sanikop } while (y != 0);
407*09537850SAkhilesh Sanikop }
408*09537850SAkhilesh Sanikop
OverlapBlendFromLeft10bpp_SSE4_1(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)409*09537850SAkhilesh Sanikop void OverlapBlendFromLeft10bpp_SSE4_1(
410*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
411*09537850SAkhilesh Sanikop const int width, const int height,
412*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const obmc_prediction,
413*09537850SAkhilesh Sanikop const ptrdiff_t obmc_prediction_stride) {
414*09537850SAkhilesh Sanikop auto* pred = static_cast<uint16_t*>(prediction);
415*09537850SAkhilesh Sanikop const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
416*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
417*09537850SAkhilesh Sanikop const ptrdiff_t obmc_pred_stride =
418*09537850SAkhilesh Sanikop obmc_prediction_stride / sizeof(obmc_pred[0]);
419*09537850SAkhilesh Sanikop assert(width >= 2);
420*09537850SAkhilesh Sanikop assert(height >= 4);
421*09537850SAkhilesh Sanikop
422*09537850SAkhilesh Sanikop if (width == 2) {
423*09537850SAkhilesh Sanikop OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred);
424*09537850SAkhilesh Sanikop return;
425*09537850SAkhilesh Sanikop }
426*09537850SAkhilesh Sanikop if (width == 4) {
427*09537850SAkhilesh Sanikop OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
428*09537850SAkhilesh Sanikop return;
429*09537850SAkhilesh Sanikop }
430*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi8(64);
431*09537850SAkhilesh Sanikop const uint8_t* mask = kObmcMask + width - 2;
432*09537850SAkhilesh Sanikop int x = 0;
433*09537850SAkhilesh Sanikop do {
434*09537850SAkhilesh Sanikop pred = static_cast<uint16_t*>(prediction) + x;
435*09537850SAkhilesh Sanikop obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x;
436*09537850SAkhilesh Sanikop const __m128i mask_val = LoadLo8(mask + x);
437*09537850SAkhilesh Sanikop // 64 - mask
438*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
439*09537850SAkhilesh Sanikop const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
440*09537850SAkhilesh Sanikop const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
441*09537850SAkhilesh Sanikop const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
442*09537850SAkhilesh Sanikop int y = height;
443*09537850SAkhilesh Sanikop do {
444*09537850SAkhilesh Sanikop const __m128i pred_val = LoadUnaligned16(pred);
445*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
446*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
447*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
448*09537850SAkhilesh Sanikop const __m128i result_lo = RightShiftWithRounding_U32(
449*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
450*09537850SAkhilesh Sanikop const __m128i result_hi = RightShiftWithRounding_U32(
451*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
452*09537850SAkhilesh Sanikop StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi));
453*09537850SAkhilesh Sanikop
454*09537850SAkhilesh Sanikop pred += pred_stride;
455*09537850SAkhilesh Sanikop obmc_pred += obmc_pred_stride;
456*09537850SAkhilesh Sanikop } while (--y != 0);
457*09537850SAkhilesh Sanikop x += 8;
458*09537850SAkhilesh Sanikop } while (x < width);
459*09537850SAkhilesh Sanikop }
460*09537850SAkhilesh Sanikop
OverlapBlendFromTop4xH_SSE4_1(uint16_t * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride,const int height,const uint16_t * LIBGAV1_RESTRICT const obmc_prediction)461*09537850SAkhilesh Sanikop inline void OverlapBlendFromTop4xH_SSE4_1(
462*09537850SAkhilesh Sanikop uint16_t* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride,
463*09537850SAkhilesh Sanikop const int height, const uint16_t* LIBGAV1_RESTRICT const obmc_prediction) {
464*09537850SAkhilesh Sanikop constexpr int obmc_pred_stride = 4;
465*09537850SAkhilesh Sanikop uint16_t* pred = prediction;
466*09537850SAkhilesh Sanikop const uint16_t* obmc_pred = obmc_prediction;
467*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi16(64);
468*09537850SAkhilesh Sanikop const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
469*09537850SAkhilesh Sanikop const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
470*09537850SAkhilesh Sanikop const uint8_t* mask = kObmcMask + height - 2;
471*09537850SAkhilesh Sanikop const int compute_height = height - (height >> 2);
472*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride2 = pred_stride << 1;
473*09537850SAkhilesh Sanikop const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
474*09537850SAkhilesh Sanikop int y = 0;
475*09537850SAkhilesh Sanikop do {
476*09537850SAkhilesh Sanikop // First mask in the first half, second mask in the second half.
477*09537850SAkhilesh Sanikop const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
478*09537850SAkhilesh Sanikop const __m128i masks =
479*09537850SAkhilesh Sanikop _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
480*09537850SAkhilesh Sanikop const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
481*09537850SAkhilesh Sanikop const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
482*09537850SAkhilesh Sanikop
483*09537850SAkhilesh Sanikop const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
484*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
485*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
486*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
487*09537850SAkhilesh Sanikop const __m128i result_lo = RightShiftWithRounding_U32(
488*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
489*09537850SAkhilesh Sanikop const __m128i result_hi = RightShiftWithRounding_U32(
490*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
491*09537850SAkhilesh Sanikop const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
492*09537850SAkhilesh Sanikop
493*09537850SAkhilesh Sanikop StoreLo8(pred, packed_result);
494*09537850SAkhilesh Sanikop StoreHi8(pred + pred_stride, packed_result);
495*09537850SAkhilesh Sanikop pred += pred_stride2;
496*09537850SAkhilesh Sanikop obmc_pred += obmc_pred_stride2;
497*09537850SAkhilesh Sanikop y += 2;
498*09537850SAkhilesh Sanikop } while (y < compute_height);
499*09537850SAkhilesh Sanikop }
500*09537850SAkhilesh Sanikop
OverlapBlendFromTop10bpp_SSE4_1(void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t prediction_stride,const int width,const int height,const void * LIBGAV1_RESTRICT const obmc_prediction,const ptrdiff_t obmc_prediction_stride)501*09537850SAkhilesh Sanikop void OverlapBlendFromTop10bpp_SSE4_1(
502*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t prediction_stride,
503*09537850SAkhilesh Sanikop const int width, const int height,
504*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const obmc_prediction,
505*09537850SAkhilesh Sanikop const ptrdiff_t obmc_prediction_stride) {
506*09537850SAkhilesh Sanikop auto* pred = static_cast<uint16_t*>(prediction);
507*09537850SAkhilesh Sanikop const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
508*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
509*09537850SAkhilesh Sanikop const ptrdiff_t obmc_pred_stride =
510*09537850SAkhilesh Sanikop obmc_prediction_stride / sizeof(obmc_pred[0]);
511*09537850SAkhilesh Sanikop assert(width >= 4);
512*09537850SAkhilesh Sanikop assert(height >= 2);
513*09537850SAkhilesh Sanikop
514*09537850SAkhilesh Sanikop if (width == 4) {
515*09537850SAkhilesh Sanikop OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred);
516*09537850SAkhilesh Sanikop return;
517*09537850SAkhilesh Sanikop }
518*09537850SAkhilesh Sanikop
519*09537850SAkhilesh Sanikop const __m128i mask_inverter = _mm_set1_epi8(64);
520*09537850SAkhilesh Sanikop const int compute_height = height - (height >> 2);
521*09537850SAkhilesh Sanikop const uint8_t* mask = kObmcMask + height - 2;
522*09537850SAkhilesh Sanikop pred = static_cast<uint16_t*>(prediction);
523*09537850SAkhilesh Sanikop obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
524*09537850SAkhilesh Sanikop int y = 0;
525*09537850SAkhilesh Sanikop do {
526*09537850SAkhilesh Sanikop const __m128i mask_val = _mm_set1_epi8(mask[y]);
527*09537850SAkhilesh Sanikop // 64 - mask
528*09537850SAkhilesh Sanikop const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
529*09537850SAkhilesh Sanikop const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
530*09537850SAkhilesh Sanikop const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
531*09537850SAkhilesh Sanikop const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
532*09537850SAkhilesh Sanikop int x = 0;
533*09537850SAkhilesh Sanikop do {
534*09537850SAkhilesh Sanikop const __m128i pred_val = LoadUnaligned16(pred + x);
535*09537850SAkhilesh Sanikop const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
536*09537850SAkhilesh Sanikop const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
537*09537850SAkhilesh Sanikop const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
538*09537850SAkhilesh Sanikop const __m128i result_lo = RightShiftWithRounding_U32(
539*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
540*09537850SAkhilesh Sanikop const __m128i result_hi = RightShiftWithRounding_U32(
541*09537850SAkhilesh Sanikop _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
542*09537850SAkhilesh Sanikop StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi));
543*09537850SAkhilesh Sanikop x += 8;
544*09537850SAkhilesh Sanikop } while (x < width);
545*09537850SAkhilesh Sanikop pred += pred_stride;
546*09537850SAkhilesh Sanikop obmc_pred += obmc_pred_stride;
547*09537850SAkhilesh Sanikop } while (++y < compute_height);
548*09537850SAkhilesh Sanikop }
549*09537850SAkhilesh Sanikop
Init10bpp()550*09537850SAkhilesh Sanikop void Init10bpp() {
551*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
552*09537850SAkhilesh Sanikop assert(dsp != nullptr);
553*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical)
554*09537850SAkhilesh Sanikop dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1;
555*09537850SAkhilesh Sanikop #endif
556*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal)
557*09537850SAkhilesh Sanikop dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1;
558*09537850SAkhilesh Sanikop #endif
559*09537850SAkhilesh Sanikop }
560*09537850SAkhilesh Sanikop
561*09537850SAkhilesh Sanikop } // namespace
562*09537850SAkhilesh Sanikop } // namespace high_bitdepth
563*09537850SAkhilesh Sanikop #endif // LIBGAV1_MAX_BITDEPTH >= 10
564*09537850SAkhilesh Sanikop
ObmcInit_SSE4_1()565*09537850SAkhilesh Sanikop void ObmcInit_SSE4_1() {
566*09537850SAkhilesh Sanikop low_bitdepth::Init8bpp();
567*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
568*09537850SAkhilesh Sanikop high_bitdepth::Init10bpp();
569*09537850SAkhilesh Sanikop #endif // LIBGAV1_MAX_BITDEPTH >= 10
570*09537850SAkhilesh Sanikop }
571*09537850SAkhilesh Sanikop
572*09537850SAkhilesh Sanikop } // namespace dsp
573*09537850SAkhilesh Sanikop } // namespace libgav1
574*09537850SAkhilesh Sanikop
575*09537850SAkhilesh Sanikop #else // !LIBGAV1_TARGETING_SSE4_1
576*09537850SAkhilesh Sanikop
577*09537850SAkhilesh Sanikop namespace libgav1 {
578*09537850SAkhilesh Sanikop namespace dsp {
579*09537850SAkhilesh Sanikop
// Stub used when the build does not target SSE4.1: keeps the symbol defined
// for callers but installs nothing.
void ObmcInit_SSE4_1() {
  // Intentionally empty.
}
581*09537850SAkhilesh Sanikop
582*09537850SAkhilesh Sanikop } // namespace dsp
583*09537850SAkhilesh Sanikop } // namespace libgav1
584*09537850SAkhilesh Sanikop #endif // LIBGAV1_TARGETING_SSE4_1
585