// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1

#include <xmmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Note these constants are duplicated from intrapred.cc to allow the compiler
// to have visibility of the values. This helps reduce loads and aids in the
// creation of the inverse weights.
constexpr uint8_t kSmoothWeights[] = {
#include "src/dsp/smooth_weights.inc"
};
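// The weights for block dimensions 4, 8, 16, 32 and 64 are stored
// consecutively, so the table for dimension n begins at kSmoothWeights + n - 4
// (hence the offsets 4, 12 and 28 used below; 44 addresses the upper half of
// the 32-entry table).
//
// As a reference for the vector code below, a scalar sketch of the SMOOTH
// predictor at this 8-bit weight scale of 256 (illustrative pseudocode, not
// part of the library):
//   pred[y][x] = RightShiftWithRounding(
//       weights[y] * top[x] + (256 - weights[y]) * bottom_left +
//       weights[x] * left[y] + (256 - weights[x]) * top_right, 9);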

template <int y_mask>
inline void WriteSmoothHorizontalSum4(void* LIBGAV1_RESTRICT const dest,
                                      const __m128i& left,
                                      const __m128i& weights,
                                      const __m128i& scaled_top_right,
                                      const __m128i& round) {
  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
  const __m128i weighted_left_y = _mm_mullo_epi16(left_y, weights);
  const __m128i pred_sum = _mm_add_epi32(scaled_top_right, weighted_left_y);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}

// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
// |pixels| is a segment of the top row or the whole top row, and |weights| is
// repeated.
inline __m128i SmoothDirectionalSum8(const __m128i& pixels,
                                     const __m128i& weights,
                                     const __m128i& scaled_corner) {
  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
  return _mm_add_epi16(scaled_corner, weighted_px);
}

inline void WriteSmoothDirectionalSum8(uint8_t* LIBGAV1_RESTRICT dest,
                                       const __m128i& pixels,
                                       const __m128i& weights,
                                       const __m128i& scaled_corner,
                                       const __m128i& round) {
  const __m128i pred_sum =
      SmoothDirectionalSum8(pixels, weights, scaled_corner);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, round), 8);
  StoreLo8(dest, _mm_packus_epi16(pred, pred));
}

// For Horizontal, pixels1 and pixels2 are the same repeated value. For
// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
// scaled_corner2 are the same.
inline void WriteSmoothDirectionalSum16(
    uint8_t* LIBGAV1_RESTRICT dest, const __m128i& pixels1,
    const __m128i& pixels2, const __m128i& weights1, const __m128i& weights2,
    const __m128i& scaled_corner1, const __m128i& scaled_corner2,
    const __m128i& round) {
  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
  StoreUnaligned16(dest, _mm_packus_epi16(pred1, pred2));
}

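// Computes four SMOOTH pixels for the row selected by |y_mask|: the vertical
// term weights_y[y] * top + (256 - weights_y[y]) * bottom_left plus the
// horizontal term weights_x * left[y] + (256 - weights_x) * top_right,
// rounded, shifted by 9 and packed to bytes.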
template <int y_mask>
inline void WriteSmoothPredSum4(uint8_t* LIBGAV1_RESTRICT const dest,
                                const __m128i& top, const __m128i& left,
                                const __m128i& weights_x,
                                const __m128i& weights_y,
                                const __m128i& scaled_bottom_left,
                                const __m128i& scaled_top_right,
                                const __m128i& round) {
  const __m128i left_y = _mm_shuffle_epi32(left, y_mask);
  const __m128i weighted_left_y = _mm_mullo_epi32(left_y, weights_x);
  const __m128i weight_y = _mm_shuffle_epi32(weights_y, y_mask);
  const __m128i weighted_top = _mm_mullo_epi32(weight_y, top);
  const __m128i scaled_bottom_left_y =
      _mm_shuffle_epi32(scaled_bottom_left, y_mask);
  const __m128i col_pred = _mm_add_epi32(scaled_bottom_left_y, weighted_left_y);
  const __m128i row_pred = _mm_add_epi32(scaled_top_right, weighted_top);
  const __m128i pred_sum = _mm_add_epi32(row_pred, col_pred);

  // Equivalent to RightShiftWithRounding(pred[x][y], 9).
  const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 9);

  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}

// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
inline void LoadSmoothPixels4(const uint8_t* LIBGAV1_RESTRICT above,
                              const uint8_t* LIBGAV1_RESTRICT left,
                              const int height, __m128i* pixels) {
  if (height == 4) {
    pixels[1] = Load4(left);
  } else if (height == 8) {
    pixels[1] = LoadLo8(left);
  } else {
    pixels[1] = LoadUnaligned16(left);
  }

  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
  const __m128i top = _mm_cvtepu8_epi16(Load4(above));
  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
  pixels[2] = _mm_set1_epi16(above[3]);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
inline void LoadSmoothWeights4(const uint8_t* LIBGAV1_RESTRICT weight_array,
                               const int height, __m128i* weight_h,
                               __m128i* weight_w) {
  const __m128i scale = _mm_set1_epi16(256);
  const __m128i x_weights = Load4(weight_array);
  weight_h[0] = _mm_cvtepu8_epi16(x_weights);
  weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i y_weights = LoadLo8(weight_array + 4);
    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
  } else if (height == 16) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i y_weights = LoadUnaligned16(weight_array + 12);
    weight_h[0] = _mm_cvtepu8_epi16(y_weights);
    weight_h[1] = _mm_sub_epi16(scale, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(y_weights, zero);
    weight_h[3] = _mm_sub_epi16(scale, weight_h[2]);
  }
}

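// Writes a 4x8 block. |pixel[0]| interleaves the top row with the repeated
// bottom-left pixel, so one _mm_madd_epi16 against the interleaved
// (weight_y, 256 - weight_y) pair yields the 32-bit vertical term per pixel;
// the horizontal term comes from (left[y], top_right) madded with
// |weight_x[0]| in the same way.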
inline void WriteSmoothPred4x8(const __m128i* pixel, const __m128i* weights_y,
                               const __m128i* weight_x,
                               uint8_t* LIBGAV1_RESTRICT dst,
                               const ptrdiff_t stride,
                               const bool use_second_half) {
  const __m128i round = _mm_set1_epi32(256);
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  const __m128i zero = _mm_setzero_si128();
  const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixel[1], zero)
                                       : _mm_unpacklo_epi8(pixel[1], zero);
  __m128i y_select = _mm_set1_epi16(0x0100);

  for (int i = 0; i < 8; ++i) {
    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
    const __m128i interleaved_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    __m128i vertical_pred = _mm_madd_epi16(pixel[0], interleaved_weights);

    __m128i horizontal_vect = _mm_shuffle_epi8(left, y_select);
    horizontal_vect = _mm_unpacklo_epi16(horizontal_vect, pixel[2]);
    __m128i sum = _mm_madd_epi16(horizontal_vect, weight_x[0]);

    sum = _mm_add_epi32(vertical_pred, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 9);

    sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
    Store4(dst, sum);
    dst += stride;

    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}

// The interleaving approach has some overhead that causes it to underperform
// in the 4x4 case.
void Smooth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  const __m128i scale = _mm_set1_epi32(256);
  // Fourth short is top_row[3].
  const __m128i top_right = _mm_shuffle_epi32(top, 0xFF);
  // Fourth short is left_column[3].
  const __m128i bottom_left = _mm_shuffle_epi32(left, 0xFF);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  auto* dst = static_cast<uint8_t*>(dest);
  // AV1 spec 7.11.2.6 (3) describes the sum:
  // smoothPred[y][x:x+3] = weighted_top + scaled_right + weighted_left[y] +
  //                        scaled_bottom[y]
  // This could be a loop, but for the immediate value in the shuffles.
  WriteSmoothPredSum4<0>(dst, top, left, weights, weights, scaled_bottom_left,
                         scaled_top_right, scale);
  dst += stride;
  WriteSmoothPredSum4<0x55>(dst, top, left, weights, weights,
                            scaled_bottom_left, scaled_top_right, scale);
  dst += stride;
  WriteSmoothPredSum4<0xAA>(dst, top, left, weights, weights,
                            scaled_bottom_left, scaled_top_right, scale);
  dst += stride;
  WriteSmoothPredSum4<0xFF>(dst, top, left, weights, weights,
                            scaled_bottom_left, scaled_top_right, scale);
}

void Smooth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i weights_x[1];
  __m128i weights_y[2];
  LoadSmoothWeights4(kSmoothWeights, 8, weights_y, weights_x);
  __m128i pixels[3];
  LoadSmoothPixels4(top_ptr, left_ptr, 8, pixels);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
}

void Smooth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT top_row,
                       const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i weights_x[1];
  __m128i weights_y[4];
  LoadSmoothWeights4(kSmoothWeights, 16, weights_y, weights_x);
  __m128i pixels[3];
  LoadSmoothPixels4(top_ptr, left_ptr, 16, pixels);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred4x8(pixels, weights_y, weights_x, dst, stride, false);
  dst += stride << 3;
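  // The second eight rows use the high half of the left vector and the second
  // pair of height weights.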
  WriteSmoothPred4x8(pixels, &weights_y[2], weights_x, dst, stride, true);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
inline void LoadSmoothPixels8(const uint8_t* LIBGAV1_RESTRICT above,
                              const uint8_t* LIBGAV1_RESTRICT left,
                              const int height, __m128i* pixels) {
  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
  __m128i top_row = _mm_cvtepu8_epi16(LoadLo8(above));
  pixels[0] = _mm_unpacklo_epi16(top_row, bottom_left);
  pixels[1] = _mm_unpackhi_epi16(top_row, bottom_left);

  pixels[3] = _mm_set1_epi16(above[7]);

  if (height == 4) {
    pixels[2] = Load4(left);
  } else if (height == 8) {
    pixels[2] = LoadLo8(left);
  } else if (height == 16) {
    pixels[2] = LoadUnaligned16(left);
  } else {
    pixels[2] = LoadUnaligned16(left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = LoadUnaligned16(left + 16);
    pixels[7] = pixels[3];
  }
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
inline void LoadSmoothWeights8(const uint8_t* LIBGAV1_RESTRICT weight_array,
                               const int height, __m128i* weight_w,
                               __m128i* weight_h) {
  const int offset = (height < 8) ? 0 : 4;
  __m128i loaded_weights = LoadUnaligned16(&weight_array[offset]);
  weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
  const __m128i inverter = _mm_set1_epi16(256);
  weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);

  if (height == 4) {
    loaded_weights = _mm_srli_si128(loaded_weights, 4);
    __m128i weights_x = _mm_cvtepu8_epi16(loaded_weights);
    __m128i inverted_weights_x = _mm_sub_epi16(inverter, weights_x);
    weight_w[0] = _mm_unpacklo_epi16(weights_x, inverted_weights_x);
    weight_w[1] = _mm_unpackhi_epi16(weights_x, inverted_weights_x);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    const __m128i zero = _mm_setzero_si128();
    loaded_weights = LoadUnaligned16(weight_array + 12);
    weight_h[0] = _mm_cvtepu8_epi16(loaded_weights);
    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(loaded_weights, zero);
    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
  } else if (height == 32) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i weight_lo = LoadUnaligned16(weight_array + 28);
    weight_h[0] = _mm_cvtepu8_epi16(weight_lo);
    weight_h[1] = _mm_sub_epi16(inverter, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(inverter, weight_h[2]);
    const __m128i weight_hi = LoadUnaligned16(weight_array + 44);
    weight_h[4] = _mm_cvtepu8_epi16(weight_hi);
    weight_h[5] = _mm_sub_epi16(inverter, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(inverter, weight_h[6]);
  }
}

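// The 8-wide counterpart of WriteSmoothPred4x8: two madd accumulators cover
// the eight columns, and |y_select| steps through the per-row height weights
// one byte pair at a time.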
inline void WriteSmoothPred8xH(const __m128i* pixels, const __m128i* weights_x,
                               const __m128i* weights_y, const int height,
                               uint8_t* LIBGAV1_RESTRICT dst,
                               const ptrdiff_t stride,
                               const bool use_second_half) {
  const __m128i round = _mm_set1_epi32(256);
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  const __m128i cvt_epu16_epi8 = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  const __m128i zero = _mm_setzero_si128();
  const __m128i left = use_second_half ? _mm_unpackhi_epi8(pixels[2], zero)
                                       : _mm_unpacklo_epi8(pixels[2], zero);
  __m128i y_select = _mm_set1_epi16(0x100);

  for (int i = 0; i < height; ++i) {
    const __m128i weight_y = _mm_shuffle_epi8(weights_y[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weights_y[1], y_select);
    const __m128i interleaved_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    const __m128i vertical_sum0 =
        _mm_madd_epi16(pixels[0], interleaved_weights);
    const __m128i vertical_sum1 =
        _mm_madd_epi16(pixels[1], interleaved_weights);

    __m128i horizontal_pixels = _mm_shuffle_epi8(left, y_select);
    horizontal_pixels = _mm_unpacklo_epi16(horizontal_pixels, pixels[3]);
    const __m128i horizontal_sum0 =
        _mm_madd_epi16(horizontal_pixels, weights_x[0]);
    const __m128i horizontal_sum1 =
        _mm_madd_epi16(horizontal_pixels, weights_x[1]);

    __m128i sum0 = _mm_add_epi32(vertical_sum0, horizontal_sum0);
    sum0 = _mm_add_epi32(sum0, round);
    sum0 = _mm_srai_epi32(sum0, 9);

    __m128i sum1 = _mm_add_epi32(vertical_sum1, horizontal_sum1);
    sum1 = _mm_add_epi32(sum1, round);
    sum1 = _mm_srai_epi32(sum1, 9);

    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, cvt_epu16_epi8);
    StoreLo8(dst, sum0);
    dst += stride;

    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}

void Smooth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i pixels[4];
  LoadSmoothPixels8(top_ptr, left_ptr, 4, pixels);

  __m128i weights_x[2], weights_y[2];
  LoadSmoothWeights8(kSmoothWeights, 4, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 4, dst, stride, false);
}

void Smooth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT top_row,
                      const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);

  __m128i pixels[4];
  LoadSmoothPixels8(top_ptr, left_ptr, 8, pixels);

  __m128i weights_x[2], weights_y[2];
  LoadSmoothWeights8(kSmoothWeights, 8, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
}

void Smooth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT top_row,
                       const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i pixels[4];
  LoadSmoothPixels8(top_ptr, left_ptr, 16, pixels);

  __m128i weights_x[2], weights_y[4];
  LoadSmoothWeights8(kSmoothWeights, 16, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
  dst += stride << 3;
  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
}

void Smooth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                       const ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT top_row,
                       const void* LIBGAV1_RESTRICT left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  __m128i pixels[8];
  LoadSmoothPixels8(top_ptr, left_ptr, 32, pixels);

  __m128i weights_x[2], weights_y[8];
  LoadSmoothWeights8(kSmoothWeights, 32, weights_x, weights_y);

  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothPred8xH(pixels, weights_x, weights_y, 8, dst, stride, false);
  dst += stride << 3;
  WriteSmoothPred8xH(pixels, weights_x, &weights_y[2], 8, dst, stride, true);
  dst += stride << 3;
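  // Rows 16 through 31 use the second copies of the interleaved pixels and
  // the height weights loaded for the lower half of the block.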
  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[4], 8, dst, stride,
                     false);
  dst += stride << 3;
  WriteSmoothPred8xH(&pixels[4], weights_x, &weights_y[6], 8, dst, stride,
                     true);
}

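// Generic SMOOTH predictor for the remaining block shapes; |width| must be a
// multiple of 8. Works one row of |left| at a time and eight columns of |top|
// per inner iteration.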
template <int width, int height>
void SmoothWxH(void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
               const void* LIBGAV1_RESTRICT const top_row,
               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const uint8_t* const sm_weights_h = kSmoothWeights + height - 4;
  const uint8_t* const sm_weights_w = kSmoothWeights + width - 4;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value = _mm_set1_epi16(256);
  const __m128i bottom_left = _mm_cvtsi32_si128(left_ptr[height - 1]);
  const __m128i top_right = _mm_set1_epi16(top_ptr[width - 1]);
  const __m128i round = _mm_set1_epi32(256);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < height; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128(left_ptr[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i scaled_bottom_left =
        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i weight_left_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
    for (int x = 0; x < width; x += 8) {
      const __m128i top_x = LoadLo8(top_ptr + x);
      const __m128i weights_x = LoadLo8(sm_weights_w + x);
      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i top_weights_x_lo = _mm_cvtepu8_epi16(top_weights_x);
      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);

      // Here opposite weights and pixels are multiplied, where the order of
      // interleaving is indicated in the names.
      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);

      // |scaled_bottom_left| is always scaled by the same weight each row, so
      // we only derive |scaled_top_right| values here.
      const __m128i inverted_weights_x =
          _mm_sub_epi16(scale_value, _mm_cvtepu8_epi16(weights_x));
      const __m128i scaled_top_right =
          _mm_mullo_epi16(inverted_weights_x, top_right);
      const __m128i scaled_top_right_lo = _mm_cvtepu16_epi32(scaled_top_right);
      const __m128i scaled_top_right_hi =
          _mm_unpackhi_epi16(scaled_top_right, zero);
      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);

      // The round value for RightShiftWithRounding was added with
      // |scaled_bottom_left|.
      pred_lo = _mm_srli_epi32(pred_lo, 9);
      pred_hi = _mm_srli_epi32(pred_hi, 9);
      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
    }
    dst += stride;
  }
}

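// SMOOTH_H drops the vertical term: at the same weight scale,
//   pred[y][x] = RightShiftWithRounding(
//       weights[x] * left[y] + (256 - weights[x]) * top_right, 8);
// so the rounding constant falls to 128 and the shift to 8.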
void SmoothHorizontal4x4_SSE4_1(void* LIBGAV1_RESTRICT dest,
                                const ptrdiff_t stride,
                                const void* LIBGAV1_RESTRICT top_row,
                                const void* LIBGAV1_RESTRICT left_column) {
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi32(top_ptr[3]);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left = _mm_cvtepu8_epi32(Load4(left_ptr));
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi32(256);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi32(128);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}

void SmoothHorizontal4x8_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi32(top[3]);
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi32(256);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi32(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}

void SmoothHorizontal4x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi32(top[3]);
  const __m128i weights = _mm_cvtepu8_epi32(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi32(256);
  const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi32(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 4));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 8));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
  dst += stride;

  left = _mm_cvtepu8_epi32(Load4(left_ptr + 12));
  WriteSmoothHorizontalSum4<0>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0x55>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xAA>(dst, left, weights, scaled_top_right, scale);
  dst += stride;
  WriteSmoothHorizontalSum4<0xFF>(dst, left, weights, scaled_top_right, scale);
}

void SmoothHorizontal8x4_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
  __m128i y_select = _mm_set1_epi32(0x01000100);
  __m128i left_y = _mm_shuffle_epi8(left, y_select);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  left_y = _mm_shuffle_epi8(left, y_select);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  left_y = _mm_shuffle_epi8(left, y_select);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  left_y = _mm_shuffle_epi8(left, y_select);
  WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
}

void SmoothHorizontal8x8_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
  auto* dst = static_cast<uint8_t*>(dest);
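  // _mm_set1_epi32(0x01000100) replicates byte pair {0, 1}, which shuffles the
  // first 16-bit left value into every lane; adding 0x02020202 advances the
  // pair by one row per iteration, so the loop covers rows 0 through 7.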
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
}

void SmoothHorizontal8x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
}

void SmoothHorizontal8x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[7]);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum8(dst, left_y, weights, scaled_top_right, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x4_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  __m128i y_mask = _mm_set1_epi32(0x01000100);
  __m128i left_y = _mm_shuffle_epi8(left, y_mask);
  auto* dst = static_cast<uint8_t*>(dest);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
  dst += stride;
  y_mask = _mm_set1_epi32(0x03020302);
  left_y = _mm_shuffle_epi8(left, y_mask);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
  dst += stride;
  y_mask = _mm_set1_epi32(0x05040504);
  left_y = _mm_shuffle_epi8(left, y_mask);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
  dst += stride;
  y_mask = _mm_set1_epi32(0x07060706);
  left_y = _mm_shuffle_epi8(left, y_mask);
  WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                              scaled_top_right1, scaled_top_right2, scale);
}

void SmoothHorizontal16x8_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    dst += stride;
  }
}

void SmoothHorizontal16x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[15]);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                  scaled_top_right1, scaled_top_right2, scale);
      dst += stride;
    }
  }
}

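// The 32-wide variants write each row as two 16-pixel halves, using a second
// set of width weights and scaled corner values for columns 16 through 31.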
SmoothHorizontal32x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)962 void SmoothHorizontal32x8_SSE4_1(
963 void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
964 const void* LIBGAV1_RESTRICT const top_row,
965 const void* LIBGAV1_RESTRICT const left_column) {
966 const auto* const top = static_cast<const uint8_t*>(top_row);
967 const __m128i top_right = _mm_set1_epi16(top[31]);
968 const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
969 const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
970 const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
971 __m128i scale = _mm_set1_epi16(256);
972 const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
973 const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
974 const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
975 const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
976 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
977 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
978 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
979 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
980 const __m128i scaled_top_right1 =
981 _mm_mullo_epi16(inverted_weights1, top_right);
982 const __m128i scaled_top_right2 =
983 _mm_mullo_epi16(inverted_weights2, top_right);
984 const __m128i scaled_top_right3 =
985 _mm_mullo_epi16(inverted_weights3, top_right);
986 const __m128i scaled_top_right4 =
987 _mm_mullo_epi16(inverted_weights4, top_right);
988 scale = _mm_set1_epi16(128);
989 auto* dst = static_cast<uint8_t*>(dest);
990 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
991 __m128i y_select = _mm_set1_epi32(y_mask);
992 __m128i left_y = _mm_shuffle_epi8(left, y_select);
993 WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
994 scaled_top_right1, scaled_top_right2, scale);
995 WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
996 scaled_top_right3, scaled_top_right4, scale);
997 dst += stride;
998 }
999 }
1000
SmoothHorizontal32x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1001 void SmoothHorizontal32x16_SSE4_1(
1002 void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
1003 const void* LIBGAV1_RESTRICT const top_row,
1004 const void* LIBGAV1_RESTRICT const left_column) {
1005 const auto* const top = static_cast<const uint8_t*>(top_row);
1006 const __m128i top_right = _mm_set1_epi16(top[31]);
1007 const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
1008 const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
1009 const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
1010 __m128i scale = _mm_set1_epi16(256);
1011 const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
1012 const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
1013 const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
1014 const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
1015 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1016 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1017 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1018 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1019 const __m128i scaled_top_right1 =
1020 _mm_mullo_epi16(inverted_weights1, top_right);
1021 const __m128i scaled_top_right2 =
1022 _mm_mullo_epi16(inverted_weights2, top_right);
1023 const __m128i scaled_top_right3 =
1024 _mm_mullo_epi16(inverted_weights3, top_right);
1025 const __m128i scaled_top_right4 =
1026 _mm_mullo_epi16(inverted_weights4, top_right);
1027 scale = _mm_set1_epi16(128);
1028 auto* dst = static_cast<uint8_t*>(dest);
1029 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1030 __m128i y_select = _mm_set1_epi32(y_mask);
1031 __m128i left_y = _mm_shuffle_epi8(left1, y_select);
1032 WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1033 scaled_top_right1, scaled_top_right2, scale);
1034 WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1035 scaled_top_right3, scaled_top_right4, scale);
1036 dst += stride;
1037 }
1038 const __m128i left2 =
1039 _mm_cvtepu8_epi16(LoadLo8(static_cast<const uint8_t*>(left_column) + 8));
1040 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1041 __m128i y_select = _mm_set1_epi32(y_mask);
1042 __m128i left_y = _mm_shuffle_epi8(left2, y_select);
1043 WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
1044 scaled_top_right1, scaled_top_right2, scale);
1045 WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
1046 scaled_top_right3, scaled_top_right4, scale);
1047 dst += stride;
1048 }
1049 }
1050
void SmoothHorizontal32x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[31]);
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    dst += stride;
  }
  left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    dst += stride;
  }
}

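// Same computation as 32x32, but the left column is consumed in a loop, eight
// rows per iteration, instead of being unrolled.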
void SmoothHorizontal32x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[31]);
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                  scaled_top_right1, scaled_top_right2, scale);
      WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                  scaled_top_right3, scaled_top_right4, scale);
      dst += stride;
    }
  }
}

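// 64-wide blocks need eight weight vectors (and eight scaled top-right
// vectors) to cover the row; each row is written as four 16-pixel stores.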
void SmoothHorizontal64x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[63]);
  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
                                scaled_top_right5, scaled_top_right6, scale);
    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
                                scaled_top_right7, scaled_top_right8, scale);
    dst += stride;
  }
  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
                                scaled_top_right5, scaled_top_right6, scale);
    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
                                scaled_top_right7, scaled_top_right8, scale);
    dst += stride;
  }
}

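// As 64x16, but the 32-row left column is consumed in four unrolled 8-row
// passes.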
void SmoothHorizontal64x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[63]);
  const __m128i left1 = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  scale = _mm_set1_epi16(128);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
                                scaled_top_right5, scaled_top_right6, scale);
    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
                                scaled_top_right7, scaled_top_right8, scale);
    dst += stride;
  }
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left2 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
                                scaled_top_right5, scaled_top_right6, scale);
    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
                                scaled_top_right7, scaled_top_right8, scale);
    dst += stride;
  }
  const __m128i left3 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 16));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
                                scaled_top_right5, scaled_top_right6, scale);
    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
                                scaled_top_right7, scaled_top_right8, scale);
    dst += stride;
  }
  const __m128i left4 = _mm_cvtepu8_epi16(LoadLo8(left_ptr + 24));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
    WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                scaled_top_right1, scaled_top_right2, scale);
    WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                scaled_top_right3, scaled_top_right4, scale);
    WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
                                scaled_top_right5, scaled_top_right6, scale);
    WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
                                scaled_top_right7, scaled_top_right8, scale);
    dst += stride;
  }
}

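// As above, but looping over the 64-row left column in 8-row chunks.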
void SmoothHorizontal64x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const top = static_cast<const uint8_t*>(top_row);
  const __m128i top_right = _mm_set1_epi16(top[63]);
  const __m128i weights_lolo = LoadUnaligned16(kSmoothWeights + 60);
  const __m128i weights_lohi = LoadUnaligned16(kSmoothWeights + 76);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lolo);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_lohi);
  const __m128i weights4 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_top_right1 =
      _mm_mullo_epi16(inverted_weights1, top_right);
  const __m128i scaled_top_right2 =
      _mm_mullo_epi16(inverted_weights2, top_right);
  const __m128i scaled_top_right3 =
      _mm_mullo_epi16(inverted_weights3, top_right);
  const __m128i scaled_top_right4 =
      _mm_mullo_epi16(inverted_weights4, top_right);
  const __m128i weights_hilo = LoadUnaligned16(kSmoothWeights + 92);
  const __m128i weights_hihi = LoadUnaligned16(kSmoothWeights + 108);
  const __m128i weights5 = _mm_cvtepu8_epi16(weights_hilo);
  const __m128i weights6 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
  const __m128i weights7 = _mm_cvtepu8_epi16(weights_hihi);
  const __m128i weights8 = _mm_cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
  const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
  const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
  const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
  const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
  const __m128i scaled_top_right5 =
      _mm_mullo_epi16(inverted_weights5, top_right);
  const __m128i scaled_top_right6 =
      _mm_mullo_epi16(inverted_weights6, top_right);
  const __m128i scaled_top_right7 =
      _mm_mullo_epi16(inverted_weights7, top_right);
  const __m128i scaled_top_right8 =
      _mm_mullo_epi16(inverted_weights8, top_right);
  scale = _mm_set1_epi16(128);
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  for (int left_offset = 0; left_offset < 64; left_offset += 8) {
    const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_ptr + left_offset));
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i left_y = _mm_shuffle_epi8(left, y_select);
      WriteSmoothDirectionalSum16(dst, left_y, left_y, weights1, weights2,
                                  scaled_top_right1, scaled_top_right2, scale);
      WriteSmoothDirectionalSum16(dst + 16, left_y, left_y, weights3, weights4,
                                  scaled_top_right3, scaled_top_right4, scale);
      WriteSmoothDirectionalSum16(dst + 32, left_y, left_y, weights5, weights6,
                                  scaled_top_right5, scaled_top_right6, scale);
      WriteSmoothDirectionalSum16(dst + 48, left_y, left_y, weights7, weights8,
                                  scaled_top_right7, scaled_top_right8, scale);
      dst += stride;
    }
  }
}

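// Scalar equivalent of the SMOOTH_V predictors below:
//   pred[x][y] = RightShiftWithRounding(
//       top[x] * weights[y] + bottom_left * (256 - weights[y]), 8)
// For 4xH blocks the top pixels are interleaved with the bottom-left pixel so
// that a single madd computes both products for each output pixel.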
inline void LoadSmoothVerticalPixels4(const uint8_t* LIBGAV1_RESTRICT above,
                                      const uint8_t* LIBGAV1_RESTRICT left,
                                      const int height, __m128i* pixels) {
  __m128i top = Load4(above);
  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
  top = _mm_cvtepu8_epi16(top);
  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
}

// |weight_array| alternates weight vectors from the table with their inverted
// (256-w) counterparts. This is precomputed by the compiler when the weights
// table is visible to this module. Removing this visibility can cut speed by
// up to half in both 4xH and 8xH transforms.
inline void LoadSmoothVerticalWeights4(const uint8_t* LIBGAV1_RESTRICT
                                           weight_array,
                                       const int height, __m128i* weights) {
  const __m128i inverter = _mm_set1_epi16(256);

  if (height == 4) {
    const __m128i weight = Load4(weight_array);
    weights[0] = _mm_cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else if (height == 8) {
    const __m128i weight = LoadLo8(weight_array + 4);
    weights[0] = _mm_cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else {
    const __m128i weight = LoadUnaligned16(weight_array + 12);
    const __m128i zero = _mm_setzero_si128();
    weights[0] = _mm_cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(inverter, weights[2]);
  }
}

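// Writes one 4-wide row per iteration. |y_select| starts at the first 16-bit
// lane (byte indices 0x0100) and advances by 0x0202 per row to select the
// next weight lane.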
inline void WriteSmoothVertical4xH(const __m128i* pixel, const __m128i* weight,
                                   const int height,
                                   uint8_t* LIBGAV1_RESTRICT dst,
                                   const ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32(128);
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
  __m128i y_select = _mm_set1_epi16(0x0100);

  for (int y = 0; y < height; ++y) {
    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
    const __m128i alternate_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
    // The madd instruction yields four results of the form:
    // (top_row[x] * weight[y] + corner * inverted_weight[y])
    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, 8);
    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
    Store4(dst, sum);
    dst += stride;
    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}

void SmoothVertical4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              const ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const top_row,
                              const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const auto* const above = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);
  __m128i pixels;
  LoadSmoothVerticalPixels4(above, left, 4, &pixels);

  __m128i weights[2];
  LoadSmoothVerticalWeights4(kSmoothWeights, 4, weights);

  WriteSmoothVertical4xH(&pixels, weights, 4, dst, stride);
}

void SmoothVertical4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              const ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const top_row,
                              const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const auto* const above = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);
  __m128i pixels;
  LoadSmoothVerticalPixels4(above, left, 8, &pixels);

  __m128i weights[2];
  LoadSmoothVerticalWeights4(kSmoothWeights, 8, weights);

  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
}

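// 4x16 is written as two 8-row passes; the second pass uses weights[2] and
// weights[3], the upper half of the 16-entry weight table.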
void SmoothVertical4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t stride,
                               const void* LIBGAV1_RESTRICT const top_row,
                               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left = static_cast<const uint8_t*>(left_column);
  const auto* const above = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);
  __m128i pixels;
  LoadSmoothVerticalPixels4(above, left, 16, &pixels);

  __m128i weights[4];
  LoadSmoothVerticalWeights4(kSmoothWeights, 16, weights);

  WriteSmoothVertical4xH(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  WriteSmoothVertical4xH(&pixels, &weights[2], 8, dst, stride);
}

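// 8x4 is fully unrolled: each row selects its weight lane with an explicit
// |y_select| mask rather than a loop counter.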
void SmoothVertical8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              const ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const top_row,
                              const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  scale = _mm_set1_epi16(128);

  auto* dst = static_cast<uint8_t*>(dest);
  __m128i y_select = _mm_set1_epi32(0x01000100);
  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y, scale);
}

void SmoothVertical8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              const ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const top_row,
                              const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  scale = _mm_set1_epi16(128);
  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
                               scale);
    dst += stride;
  }
}

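// Two 8-row passes: |weights1| covers rows 0-7 and |weights2| rows 8-15.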
void SmoothVertical8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t stride,
                               const void* LIBGAV1_RESTRICT const top_row,
                               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);

  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_cvtepu8_epi16(_mm_srli_si128(weights, 8));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  scale = _mm_set1_epi16(128);
  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
                               scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
                               scale);
    dst += stride;
  }
}

void SmoothVertical8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t stride,
                               const void* LIBGAV1_RESTRICT const top_row,
                               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  scale = _mm_set1_epi16(128);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
                               scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
                               scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
                               scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    WriteSmoothDirectionalSum8(dst, top, weights_y, scaled_bottom_left_y,
                               scale);
    dst += stride;
  }
}

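// For 16-wide blocks the top row is split into two epi16 vectors, so each row
// is produced by a single WriteSmoothDirectionalSum16 call.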
void SmoothVertical16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t stride,
                               const void* LIBGAV1_RESTRICT const top_row,
                               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[3]);
  const __m128i weights = _mm_cvtepu8_epi16(Load4(kSmoothWeights));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  scale = _mm_set1_epi16(128);
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));

  __m128i y_select = _mm_set1_epi32(0x01000100);
  __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
  __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                              scaled_bottom_left_y, scaled_bottom_left_y,
                              scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x03020302);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                              scaled_bottom_left_y, scaled_bottom_left_y,
                              scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x05040504);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                              scaled_bottom_left_y, scaled_bottom_left_y,
                              scale);
  dst += stride;
  y_select = _mm_set1_epi32(0x07060706);
  weights_y = _mm_shuffle_epi8(weights, y_select);
  scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
  WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                              scaled_bottom_left_y, scaled_bottom_left_y,
                              scale);
}

void SmoothVertical16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t stride,
                               const void* LIBGAV1_RESTRICT const top_row,
                               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  scale = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical16x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
  const __m128i zero = _mm_setzero_si128();
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
  const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
  const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
  const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
  const __m128i scaled_bottom_left_lo =
      _mm_mullo_epi16(inverted_weights_lo, bottom_left);
  const __m128i scaled_bottom_left_hi =
      _mm_mullo_epi16(inverted_weights_hi, bottom_left);
  scale = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical16x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i zero = _mm_setzero_si128();
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  scale = _mm_set1_epi16(128);

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

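// The 64-high blocks walk the weight table in 16-entry chunks, deriving the
// scaled bottom-left terms per chunk instead of materializing all of them up
// front.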
void SmoothVertical16x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
  const __m128i scale = _mm_set1_epi16(256);
  const __m128i round = _mm_set1_epi16(128);
  const __m128i zero = _mm_setzero_si128();

  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      WriteSmoothDirectionalSum16(dst, top_lo, top_hi, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
  }
}

void SmoothVertical32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                               const ptrdiff_t stride,
                               const void* LIBGAV1_RESTRICT const top_row,
                               const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[7]);
  const __m128i top_lo = LoadUnaligned16(top_ptr);
  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i weights = _mm_cvtepu8_epi16(LoadLo8(kSmoothWeights + 4));
  const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
  const __m128i scaled_bottom_left =
      _mm_mullo_epi16(inverted_weights, bottom_left);
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical32x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
  const __m128i top_lo = LoadUnaligned16(top_ptr);
  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

void SmoothVertical32x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  const __m128i zero = _mm_setzero_si128();
  __m128i scale = _mm_set1_epi16(256);
  const __m128i top_lo = LoadUnaligned16(top_ptr);
  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

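// Like 16x64, the weights are consumed in 16-row chunks; each row needs two
// 16-pixel stores to span the 32-wide block.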
void SmoothVertical32x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
  const __m128i top_lo = LoadUnaligned16(top_ptr);
  const __m128i top_hi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_hi);
  const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
  const __m128i scale = _mm_set1_epi16(256);
  const __m128i round = _mm_set1_epi16(128);
  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);

    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
  }
}

void SmoothVertical64x16_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[15]);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top_lolo = LoadUnaligned16(top_ptr);
  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);

  const __m128i weights = LoadUnaligned16(kSmoothWeights + 12);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights);
  const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
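  // The weight inversions are done; reuse the scale register as the rounding
  // constant (128) for the final shift.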
  scale = _mm_set1_epi16(128);
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

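// 64-wide blocks write each row as four 16-pixel stores.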
void SmoothVertical64x32_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[31]);
  const __m128i top_lolo = LoadUnaligned16(top_ptr);
  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i weights_lo = LoadUnaligned16(kSmoothWeights + 28);
  const __m128i weights_hi = LoadUnaligned16(kSmoothWeights + 44);
  const __m128i weights1 = _mm_cvtepu8_epi16(weights_lo);
  const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
  const __m128i weights3 = _mm_cvtepu8_epi16(weights_hi);
  const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
  __m128i scale = _mm_set1_epi16(256);
  const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
  const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
  const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
  const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
  const __m128i scaled_bottom_left1 =
      _mm_mullo_epi16(inverted_weights1, bottom_left);
  const __m128i scaled_bottom_left2 =
      _mm_mullo_epi16(inverted_weights2, bottom_left);
  const __m128i scaled_bottom_left3 =
      _mm_mullo_epi16(inverted_weights3, bottom_left);
  const __m128i scaled_bottom_left4 =
      _mm_mullo_epi16(inverted_weights4, bottom_left);
  scale = _mm_set1_epi16(128);

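  // Cover the 32 rows in four 8-row passes, one per 8-weight chunk.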
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left1, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left2, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left3, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
  for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
    const __m128i y_select = _mm_set1_epi32(y_mask);
    const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
    const __m128i scaled_bottom_left_y =
        _mm_shuffle_epi8(scaled_bottom_left4, y_select);
    WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                scaled_bottom_left_y, scaled_bottom_left_y,
                                scale);
    dst += stride;
  }
}

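// As in 32x64, the 64 left weights are consumed in 16-row chunks.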
void SmoothVertical64x64_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_row,
    const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* dst = static_cast<uint8_t*>(dest);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i zero = _mm_setzero_si128();
  const __m128i bottom_left = _mm_set1_epi16(left_ptr[63]);
  const __m128i top_lolo = LoadUnaligned16(top_ptr);
  const __m128i top_lohi = LoadUnaligned16(top_ptr + 16);
  const __m128i top1 = _mm_cvtepu8_epi16(top_lolo);
  const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
  const __m128i top3 = _mm_cvtepu8_epi16(top_lohi);
  const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
  const __m128i top_hilo = LoadUnaligned16(top_ptr + 32);
  const __m128i top_hihi = LoadUnaligned16(top_ptr + 48);
  const __m128i top5 = _mm_cvtepu8_epi16(top_hilo);
  const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
  const __m128i top7 = _mm_cvtepu8_epi16(top_hihi);
  const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
  const __m128i scale = _mm_set1_epi16(256);
  const __m128i round = _mm_set1_epi16(128);
  const uint8_t* weights_base_ptr = kSmoothWeights + 60;
  for (int left_offset = 0; left_offset < 64; left_offset += 16) {
    const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
    const __m128i weights_lo = _mm_cvtepu8_epi16(weights);
    const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
    const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
    const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
    const __m128i scaled_bottom_left_lo =
        _mm_mullo_epi16(inverted_weights_lo, bottom_left);
    const __m128i scaled_bottom_left_hi =
        _mm_mullo_epi16(inverted_weights_hi, bottom_left);
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
    for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
      const __m128i y_select = _mm_set1_epi32(y_mask);
      const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
      const __m128i scaled_bottom_left_y =
          _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
      WriteSmoothDirectionalSum16(dst, top1, top2, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 16, top3, top4, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 32, top5, top6, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      WriteSmoothDirectionalSum16(dst + 48, top7, top8, weights_y, weights_y,
                                  scaled_bottom_left_y, scaled_bottom_left_y,
                                  round);
      dst += stride;
    }
  }
}

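// Registers the SSE4.1 smooth intra predictors in the 8bpp dsp table.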
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
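  // SMOOTH_PRED.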
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
      Smooth4x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
      Smooth4x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
      Smooth4x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
      Smooth8x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
      Smooth8x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
      Smooth8x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
      Smooth8x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
      SmoothWxH<16, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
      SmoothWxH<16, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
      SmoothWxH<16, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
      SmoothWxH<16, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
      SmoothWxH<16, 64>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
      SmoothWxH<32, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
      SmoothWxH<32, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
      SmoothWxH<32, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
      SmoothWxH<32, 64>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
      SmoothWxH<64, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
      SmoothWxH<64, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmooth)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
      SmoothWxH<64, 64>;
#endif
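  // SMOOTH_V_PRED.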
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
      SmoothVertical4x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
      SmoothVertical4x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
      SmoothVertical4x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
      SmoothVertical8x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
      SmoothVertical8x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
      SmoothVertical8x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
      SmoothVertical8x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
      SmoothVertical16x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
      SmoothVertical16x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
      SmoothVertical16x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
      SmoothVertical16x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
      SmoothVertical16x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
      SmoothVertical32x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
      SmoothVertical32x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
      SmoothVertical32x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
      SmoothVertical32x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
      SmoothVertical64x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
      SmoothVertical64x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothVertical)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
      SmoothVertical64x64_SSE4_1;
#endif
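  // SMOOTH_H_PRED.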
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal4x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal8x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal16x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal32x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal64x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal64x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorSmoothHorizontal)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
      SmoothHorizontal64x64_SSE4_1;
#endif
}

}  // namespace
}  // namespace low_bitdepth

void IntraPredSmoothInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_TARGETING_SSE4_1

namespace libgav1 {
namespace dsp {

void IntraPredSmoothInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1

#endif  // LIBGAV1_TARGETING_SSE4_1