1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/intrapred_cfl.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1
19
20 #include <smmintrin.h>
21
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/x86/common_sse4.h"
30 #include "src/utils/common.h"
31 #include "src/utils/compiler_attributes.h"
32 #include "src/utils/constants.h"
33
34 namespace libgav1 {
35 namespace dsp {
36 namespace {
37
// Broadcasts the top 32-bit lane of |row| to every lane, i.e. duplicates the
// last two 16-bit samples across the whole vector.
inline __m128i LastRowSamples(const __m128i row) {
  return _mm_shuffle_epi32(row, _MM_SHUFFLE(3, 3, 3, 3));
}
42
// Broadcasts the final (highest) 16-bit value of |row| to every lane.
inline __m128i LastRowResult(const __m128i row) {
  // First duplicate word 7 across words 4..7, then spread that dword to all.
  const __m128i hi_dup = _mm_shufflehi_epi16(row, _MM_SHUFFLE(3, 3, 3, 3));
  return _mm_shuffle_epi32(hi_dup, _MM_SHUFFLE(3, 3, 3, 3));
}
48
49 // Takes in two sums of input row pairs, and completes the computation for two
50 // output rows.
StoreLumaResults4_420(const __m128i vertical_sum0,const __m128i vertical_sum1,int16_t * luma_ptr)51 inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
52 const __m128i vertical_sum1,
53 int16_t* luma_ptr) {
54 __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
55 result = _mm_slli_epi16(result, 1);
56 StoreLo8(luma_ptr, result);
57 StoreHi8(luma_ptr + kCflLumaBufferStride, result);
58 return result;
59 }
60
61 // Takes two halves of a vertically added pair of rows and completes the
62 // computation for one output row.
StoreLumaResults8_420(const __m128i vertical_sum0,const __m128i vertical_sum1,int16_t * luma_ptr)63 inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
64 const __m128i vertical_sum1,
65 int16_t* luma_ptr) {
66 __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
67 result = _mm_slli_epi16(result, 1);
68 StoreUnaligned16(luma_ptr, result);
69 return result;
70 }
71
72 } // namespace
73
74 namespace low_bitdepth {
75 namespace {
76
77 //------------------------------------------------------------------------------
78 // CflIntraPredictor_SSE4_1
79
CflPredictUnclipped(const __m128i * input,__m128i alpha_q12,__m128i alpha_sign,__m128i dc_q0)80 inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
81 __m128i alpha_sign, __m128i dc_q0) {
82 const __m128i ac_q3 = LoadUnaligned16(input);
83 const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
84 __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
85 scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
86 return _mm_add_epi16(scaled_luma_q0, dc_q0);
87 }
88
// Applies the CfL predictor to a width x height chroma block:
// chroma = clip(dc + ((alpha * luma_ac) >> 6)), where |luma| holds the
// subsampled, mean-removed luma values in Q3 and |alpha| is the signed CfL
// scaling parameter. dst[0] is expected to already hold the DC prediction
// value for the block.
template <int width, int height>
void CflIntraPredictor_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i alpha_sign = _mm_set1_epi16(alpha);
  // abs(alpha) << 9 combined with the (x*y + 2^14) >> 15 of mulhrs realizes
  // (abs(alpha) * luma_q3) >> 6 with rounding.
  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  auto* row = reinterpret_cast<const __m128i*>(luma);
  const int kCflLumaBufferStrideLog2_16i = 5;
  // One __m128i holds 8 (2**3) int16 values, so the stride measured in
  // vectors is the int16 stride minus 3 in log2 terms.
  const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
  const __m128i dc_val = _mm_set1_epi16(dst[0]);
  do {
    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
    if (width < 16) {
      // packus saturates the 16-bit results into [0, 255].
      res = _mm_packus_epi16(res, res);
      if (width == 4) {
        Store4(dst, res);
      } else {
        StoreLo8(dst, res);
      }
    } else {
      __m128i next =
          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
      res = _mm_packus_epi16(res, next);
      StoreUnaligned16(dst, res);
      if (width == 32) {
        res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
        next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
        res = _mm_packus_epi16(res, next);
        StoreUnaligned16(dst + 16, res);
      }
    }
    dst += stride;
  } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end);
}
126
// 4:4:4 "subsampler" for 4-wide blocks: each visible luma pixel is widened
// to 16 bits, scaled to Q3 and stored; then the block average is subtracted
// so |luma| holds zero-mean (AC) values. When |is_inside| is false the last
// visible row is replicated down to the full block height.
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  static_assert(block_height_log2 <= 4, "");
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  const auto* src = static_cast<const uint8_t*>(source);
  __m128i sum = _mm_setzero_si128();
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i samples;
  int y = 0;
  do {
    // Two source rows (4 bytes each) are packed into one vector per
    // iteration: row y in the low dword, row y+1 in the second dword.
    samples = Load4(src);
    src += stride;
    int src_bytes;
    // memcpy performs a safe unaligned read of the next row's 4 bytes.
    memcpy(&src_bytes, src, 4);
    samples = _mm_insert_epi32(samples, src_bytes, 1);
    src += stride;
    // Widen to 16 bits and scale to Q3.
    samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3);
    StoreLo8(luma_ptr, samples);
    luma_ptr += kCflLumaBufferStride;
    StoreHi8(luma_ptr, samples);
    luma_ptr += kCflLumaBufferStride;

    // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for
    // 4XH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits.
    sum = _mm_add_epi16(sum, samples);
    y += 2;
  } while (y < visible_height);

  if (!is_inside) {
    // Replicate the 2 high lanes (the last visible row) into both halves, so
    // the same stores fill the remaining rows with that row's values.
    samples = _mm_shuffle_epi32(samples, 0xee);
    do {
      StoreLo8(luma_ptr, samples);
      luma_ptr += kCflLumaBufferStride;
      StoreHi8(luma_ptr, samples);
      luma_ptr += kCflLumaBufferStride;
      sum = _mm_add_epi16(sum, samples);
      y += 2;
    } while (y < block_height);
  }

  // Widen the 16-bit partial sums to 32 bits and reduce across lanes.
  __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero);
  sum = _mm_cvtepu16_epi32(sum);
  sum = _mm_add_epi32(sum, sum_tmp);
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  __m128i averages = RightShiftWithRounding_U32(
      sum, block_height_log2 + 2 /* log2 of width 4 */);
  // Broadcast the average into the low four 16-bit lanes; only the low 8
  // bytes are used by the 4-wide subtraction below.
  averages = _mm_shufflelo_epi16(averages, 0);
  luma_ptr = luma[0];
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadLo8(luma_ptr);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
188
189 template <int block_height_log2>
CflSubsampler444_4xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)190 void CflSubsampler444_4xH_SSE4_1(
191 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
192 const int max_luma_width, const int max_luma_height,
193 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
194 static_assert(block_height_log2 <= 4, "");
195 assert(max_luma_width >= 4);
196 assert(max_luma_height >= 4);
197 static_cast<void>(max_luma_width);
198 constexpr int block_height = 1 << block_height_log2;
199
200 if (block_height <= max_luma_height) {
201 CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
202 source, stride);
203 } else {
204 CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
205 source, stride);
206 }
207 }
208
// 4:4:4 "subsampler" for 8-wide blocks: pixels are widened to 16 bits,
// scaled to Q3 and stored, then the block average is subtracted. When
// |inside| is false, the rightmost visible pixel is replicated across the
// invisible columns and the last visible row is replicated downward.
template <int block_height_log2, bool inside>
void CflSubsampler444_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_width, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  static_assert(block_height_log2 <= 5, "");
  const int block_height = 1 << block_height_log2, block_width = 8;
  const int visible_height = max_luma_height;
  const int invisible_width = inside ? 0 : block_width - max_luma_width;
  const int visible_width = max_luma_width;
  // Byte mask selecting the invisible tail columns (plus the unused high
  // half of the 16-byte vector) that get replaced by the border pixel below.
  const __m128i blend_mask =
      inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width);
  // pshufb control that broadcasts bytes 0-1 (the low 16-bit lane) to every
  // 16-bit lane; used to splat the average at the end.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const auto* src = static_cast<const uint8_t*>(source);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  // Since the maximum height is 32, if we split them by parity, each one only
  // needs to accumulate 16 rows. Just like the calculation done in 4XH, we can
  // store them in 16 bits without casting to 32 bits.
  __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128();
  __m128i sum;
  __m128i samples1;

  int y = 0;
  do {
    __m128i samples0 = LoadLo8(src);
    if (!inside) {
      // Replicate the last visible pixel into the invisible columns.
      const __m128i border0 =
          _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
      samples0 = _mm_blendv_epi8(samples0, border0, blend_mask);
    }
    src += stride;
    // Widen to 16 bits and scale to Q3.
    samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3);
    StoreUnaligned16(luma_ptr, samples0);
    luma_ptr += kCflLumaBufferStride;

    sum_even = _mm_add_epi16(sum_even, samples0);

    samples1 = LoadLo8(src);
    if (!inside) {
      const __m128i border1 =
          _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
      samples1 = _mm_blendv_epi8(samples1, border1, blend_mask);
    }
    src += stride;
    samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3);
    StoreUnaligned16(luma_ptr, samples1);
    luma_ptr += kCflLumaBufferStride;

    sum_odd = _mm_add_epi16(sum_odd, samples1);
    y += 2;
  } while (y < visible_height);

  if (!inside) {
    // Replicate the last visible row (still held in |samples1|) down to the
    // full block height. Only the total matters, so the split between
    // sum_even and sum_odd need not follow row parity here.
    for (int y = visible_height; y < block_height; y += 2) {
      sum_even = _mm_add_epi16(sum_even, samples1);
      StoreUnaligned16(luma_ptr, samples1);
      luma_ptr += kCflLumaBufferStride;

      sum_odd = _mm_add_epi16(sum_odd, samples1);
      StoreUnaligned16(luma_ptr, samples1);
      luma_ptr += kCflLumaBufferStride;
    }
  }

  // Widen both 16-bit accumulators to 32 bits and reduce across lanes.
  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero),
                      _mm_cvtepu16_epi32(sum_even));
  sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero));
  sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd));

  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  __m128i averages = RightShiftWithRounding_U32(
      sum, block_height_log2 + 3 /* log2 of width 8 */);
  averages = _mm_shuffle_epi8(averages, dup16);
  luma_ptr = luma[0];
  // Subtract the average so |luma| holds zero-mean (AC) values.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
291
292 template <int block_height_log2>
CflSubsampler444_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)293 void CflSubsampler444_8xH_SSE4_1(
294 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
295 const int max_luma_width, const int max_luma_height,
296 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
297 static_assert(block_height_log2 <= 5, "");
298 assert(max_luma_width >= 4);
299 assert(max_luma_height >= 4);
300 const int block_height = 1 << block_height_log2;
301 const int block_width = 8;
302
303 const int horz_inside = block_width <= max_luma_width;
304 const int vert_inside = block_height <= max_luma_height;
305 if (horz_inside && vert_inside) {
306 CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(
307 luma, max_luma_width, max_luma_height, source, stride);
308 } else {
309 CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(
310 luma, max_luma_width, max_luma_height, source, stride);
311 }
312 }
313
// This function will only work for block_width 16 and 32.
// 4:4:4 "subsampler": copies the visible luma into |luma| in Q3 and
// subtracts the block average. When |inside| is false, the rightmost visible
// pixel is replicated across the invisible columns and the last visible row
// is replicated down to the full block height.
template <int block_width_log2, int block_height_log2, bool inside>
void CflSubsampler444_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_width, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
  static_assert(block_height_log2 <= 5, "");
  assert(max_luma_width >= 4);
  assert(max_luma_height >= 4);
  const int block_height = 1 << block_height_log2;
  const int block_width = 1 << block_width_log2;

  const int visible_height = max_luma_height;
  // Visible/invisible split for the first 16 columns, and separately for
  // columns 16-31 when block_width is 32; each group has its own byte mask.
  const int visible_width_16 = inside ? 16 : std::min(16, max_luma_width);
  const int invisible_width_16 = 16 - visible_width_16;
  const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16);
  const int visible_width_32 = inside ? 32 : max_luma_width;
  const int invisible_width_32 = 32 - visible_width_32;
  const __m128i blend_mask_32 =
      MaskHighNBytes(std::min(16, invisible_width_32));

  // pshufb control that broadcasts bytes 0-1 (the low 16-bit lane) to every
  // 16-bit lane; used to splat the average at the end.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i zero = _mm_setzero_si128();
  const auto* src = static_cast<const uint8_t*>(source);
  int16_t* luma_ptr = luma[0];
  __m128i sum = _mm_setzero_si128();

  __m128i samples0, samples1;
  __m128i samples2, samples3;
  __m128i inner_sum_lo, inner_sum_hi;
  int y = 0;
  do {
    // We can load uninitialized values here. Even though they are then masked
    // off by blendv, MSAN doesn't model that behavior.
    __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);

    if (!inside) {
      // Replicate the last visible pixel over the invisible columns.
      const __m128i border16 =
          _mm_set1_epi8(static_cast<int8_t>(src[visible_width_16 - 1]));
      samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16);
    }
    // Widen to 16 bits and scale to Q3.
    samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
    samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);

    StoreUnaligned16(luma_ptr, samples0);
    StoreUnaligned16(luma_ptr + 8, samples1);
    __m128i inner_sum = _mm_add_epi16(samples0, samples1);

    if (block_width == 32) {
      // We can load uninitialized values here. Even though they are then masked
      // off by blendv, MSAN doesn't model that behavior.
      __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
      if (!inside) {
        const __m128i border32 =
            _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
        samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32);
      }
      samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
      samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);

      StoreUnaligned16(luma_ptr + 16, samples2);
      StoreUnaligned16(luma_ptr + 24, samples3);
      inner_sum = _mm_add_epi16(samples2, inner_sum);
      inner_sum = _mm_add_epi16(samples3, inner_sum);
    }

    // Widen the per-row sum to 32 bits before accumulating; at these block
    // sizes a 16-bit running total could overflow.
    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
    sum = _mm_add_epi32(sum, inner_sum_lo);
    sum = _mm_add_epi32(sum, inner_sum_hi);
    luma_ptr += kCflLumaBufferStride;
    src += stride;
  } while (++y < visible_height);

  if (!inside) {
    // Replicate the last visible row, still held in samples0..3 and
    // inner_sum_lo/hi, down to the full block height.
    for (int y = visible_height; y < block_height;
         luma_ptr += kCflLumaBufferStride, ++y) {
      sum = _mm_add_epi32(sum, inner_sum_lo);
      StoreUnaligned16(luma_ptr, samples0);
      sum = _mm_add_epi32(sum, inner_sum_hi);
      StoreUnaligned16(luma_ptr + 8, samples1);
      if (block_width == 32) {
        StoreUnaligned16(luma_ptr + 16, samples2);
        StoreUnaligned16(luma_ptr + 24, samples3);
      }
    }
  }

  // Horizontal reduction to a single 32-bit total.
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  __m128i averages =
      RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2);
  averages = _mm_shuffle_epi8(averages, dup16);
  luma_ptr = luma[0];
  // Subtract the average so |luma| holds zero-mean (AC) values.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    for (int x = 0; x < block_width; x += 8) {
      __m128i samples = LoadUnaligned16(&luma_ptr[x]);
      StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages));
    }
  }
}
417
418 template <int block_width_log2, int block_height_log2>
CflSubsampler444_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)419 void CflSubsampler444_SSE4_1(
420 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
421 const int max_luma_width, const int max_luma_height,
422 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
423 static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
424 static_assert(block_height_log2 <= 5, "");
425 assert(max_luma_width >= 4);
426 assert(max_luma_height >= 4);
427
428 const int block_height = 1 << block_height_log2;
429 const int block_width = 1 << block_width_log2;
430 const int horz_inside = block_width <= max_luma_width;
431 const int vert_inside = block_height <= max_luma_height;
432 if (horz_inside && vert_inside) {
433 CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, true>(
434 luma, max_luma_width, max_luma_height, source, stride);
435 } else {
436 CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, false>(
437 luma, max_luma_width, max_luma_height, source, stride);
438 }
439 }
440
// 4:2:0 subsampler for 4-wide blocks: each output value is the sum of a 2x2
// luma patch scaled by 2, i.e. the patch average in Q3. The block average is
// subtracted at the end so |luma| holds zero-mean (AC) values.
template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint8_t*>(source);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  // Every output row consumes two luma rows.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = 0;
  // Each iteration consumes 8 luma rows and produces 4 output rows.
  do {
    // Note that double sampling and converting to 16bit makes a row fill the
    // vector.
    const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);

    const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
    // Horizontal add + scale completes two output rows at once.
    __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
    luma_ptr += kCflLumaBufferStride << 1;

    const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);

    const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
    sum = _mm_add_epi16(
        sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
    luma_ptr += kCflLumaBufferStride << 1;

    // Widen to 32 bits so the running total cannot overflow.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y += 4;
  } while (y < luma_height);
  // Replicate the last computed output row to cover rows past the visible
  // luma.
  const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
  const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
  for (; y < block_height; ++y) {
    StoreLo8(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;

    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Reduce across lanes to the block total.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_U32(
      final_sum, block_height_log2 + 2 /*log2 of width 4*/);

  // Broadcast the average into the low four 16-bit lanes; only the low 8
  // bytes are used by the 4-wide subtraction below.
  averages = _mm_shufflelo_epi16(averages, 0);
  luma_ptr = luma[0];
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadLo8(luma_ptr);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
510
// 4:2:0 subsampler for 8-wide blocks. The compile-time |max_luma_width| (8
// or 16) selects whether the right half of each luma row is loaded from
// memory or synthesized by replicating the last two loaded samples.
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint8_t*>(source);
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  int16_t* luma_ptr = luma[0];
  // Every output row consumes two luma rows.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = 0;

  // Each iteration consumes 8 luma rows and produces 4 output rows.
  do {
    const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row01 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row00);
    src += stride;
    const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row11 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row10);
    src += stride;
    const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
    // Horizontal add + scale completes one 8-wide output row.
    __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row21 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row20);
    src += stride;
    const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row31 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row30);
    src += stride;
    const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
    const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row41 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row40);
    src += stride;
    const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row51 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row50);
    src += stride;
    const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
    const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row61 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row60);
    src += stride;
    const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row71 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row70);
    src += stride;
    const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
    const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    // Widen to 32 bits so the running total cannot overflow.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y += 4;
  } while (y < luma_height);
  // Duplicate the final row downward to the end after max_luma_height.
  // The stored values are 2x2 sums scaled by 2 (at most 2040), so the signed
  // widen below behaves the same as a zero-extension.
  const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
  const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
  const __m128i final_fill_to_sum1 =
      _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
  const __m128i final_fill_to_sum =
      _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
  for (; y < block_height; ++y) {
    StoreUnaligned16(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;

    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Reduce across lanes to the block total.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_height_log2 + 3 /*log2 of width 8*/);

  // Broadcast the 16-bit average to all eight lanes, then subtract it so
  // |luma| holds zero-mean (AC) values.
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);
  luma_ptr = luma[0];
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
619
620 template <int block_height_log2>
CflSubsampler420_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)621 void CflSubsampler420_8xH_SSE4_1(
622 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
623 const int max_luma_width, const int max_luma_height,
624 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
625 if (max_luma_width == 8) {
626 CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
627 luma, max_luma_width, max_luma_height, source, stride);
628 } else {
629 CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
630 luma, max_luma_width, max_luma_height, source, stride);
631 }
632 }
633
// 4:2:0 subsampler for 16- and 32-wide blocks. The compile-time
// |max_luma_width| (a multiple of 8, at most 32) selects which 8-sample
// groups of each luma row are loaded from memory and which are synthesized
// by replicating the last loaded samples. For block_width 32 only 16 output
// values per row are computed; columns 16-31 replicate the last result.
template <int block_width_log2, int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const auto* src = static_cast<const uint8_t*>(source);
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  const int block_height = 1 << block_height_log2;
  // Every output row consumes two luma rows.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  static_assert(max_luma_width <= 32, "");

  int16_t* luma_ptr = luma[0];
  __m128i final_row_result;
  // Begin first y section, covering width up to 32.
  int y = 0;
  do {
    const uint8_t* src_next = src + stride;
    const __m128i samples_row0_lo = LoadUnaligned16(src);
    const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo);
    // Groups beyond the visible luma are synthesized by replicating the last
    // two visible samples.
    const __m128i samples_row01 = (max_luma_width >= 16)
                                      ? _mm_unpackhi_epi8(samples_row0_lo, zero)
                                      : LastRowSamples(samples_row00);
    const __m128i samples_row0_hi = LoadUnaligned16(src + 16);
    const __m128i samples_row02 = (max_luma_width >= 24)
                                      ? _mm_cvtepu8_epi16(samples_row0_hi)
                                      : LastRowSamples(samples_row01);
    const __m128i samples_row03 = (max_luma_width == 32)
                                      ? _mm_unpackhi_epi8(samples_row0_hi, zero)
                                      : LastRowSamples(samples_row02);
    const __m128i samples_row1_lo = LoadUnaligned16(src_next);
    const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo);
    const __m128i samples_row11 = (max_luma_width >= 16)
                                      ? _mm_unpackhi_epi8(samples_row1_lo, zero)
                                      : LastRowSamples(samples_row10);
    const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16);
    const __m128i samples_row12 = (max_luma_width >= 24)
                                      ? _mm_cvtepu8_epi16(samples_row1_hi)
                                      : LastRowSamples(samples_row11);
    const __m128i samples_row13 = (max_luma_width == 32)
                                      ? _mm_unpackhi_epi8(samples_row1_hi, zero)
                                      : LastRowSamples(samples_row12);
    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
    // Two StoreLumaResults8_420 calls complete 16 output values per row.
    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
    final_row_result =
        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
    sum = _mm_add_epi16(sum, final_row_result);
    if (block_width_log2 == 5) {
      // For width 32, columns 16-31 replicate the last computed value; the
      // two adds account for those 16 extra values (8 lanes each).
      const __m128i wide_fill = LastRowResult(final_row_result);
      sum = _mm_add_epi16(sum, wide_fill);
      sum = _mm_add_epi16(sum, wide_fill);
    }
    // Widen to 32 bits so the running total cannot overflow.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    src += stride << 1;
    luma_ptr += kCflLumaBufferStride;
  } while (++y < luma_height);

  // Begin second y section: replicate the last computed row downward.
  if (y < block_height) {
    const __m128i final_fill0 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
    const __m128i final_fill1 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
    __m128i wide_fill;

    if (block_width_log2 == 5) {
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      wide_fill =
          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
    }

    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
    const __m128i final_fill_to_sum =
        _mm_add_epi32(final_inner_sum0, final_inner_sum1);

    do {
      StoreUnaligned16(luma_ptr, final_fill0);
      StoreUnaligned16(luma_ptr + 8, final_fill1);
      if (block_width_log2 == 5) {
        final_sum = _mm_add_epi32(final_sum, wide_fill);
      }
      luma_ptr += kCflLumaBufferStride;

      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
    } while (++y < block_height);
  }  // End second y section.

  // Reduce across lanes to the block total.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_width_log2 + block_height_log2);
  // Broadcast the 16-bit average to all eight lanes.
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);

  luma_ptr = luma[0];
  // Subtract the average; for width 32, columns 16-31 of each output row
  // replicate the last AC value of that row.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples0 = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
    final_row_result = _mm_sub_epi16(samples1, averages);
    StoreUnaligned16(luma_ptr + 8, final_row_result);
    if (block_width_log2 == 5) {
      const __m128i wide_fill = LastRowResult(final_row_result);
      StoreUnaligned16(luma_ptr + 16, wide_fill);
      StoreUnaligned16(luma_ptr + 24, wide_fill);
    }
  }
}
750
751 template <int block_width_log2, int block_height_log2>
CflSubsampler420_WxH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)752 void CflSubsampler420_WxH_SSE4_1(
753 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
754 const int max_luma_width, const int max_luma_height,
755 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
756 switch (max_luma_width) {
757 case 8:
758 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
759 luma, max_luma_width, max_luma_height, source, stride);
760 return;
761 case 16:
762 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
763 luma, max_luma_width, max_luma_height, source, stride);
764 return;
765 case 24:
766 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
767 luma, max_luma_width, max_luma_height, source, stride);
768 return;
769 default:
770 assert(max_luma_width == 32);
771 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
772 luma, max_luma_width, max_luma_height, source, stride);
773 return;
774 }
775 }
776
// Registers the SSE4.1 CfL (chroma-from-luma) subsamplers and intra
// predictors in the 8bpp DSP table. Each assignment is individually guarded
// so that builds which disable a given SIMD entry keep the default
// implementation for that transform size.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  // 4:2:0 subsamplers. 4xH and 8xH blocks use dedicated helpers templated on
  // log2(height); wider blocks share the WxH template parameterized on
  // log2(width) and log2(height).
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 5>;
#endif

  // 4:4:4 subsamplers, same width-based grouping as the 4:2:0 set above.
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<5, 5>;
#endif
  // CfL intra predictors, templated directly on pixel width and height.
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x16] =
      CflIntraPredictor_SSE4_1<4, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x16] =
      CflIntraPredictor_SSE4_1<8, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x32] =
      CflIntraPredictor_SSE4_1<8, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x4] =
      CflIntraPredictor_SSE4_1<16, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x8] =
      CflIntraPredictor_SSE4_1<16, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x16] =
      CflIntraPredictor_SSE4_1<16, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x32] =
      CflIntraPredictor_SSE4_1<16, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x8] =
      CflIntraPredictor_SSE4_1<32, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x16] =
      CflIntraPredictor_SSE4_1<32, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x32] =
      CflIntraPredictor_SSE4_1<32, 32>;
#endif
}
946
947 } // namespace
948 } // namespace low_bitdepth
949
950 #if LIBGAV1_MAX_BITDEPTH >= 10
951 namespace high_bitdepth {
952 namespace {
953
954 //------------------------------------------------------------------------------
955 // CflIntraPredictor_10bpp_SSE4_1
956
CflPredictUnclipped(const __m128i * input,__m128i alpha_q12,__m128i alpha_sign,__m128i dc_q0)957 inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
958 __m128i alpha_sign, __m128i dc_q0) {
959 const __m128i ac_q3 = LoadUnaligned16(input);
960 const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
961 __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
962 scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
963 return _mm_add_epi16(scaled_luma_q0, dc_q0);
964 }
965
// Clamps every signed 16-bit lane of |x| to the inclusive range [min, max].
// Assumes min <= max lane-wise.
inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
  const __m128i upper_bounded = _mm_min_epi16(x, max);
  return _mm_max_epi16(upper_bounded, min);
}
969
// Produces the final 10bpp CfL prediction for a width x height block:
// dst = clip(dc + alpha-scaled AC luma), where the AC values come from the
// |luma| buffer previously filled by a subsampler. The DC prediction is read
// from dst[0], so the caller must have stored the DC prediction into |dest|
// before calling.
template <int width, int height>
void CflIntraPredictor_10bpp_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  // kCflLumaBufferStride is 1 << 5 int16_t values per row.
  constexpr int kCflLumaBufferStrideLog2_16i = 5;
  // Each __m128i covers 8 int16_t values, so the row stride in __m128i units
  // is 1 << (5 - 3).
  constexpr int kCflLumaBufferStrideLog2_128i =
      kCflLumaBufferStrideLog2_16i - 3;
  constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
  auto* dst = static_cast<uint16_t*>(dest);
  const __m128i alpha_sign = _mm_set1_epi16(alpha);
  // |alpha| << 9 positions the magnitude for the rounding high multiply in
  // CflPredictUnclipped.
  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  // Walk the luma buffer in whole-register steps; kRowIncr advances one
  // buffer row per destination row regardless of |width|.
  auto* row = reinterpret_cast<const __m128i*>(luma);
  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
  // The DC prediction was written to the destination by a prior predictor
  // call; broadcast it to all lanes.
  const __m128i dc_val = _mm_set1_epi16(dst[0]);
  const __m128i min = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);

  // |stride| is given in bytes; convert to uint16_t units.
  stride >>= 1;

  do {
    // First 8 lanes cover widths 4 and 8; wider blocks continue with
    // additional 8-lane groups from the same buffer row.
    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
    res = ClipEpi16(res, min, max);
    if (width == 4) {
      StoreLo8(dst, res);
    } else if (width == 8) {
      StoreUnaligned16(dst, res);
    } else if (width == 16) {
      StoreUnaligned16(dst, res);
      const __m128i res_1 =
          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
    } else {
      // width == 32: four 8-lane groups per row.
      StoreUnaligned16(dst, res);
      const __m128i res_1 =
          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
      const __m128i res_2 =
          CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
      const __m128i res_3 =
          CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
    }

    dst += stride;
  } while ((row += kRowIncr) < row_end);
}
1018
// 4:4:4 luma subsampler for 4-wide blocks (10bpp). Copies the visible luma
// samples (scaled by 8 for extra precision), subtracts the block average, and
// replicates the last visible row downward when the block extends past
// |max_luma_height| (the !is_inside case).
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  static_assert(block_height_log2 <= 4, "");
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  __m128i zero = _mm_setzero_si128();
  __m128i sum = zero;
  __m128i samples;
  int y = visible_height;

  // Accumulate the sum of all visible samples, two 4-sample rows per
  // register (row y in the low half, row y+1 in the high half).
  do {
    samples = LoadHi8(LoadLo8(src), src + src_stride);
    src += src_stride << 1;
    sum = _mm_add_epi16(sum, samples);
    y -= 2;
  } while (y != 0);

  if (!is_inside) {
    y = visible_height;
    // Duplicate the last visible row into both halves so each loop
    // iteration accounts for two replicated rows.
    samples = _mm_unpackhi_epi64(samples, samples);
    do {
      sum = _mm_add_epi16(sum, samples);
      y += 2;
    } while (y < block_height);
  }

  // Horizontal reduction of the eight 16-bit partial sums into one 32-bit
  // total in the low lane.
  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  // Here the left shift by 3 (to increase precision) is nullified in right
  // shift ((log2 of width 4) + 1).
  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
  averages = _mm_shufflelo_epi16(averages, 0);
  src = static_cast<const uint16_t*>(source);
  luma_ptr = luma[0];
  y = visible_height;
  // Second pass: store (sample << 3) - average for every visible row.
  do {
    samples = LoadLo8(src);
    samples = _mm_slli_epi16(samples, 3);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
    src += src_stride;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // Replicate last line
    do {
      StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
      luma_ptr += kCflLumaBufferStride;
    } while (++y < block_height);
  }
}
1079
1080 template <int block_height_log2>
CflSubsampler444_4xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1081 void CflSubsampler444_4xH_SSE4_1(
1082 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1083 const int max_luma_width, const int max_luma_height,
1084 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1085 static_cast<void>(max_luma_width);
1086 static_cast<void>(max_luma_height);
1087 static_assert(block_height_log2 <= 4, "");
1088 assert(max_luma_width >= 4);
1089 assert(max_luma_height >= 4);
1090 const int block_height = 1 << block_height_log2;
1091
1092 if (block_height <= max_luma_height) {
1093 CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
1094 source, stride);
1095 } else {
1096 CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
1097 source, stride);
1098 }
1099 }
1100
// 4:4:4 luma subsampler for 8-wide blocks (10bpp). One full register per
// row. Stores (sample << 3) - block average into |luma|, replicating the
// last visible row downward when the block extends past |max_luma_height|
// (the !is_inside case).
template <int block_height_log2, bool is_inside>
void CflSubsampler444_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  // Shuffle control that broadcasts the low 16-bit lane to all lanes.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i sum = zero;
  __m128i samples;
  int y = visible_height;

  // Accumulate the sum of all visible rows.
  do {
    samples = LoadUnaligned16(src);
    src += src_stride;
    sum = _mm_add_epi16(sum, samples);
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // |samples| still holds the last visible row; count it once per row the
    // block extends beyond the visible area.
    do {
      sum = _mm_add_epi16(sum, samples);
    } while (++y < block_height);
  }

  // Horizontal reduction to a single 32-bit total in the low lane.
  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  // Here the left shift by 3 (to increase precision) is nullified in right
  // shift (log2 of width 8).
  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
  averages = _mm_shuffle_epi8(averages, dup16);

  src = static_cast<const uint16_t*>(source);
  luma_ptr = luma[0];
  y = visible_height;
  // Second pass: store (sample << 3) - average for every visible row.
  do {
    samples = LoadUnaligned16(src);
    samples = _mm_slli_epi16(samples, 3);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
    src += src_stride;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // Replicate last line
    do {
      StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
      luma_ptr += kCflLumaBufferStride;
    } while (++y < block_height);
  }
}
1159
1160 template <int block_height_log2>
CflSubsampler444_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1161 void CflSubsampler444_8xH_SSE4_1(
1162 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1163 const int max_luma_width, const int max_luma_height,
1164 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1165 static_cast<void>(max_luma_width);
1166 static_cast<void>(max_luma_height);
1167 static_assert(block_height_log2 <= 5, "");
1168 assert(max_luma_width >= 4);
1169 assert(max_luma_height >= 4);
1170 const int block_height = 1 << block_height_log2;
1171 const int block_width = 8;
1172
1173 const int horz_inside = block_width <= max_luma_width;
1174 const int vert_inside = block_height <= max_luma_height;
1175 if (horz_inside && vert_inside) {
1176 CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
1177 source, stride);
1178 } else {
1179 CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
1180 source, stride);
1181 }
1182 }
1183
// 4:4:4 luma subsampler for 16- and 32-wide blocks (10bpp). Horizontally,
// samples beyond |max_luma_width| are filled by replicating the last visible
// sample (LastRowResult); vertically, rows beyond |max_luma_height| replicate
// the last visible row (the !is_inside case). Stores (sample << 3) - average.
template <int block_width_log2, int block_height_log2, bool is_inside>
void CflSubsampler444_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_width, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  const int block_width = 1 << block_width_log2;
  // Shuffle control that broadcasts the low 16-bit lane to all lanes.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i zero = _mm_setzero_si128();
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  __m128i sum = zero;
  __m128i inner_sum_lo, inner_sum_hi;
  // One register per 8-sample group across the row (up to 4 for width 32).
  __m128i samples[4];
  int y = visible_height;

  // First pass: sum all samples of each visible row, substituting replicated
  // edge samples for groups past |max_luma_width|.
  do {
    samples[0] = LoadUnaligned16(src);
    samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
                                        : LastRowResult(samples[0]);
    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
    if (block_width == 32) {
      samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
                                          : LastRowResult(samples[1]);
      samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
                                          : LastRowResult(samples[2]);

      inner_sum = _mm_add_epi16(samples[2], inner_sum);
      inner_sum = _mm_add_epi16(samples[3], inner_sum);
    }
    // Widen to 32 bits before accumulating to avoid 16-bit overflow.
    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
    sum = _mm_add_epi32(sum, inner_sum_lo);
    sum = _mm_add_epi32(sum, inner_sum_hi);
    src += src_stride;
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // |samples| still holds the last visible row; add its row sum once per
    // row the block extends beyond the visible area.
    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
    if (block_width == 32) {
      inner_sum = _mm_add_epi16(samples[2], inner_sum);
      inner_sum = _mm_add_epi16(samples[3], inner_sum);
    }
    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
    do {
      sum = _mm_add_epi32(sum, inner_sum_lo);
      sum = _mm_add_epi32(sum, inner_sum_hi);
    } while (++y < block_height);
  }

  // Horizontal reduction to a single 32-bit total in the low lane.
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  // Here the left shift by 3 (to increase precision) is subtracted in right
  // shift factor (block_width_log2 + block_height_log2 - 3).
  __m128i averages =
      RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
  averages = _mm_shuffle_epi8(averages, dup16);

  src = static_cast<const uint16_t*>(source);
  // Holds the most recent visible (already shifted) group for horizontal
  // replication.
  __m128i samples_ext = zero;
  luma_ptr = luma[0];
  y = visible_height;
  // Second pass: store (sample << 3) - average, replicating the last visible
  // sample across groups past |max_luma_width|.
  do {
    int idx = 0;
    for (int x = 0; x < block_width; x += 8) {
      if (max_luma_width > x) {
        samples[idx] = LoadUnaligned16(&src[x]);
        samples[idx] = _mm_slli_epi16(samples[idx], 3);
        samples_ext = samples[idx];
      } else {
        samples[idx] = LastRowResult(samples_ext);
      }
      StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
    }
    src += src_stride;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // Replicate last line
    do {
      int idx = 0;
      for (int x = 0; x < block_width; x += 8) {
        StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
      }
      luma_ptr += kCflLumaBufferStride;
    } while (++y < block_height);
  }
}
1279
1280 template <int block_width_log2, int block_height_log2>
CflSubsampler444_WxH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1281 void CflSubsampler444_WxH_SSE4_1(
1282 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1283 const int max_luma_width, const int max_luma_height,
1284 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1285 static_assert(block_width_log2 == 4 || block_width_log2 == 5,
1286 "This function will only work for block_width 16 and 32.");
1287 static_assert(block_height_log2 <= 5, "");
1288 assert(max_luma_width >= 4);
1289 assert(max_luma_height >= 4);
1290
1291 const int block_height = 1 << block_height_log2;
1292 const int vert_inside = block_height <= max_luma_height;
1293 if (vert_inside) {
1294 CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
1295 luma, max_luma_width, max_luma_height, source, stride);
1296 } else {
1297 CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
1298 luma, max_luma_width, max_luma_height, source, stride);
1299 }
1300 }
1301
// 4:2:0 luma subsampler for 4-wide chroma blocks (10bpp). Each output value
// is the sum of a 2x2 luma neighborhood scaled by 2 (done inside
// StoreLumaResults4_420), giving the AC samples in q3 precision. Rows past
// the visible luma area are filled by replicating the last computed row,
// then the block average is subtracted from every output.
template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  // Each output row consumes two luma rows.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = luma_height;

  // Process four output rows (eight luma rows) per iteration.
  do {
    const __m128i samples_row0 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row1 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);

    const __m128i samples_row2 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row3 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
    // Horizontal pairwise add + << 1 yields two 4-wide output rows.
    __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
    luma_ptr += kCflLumaBufferStride << 1;

    const __m128i samples_row4 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row5 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);

    const __m128i samples_row6 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row7 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
    sum = _mm_add_epi16(
        sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
    luma_ptr += kCflLumaBufferStride << 1;

    // Widen to 32 bits before accumulating to avoid overflow.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y -= 4;
  } while (y != 0);

  // Duplicate the last computed row down to the full block height, keeping
  // the running sum consistent.
  const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
  const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
  for (y = luma_height; y < block_height; ++y) {
    StoreLo8(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;
    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Horizontal reduction to a single 32-bit total in the low lane.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_U32(
      final_sum, block_height_log2 + 2 /*log2 of width 4*/);

  averages = _mm_shufflelo_epi16(averages, 0);
  luma_ptr = luma[0];
  y = block_height;
  // Subtract the average from every stored value to produce AC samples.
  do {
    const __m128i samples = LoadLo8(luma_ptr);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);
}
1373
// 4:2:0 luma subsampler implementation for 8-wide chroma blocks (10bpp),
// specialized on |max_luma_width| (8 or 16). Each output value is the sum of
// a 2x2 luma neighborhood scaled by 2 (done inside StoreLumaResults8_420).
// When only 8 luma columns are visible, the right half of each 16-wide luma
// row is synthesized by replicating the last two samples (LastRowSamples).
// Rows past the visible luma area replicate the last computed row, then the
// block average is subtracted from every output.
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  int16_t* luma_ptr = luma[0];
  // Each output row consumes two luma rows.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = luma_height;

  // Process four output rows (eight luma rows) per iteration. Each luma row
  // is 16 samples wide, loaded as a low and a high register half-pair.
  do {
    const __m128i samples_row00 = LoadUnaligned16(src);
    const __m128i samples_row01 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row00);
    src += src_stride;
    const __m128i samples_row10 = LoadUnaligned16(src);
    const __m128i samples_row11 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row10);
    src += src_stride;
    const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
    // Horizontal pairwise add + << 1 yields one 8-wide output row.
    __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row20 = LoadUnaligned16(src);
    const __m128i samples_row21 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row20);
    src += src_stride;
    const __m128i samples_row30 = LoadUnaligned16(src);
    const __m128i samples_row31 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row30);
    src += src_stride;
    const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
    const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row40 = LoadUnaligned16(src);
    const __m128i samples_row41 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row40);
    src += src_stride;
    const __m128i samples_row50 = LoadUnaligned16(src);
    const __m128i samples_row51 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row50);
    src += src_stride;
    const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
    const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row60 = LoadUnaligned16(src);
    const __m128i samples_row61 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row60);
    src += src_stride;
    const __m128i samples_row70 = LoadUnaligned16(src);
    const __m128i samples_row71 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row70);
    src += src_stride;
    const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
    const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    // Widen to 32 bits before accumulating to avoid overflow.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y -= 4;
  } while (y != 0);

  // Duplicate the final row downward to the end after max_luma_height.
  const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
  const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
  const __m128i final_fill_to_sum1 =
      _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
  const __m128i final_fill_to_sum =
      _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
  for (y = luma_height; y < block_height; ++y) {
    StoreUnaligned16(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;
    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Horizontal reduction to a single 32-bit total in the low lane.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_height_log2 + 3 /*log2 of width 8*/);

  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);
  luma_ptr = luma[0];
  y = block_height;
  // Subtract the average from every stored value to produce AC samples.
  do {
    const __m128i samples = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);
}
1485
1486 template <int block_height_log2>
CflSubsampler420_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1487 void CflSubsampler420_8xH_SSE4_1(
1488 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1489 const int max_luma_width, const int max_luma_height,
1490 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1491 if (max_luma_width == 8) {
1492 CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
1493 source, stride);
1494 } else {
1495 CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
1496 luma, max_luma_height, source, stride);
1497 }
1498 }
1499
// 4:2:0 luma subsampler for CfL prediction, for output widths 16 and 32
// (block_width_log2 is 4 or 5). Each output value combines a 2x2 quad of
// input luma samples (via StoreLumaResults8_420), and the block average is
// subtracted at the end so |luma| holds zero-mean values. Input columns and
// rows beyond |max_luma_width| / |max_luma_height| are extended by
// duplicating the last valid samples.
template <int block_width_log2, int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert it to a stride in uint16_t samples.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  const __m128i zero = _mm_setzero_si128();
  // Accumulates the sum of all stored values in four 32-bit lanes; reduced
  // later to compute the block average.
  __m128i final_sum = zero;
  const int block_height = 1 << block_height_log2;
  // Number of output rows backed by real input (two input rows per output
  // row); the remaining rows are filled by duplicating the last one.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int16_t* luma_ptr = luma[0];
  __m128i final_row_result;
  // Begin first y section, covering width up to 32.
  int y = luma_height;

  do {
    const uint16_t* src_next = src + src_stride;
    // Load two adjacent input rows, four 8-sample vectors each. Vectors past
    // |max_luma_width| duplicate the last two valid samples (horizontal edge
    // extension via LastRowSamples).
    const __m128i samples_row00 = LoadUnaligned16(src);
    const __m128i samples_row01 = (max_luma_width >= 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row00);
    const __m128i samples_row02 = (max_luma_width >= 24)
                                      ? LoadUnaligned16(src + 16)
                                      : LastRowSamples(samples_row01);
    const __m128i samples_row03 = (max_luma_width == 32)
                                      ? LoadUnaligned16(src + 24)
                                      : LastRowSamples(samples_row02);
    const __m128i samples_row10 = LoadUnaligned16(src_next);
    const __m128i samples_row11 = (max_luma_width >= 16)
                                      ? LoadUnaligned16(src_next + 8)
                                      : LastRowSamples(samples_row10);
    const __m128i samples_row12 = (max_luma_width >= 24)
                                      ? LoadUnaligned16(src_next + 16)
                                      : LastRowSamples(samples_row11);
    const __m128i samples_row13 = (max_luma_width == 32)
                                      ? LoadUnaligned16(src_next + 24)
                                      : LastRowSamples(samples_row12);
    // Vertical pair sums; StoreLumaResults8_420 completes the horizontal
    // combination and writes 8 outputs per call.
    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
    final_row_result =
        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
    sum = _mm_add_epi16(sum, final_row_result);
    // Widen the 16-bit row total to 32 bits before accumulating.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));

    // Because max_luma_width is at most 32, any values beyond x=16 will
    // necessarily be duplicated.
    if (block_width_log2 == 5) {
      const __m128i wide_fill = LastRowResult(final_row_result);
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      final_sum = _mm_add_epi32(
          final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
    }
    // Advance two input rows per output row (4:2:0 vertical subsampling).
    src += src_stride << 1;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  // Begin second y section.
  y = luma_height;
  if (y < block_height) {
    // Vertical edge extension: replicate the last computed output row into
    // the remaining rows, keeping |final_sum| consistent with the stores.
    const __m128i final_fill0 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
    const __m128i final_fill1 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
    // Only initialized and read when block_width_log2 == 5 (width 32).
    __m128i wide_fill;
    if (block_width_log2 == 5) {
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      wide_fill =
          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
    }
    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
    const __m128i final_fill_to_sum =
        _mm_add_epi32(final_inner_sum0, final_inner_sum1);

    do {
      StoreUnaligned16(luma_ptr, final_fill0);
      StoreUnaligned16(luma_ptr + 8, final_fill1);
      if (block_width_log2 == 5) {
        final_sum = _mm_add_epi32(final_sum, wide_fill);
      }
      luma_ptr += kCflLumaBufferStride;
      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
    } while (++y < block_height);
  }  // End second y section.

  // Horizontal reduction of the four 32-bit partial sums to a single total
  // (result in the low lane).
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  // Rounded average over the width * height outputs.
  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_width_log2 + block_height_log2);
  // Broadcast the 16-bit average into all eight lanes.
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);

  // Second pass: subtract the average from every stored value.
  luma_ptr = luma[0];
  y = block_height;
  do {
    const __m128i samples0 = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
    final_row_result = _mm_sub_epi16(samples1, averages);
    StoreUnaligned16(luma_ptr + 8, final_row_result);

    if (block_width_log2 == 5) {
      // For width 32, columns 16-31 duplicate the last zero-mean value.
      const __m128i wide_fill = LastRowResult(final_row_result);
      StoreUnaligned16(luma_ptr + 16, wide_fill);
      StoreUnaligned16(luma_ptr + 24, wide_fill);
    }
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);
}
1618
1619 template <int block_width_log2, int block_height_log2>
CflSubsampler420_WxH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1620 void CflSubsampler420_WxH_SSE4_1(
1621 int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1622 const int max_luma_width, const int max_luma_height,
1623 const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1624 switch (max_luma_width) {
1625 case 8:
1626 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
1627 luma, max_luma_height, source, stride);
1628 return;
1629 case 16:
1630 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
1631 luma, max_luma_height, source, stride);
1632 return;
1633 case 24:
1634 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
1635 luma, max_luma_height, source, stride);
1636 return;
1637 default:
1638 assert(max_luma_width == 32);
1639 CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
1640 luma, max_luma_height, source, stride);
1641 return;
1642 }
1643 }
1644
// Registers the SSE4.1 implementations of the CfL intra predictors and
// subsamplers in the writable 10-bit dsp table. Each entry is guarded by a
// DSP_ENABLED_10BPP_SSE4_1 check so the C fallback can be kept for testing.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);

  // CfL intra predictors; template arguments are <width, height>.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x4] =
      CflIntraPredictor_10bpp_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x8] =
      CflIntraPredictor_10bpp_SSE4_1<4, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x16] =
      CflIntraPredictor_10bpp_SSE4_1<4, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x4] =
      CflIntraPredictor_10bpp_SSE4_1<8, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x8] =
      CflIntraPredictor_10bpp_SSE4_1<8, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x16] =
      CflIntraPredictor_10bpp_SSE4_1<8, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x32] =
      CflIntraPredictor_10bpp_SSE4_1<8, 32>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x4] =
      CflIntraPredictor_10bpp_SSE4_1<16, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x8] =
      CflIntraPredictor_10bpp_SSE4_1<16, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x16] =
      CflIntraPredictor_10bpp_SSE4_1<16, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x32] =
      CflIntraPredictor_10bpp_SSE4_1<16, 32>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x8] =
      CflIntraPredictor_10bpp_SSE4_1<32, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x16] =
      CflIntraPredictor_10bpp_SSE4_1<32, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x32] =
      CflIntraPredictor_10bpp_SSE4_1<32, 32>;
#endif
  // 4:2:0 subsamplers. The 4xH/8xH variants take <block_height_log2>; the
  // WxH variants take <block_width_log2, block_height_log2>.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 5>;
#endif

  // 4:4:4 subsamplers, same template-argument convention as above.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 5>;
#endif
}
1819
1820 } // namespace
1821 } // namespace high_bitdepth
1822 #endif // LIBGAV1_MAX_BITDEPTH >= 10
1823
// Public entry point: installs the SSE4.1 CfL intra prediction functions for
// 8-bit, and for 10-bit when the build supports it.
void IntraPredCflInit_SSE4_1() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}
1830
1831 } // namespace dsp
1832 } // namespace libgav1
1833
1834 #else // !LIBGAV1_TARGETING_SSE4_1
1835
1836 namespace libgav1 {
1837 namespace dsp {
1838
IntraPredCflInit_SSE4_1()1839 void IntraPredCflInit_SSE4_1() {}
1840
1841 } // namespace dsp
1842 } // namespace libgav1
1843
1844 #endif // LIBGAV1_TARGETING_SSE4_1
1845