xref: /aosp_15_r20/external/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/intrapred_cfl.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 
20 #include <smmintrin.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26 
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/dsp/x86/common_sse4.h"
30 #include "src/utils/common.h"
31 #include "src/utils/compiler_attributes.h"
32 #include "src/utils/constants.h"
33 
34 namespace libgav1 {
35 namespace dsp {
36 namespace {
37 
// This duplicates the last two 16-bit values in |row| across every 32-bit
// lane of the result.
inline __m128i LastRowSamples(const __m128i row) {
  // Broadcasting 32-bit lane 3 replicates the final pair of 16-bit samples.
  return _mm_shuffle_epi32(row, _MM_SHUFFLE(3, 3, 3, 3));
}
42 
// This duplicates the last 16-bit value in |row| across all eight lanes.
inline __m128i LastRowResult(const __m128i row) {
  // First replicate word 7 across the high four words, then broadcast that
  // 32-bit pair over the whole register.
  const __m128i hi_dup = _mm_shufflehi_epi16(row, _MM_SHUFFLE(3, 3, 3, 3));
  return _mm_shuffle_epi32(hi_dup, _MM_SHUFFLE(3, 3, 3, 3));
}
48 
49 // Takes in two sums of input row pairs, and completes the computation for two
50 // output rows.
StoreLumaResults4_420(const __m128i vertical_sum0,const __m128i vertical_sum1,int16_t * luma_ptr)51 inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
52                                      const __m128i vertical_sum1,
53                                      int16_t* luma_ptr) {
54   __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
55   result = _mm_slli_epi16(result, 1);
56   StoreLo8(luma_ptr, result);
57   StoreHi8(luma_ptr + kCflLumaBufferStride, result);
58   return result;
59 }
60 
61 // Takes two halves of a vertically added pair of rows and completes the
62 // computation for one output row.
StoreLumaResults8_420(const __m128i vertical_sum0,const __m128i vertical_sum1,int16_t * luma_ptr)63 inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
64                                      const __m128i vertical_sum1,
65                                      int16_t* luma_ptr) {
66   __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
67   result = _mm_slli_epi16(result, 1);
68   StoreUnaligned16(luma_ptr, result);
69   return result;
70 }
71 
72 }  // namespace
73 
74 namespace low_bitdepth {
75 namespace {
76 
77 //------------------------------------------------------------------------------
78 // CflIntraPredictor_SSE4_1
79 
CflPredictUnclipped(const __m128i * input,__m128i alpha_q12,__m128i alpha_sign,__m128i dc_q0)80 inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
81                                    __m128i alpha_sign, __m128i dc_q0) {
82   const __m128i ac_q3 = LoadUnaligned16(input);
83   const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
84   __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
85   scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
86   return _mm_add_epi16(scaled_luma_q0, dc_q0);
87 }
88 
// Applies CfL prediction to a width x height block, writing 8-bit pixels.
// |luma| holds the zero-mean AC luma values in Q3 precision; |alpha| is the
// signed CfL scaling factor. The pack to bytes saturates the result.
template <int width, int height>
void CflIntraPredictor_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  auto* dst = static_cast<uint8_t*>(dest);
  // Splat |alpha| (used for its sign) and its magnitude shifted to Q12, so
  // mulhrs against the Q3 luma yields the scaled AC term in pixel units.
  const __m128i alpha_sign = _mm_set1_epi16(alpha);
  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  auto* row = reinterpret_cast<const __m128i*>(luma);
  // The luma buffer stride is 2**5 int16 values, i.e. 2**2 __m128i vectors.
  const int kCflLumaBufferStrideLog2_16i = 5;
  const int kCflLumaBufferStrideLog2_128i = kCflLumaBufferStrideLog2_16i - 3;
  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
  // dst[0] is read as the DC prediction value -- presumably pre-filled by a
  // DC predictor pass before this is called; TODO(review): confirm caller.
  const __m128i dc_val = _mm_set1_epi16(dst[0]);
  do {
    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
    if (width < 16) {
      // Widths 4 and 8: one vector covers the row; pack and store 4/8 bytes.
      res = _mm_packus_epi16(res, res);
      if (width == 4) {
        Store4(dst, res);
      } else {
        StoreLo8(dst, res);
      }
    } else {
      // Widths 16 and 32: combine two 8-value vectors per 16 output pixels.
      __m128i next =
          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
      res = _mm_packus_epi16(res, next);
      StoreUnaligned16(dst, res);
      if (width == 32) {
        res = CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
        next = CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
        res = _mm_packus_epi16(res, next);
        StoreUnaligned16(dst + 16, res);
      }
    }
    dst += stride;
  } while ((row += (1 << kCflLumaBufferStrideLog2_128i)) < row_end);
}
126 
// 444 "subsampling" (a copy scaled to Q3) for 4-wide blocks, followed by
// subtraction of the block average. When |is_inside| is false, rows past
// |max_luma_height| are filled by repeating the last visible row.
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  static_assert(block_height_log2 <= 4, "");
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  const auto* src = static_cast<const uint8_t*>(source);
  __m128i sum = _mm_setzero_si128();
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i samples;
  int y = 0;
  do {
    // Pack two 4-pixel source rows into one vector: row y in the low 32
    // bits, row y+1 in the next 32 bits.
    samples = Load4(src);
    src += stride;
    int src_bytes;
    memcpy(&src_bytes, src, 4);
    samples = _mm_insert_epi32(samples, src_bytes, 1);
    src += stride;
    // Widen to 16 bits and scale to the Q3 precision of the luma buffer.
    samples = _mm_slli_epi16(_mm_cvtepu8_epi16(samples), 3);
    StoreLo8(luma_ptr, samples);
    luma_ptr += kCflLumaBufferStride;
    StoreHi8(luma_ptr, samples);
    luma_ptr += kCflLumaBufferStride;

    // The maximum value here is 2**bd * H * 2**shift. Since the maximum H for
    // 4XH is 16 = 2**4, we have 2**(8 + 4 + 3) = 2**15, which fits in 16 bits.
    sum = _mm_add_epi16(sum, samples);
    y += 2;
  } while (y < visible_height);

  if (!is_inside) {
    // Replicate the 2 high lanes.
    samples = _mm_shuffle_epi32(samples, 0xee);
    do {
      // Fill remaining rows with the last visible row, still accumulating.
      StoreLo8(luma_ptr, samples);
      luma_ptr += kCflLumaBufferStride;
      StoreHi8(luma_ptr, samples);
      luma_ptr += kCflLumaBufferStride;
      sum = _mm_add_epi16(sum, samples);
      y += 2;
    } while (y < block_height);
  }

  // Widen the 16-bit partial sums to 32 bits and reduce to a single total.
  __m128i sum_tmp = _mm_unpackhi_epi16(sum, zero);
  sum = _mm_cvtepu16_epi32(sum);
  sum = _mm_add_epi32(sum, sum_tmp);
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  __m128i averages = RightShiftWithRounding_U32(
      sum, block_height_log2 + 2 /* log2 of width 4 */);
  averages = _mm_shufflelo_epi16(averages, 0);
  luma_ptr = luma[0];
  // Subtract the average so the stored luma is the zero-mean AC component.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadLo8(luma_ptr);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
188 
189 template <int block_height_log2>
CflSubsampler444_4xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)190 void CflSubsampler444_4xH_SSE4_1(
191     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
192     const int max_luma_width, const int max_luma_height,
193     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
194   static_assert(block_height_log2 <= 4, "");
195   assert(max_luma_width >= 4);
196   assert(max_luma_height >= 4);
197   static_cast<void>(max_luma_width);
198   constexpr int block_height = 1 << block_height_log2;
199 
200   if (block_height <= max_luma_height) {
201     CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
202                                                          source, stride);
203   } else {
204     CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
205                                                           source, stride);
206   }
207 }
208 
// 444 subsampling for 8-wide blocks. When |inside| is false, columns past
// |max_luma_width| are filled with each row's last visible pixel and rows
// past |max_luma_height| repeat the last visible row.
template <int block_height_log2, bool inside>
void CflSubsampler444_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_width, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  static_assert(block_height_log2 <= 5, "");
  const int block_height = 1 << block_height_log2, block_width = 8;
  const int visible_height = max_luma_height;
  const int invisible_width = inside ? 0 : block_width - max_luma_width;
  const int visible_width = max_luma_width;
  // Byte mask covering the invisible (right) part of a row, used to blend in
  // the replicated border pixel.
  const __m128i blend_mask =
      inside ? _mm_setzero_si128() : MaskHighNBytes(8 + invisible_width);
  // pshufb control that broadcasts the low 16-bit lane across the register.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const auto* src = static_cast<const uint8_t*>(source);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  // Since the maximum height is 32, if we split them by parity, each one only
  // needs to accumulate 16 rows. Just like the calculation done in 4XH, we can
  // store them in 16 bits without casting to 32 bits.
  __m128i sum_even = _mm_setzero_si128(), sum_odd = _mm_setzero_si128();
  __m128i sum;
  __m128i samples1;

  int y = 0;
  do {
    __m128i samples0 = LoadLo8(src);
    if (!inside) {
      // Replace out-of-range pixels with the row's last visible pixel.
      const __m128i border0 =
          _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
      samples0 = _mm_blendv_epi8(samples0, border0, blend_mask);
    }
    src += stride;
    // Widen to 16 bits and scale to the Q3 precision of the luma buffer.
    samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples0), 3);
    StoreUnaligned16(luma_ptr, samples0);
    luma_ptr += kCflLumaBufferStride;

    sum_even = _mm_add_epi16(sum_even, samples0);

    samples1 = LoadLo8(src);
    if (!inside) {
      const __m128i border1 =
          _mm_set1_epi8(static_cast<int8_t>(src[visible_width - 1]));
      samples1 = _mm_blendv_epi8(samples1, border1, blend_mask);
    }
    src += stride;
    samples1 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples1), 3);
    StoreUnaligned16(luma_ptr, samples1);
    luma_ptr += kCflLumaBufferStride;

    sum_odd = _mm_add_epi16(sum_odd, samples1);
    y += 2;
  } while (y < visible_height);

  if (!inside) {
    // Vertical fill: repeat the last visible row (samples1) to block_height,
    // keeping both parity accumulators in step.
    for (int y = visible_height; y < block_height; y += 2) {
      sum_even = _mm_add_epi16(sum_even, samples1);
      StoreUnaligned16(luma_ptr, samples1);
      luma_ptr += kCflLumaBufferStride;

      sum_odd = _mm_add_epi16(sum_odd, samples1);
      StoreUnaligned16(luma_ptr, samples1);
      luma_ptr += kCflLumaBufferStride;
    }
  }

  // Widen both 16-bit accumulators to 32 bits and reduce to one total.
  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum_even, zero),
                      _mm_cvtepu16_epi32(sum_even));
  sum = _mm_add_epi32(sum, _mm_unpackhi_epi16(sum_odd, zero));
  sum = _mm_add_epi32(sum, _mm_cvtepu16_epi32(sum_odd));

  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  __m128i averages = RightShiftWithRounding_U32(
      sum, block_height_log2 + 3 /* log2 of width 8 */);
  averages = _mm_shuffle_epi8(averages, dup16);
  luma_ptr = luma[0];
  // Subtract the average so the stored luma is the zero-mean AC component.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
291 
292 template <int block_height_log2>
CflSubsampler444_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)293 void CflSubsampler444_8xH_SSE4_1(
294     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
295     const int max_luma_width, const int max_luma_height,
296     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
297   static_assert(block_height_log2 <= 5, "");
298   assert(max_luma_width >= 4);
299   assert(max_luma_height >= 4);
300   const int block_height = 1 << block_height_log2;
301   const int block_width = 8;
302 
303   const int horz_inside = block_width <= max_luma_width;
304   const int vert_inside = block_height <= max_luma_height;
305   if (horz_inside && vert_inside) {
306     CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(
307         luma, max_luma_width, max_luma_height, source, stride);
308   } else {
309     CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(
310         luma, max_luma_width, max_luma_height, source, stride);
311   }
312 }
313 
// This function will only work for block_width 16 and 32.
// 444 subsampling with horizontal border replication handled in two 16-pixel
// halves (each with its own visible width and blend mask) and vertical fill
// by repeating the last visible row.
template <int block_width_log2, int block_height_log2, bool inside>
void CflSubsampler444_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_width, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
  static_assert(block_height_log2 <= 5, "");
  assert(max_luma_width >= 4);
  assert(max_luma_height >= 4);
  const int block_height = 1 << block_height_log2;
  const int block_width = 1 << block_width_log2;

  const int visible_height = max_luma_height;
  // Visible/invisible pixel counts for columns 0-15 and, for width 32,
  // columns 16-31, with byte masks covering each invisible tail.
  const int visible_width_16 = inside ? 16 : std::min(16, max_luma_width);
  const int invisible_width_16 = 16 - visible_width_16;
  const __m128i blend_mask_16 = MaskHighNBytes(invisible_width_16);
  const int visible_width_32 = inside ? 32 : max_luma_width;
  const int invisible_width_32 = 32 - visible_width_32;
  const __m128i blend_mask_32 =
      MaskHighNBytes(std::min(16, invisible_width_32));

  // pshufb control that broadcasts the low 16-bit lane across the register.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i zero = _mm_setzero_si128();
  const auto* src = static_cast<const uint8_t*>(source);
  int16_t* luma_ptr = luma[0];
  __m128i sum = _mm_setzero_si128();

  __m128i samples0, samples1;
  __m128i samples2, samples3;
  __m128i inner_sum_lo, inner_sum_hi;
  int y = 0;
  do {
    // We can load uninitialized values here. Even though they are then masked
    // off by blendv, MSAN doesn't model that behavior.
    __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);

    if (!inside) {
      // Replace out-of-range pixels with the last visible pixel of the row.
      const __m128i border16 =
          _mm_set1_epi8(static_cast<int8_t>(src[visible_width_16 - 1]));
      samples01 = _mm_blendv_epi8(samples01, border16, blend_mask_16);
    }
    // Widen both byte halves to 16 bits and scale to Q3 precision.
    samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
    samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);

    StoreUnaligned16(luma_ptr, samples0);
    StoreUnaligned16(luma_ptr + 8, samples1);
    __m128i inner_sum = _mm_add_epi16(samples0, samples1);

    if (block_width == 32) {
      // We can load uninitialized values here. Even though they are then masked
      // off by blendv, MSAN doesn't model that behavior.
      __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
      if (!inside) {
        const __m128i border32 =
            _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
        samples23 = _mm_blendv_epi8(samples23, border32, blend_mask_32);
      }
      samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
      samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);

      StoreUnaligned16(luma_ptr + 16, samples2);
      StoreUnaligned16(luma_ptr + 24, samples3);
      inner_sum = _mm_add_epi16(samples2, inner_sum);
      inner_sum = _mm_add_epi16(samples3, inner_sum);
    }

    // Accumulate this row's sum in 32 bits.
    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
    sum = _mm_add_epi32(sum, inner_sum_lo);
    sum = _mm_add_epi32(sum, inner_sum_hi);
    luma_ptr += kCflLumaBufferStride;
    src += stride;
  } while (++y < visible_height);

  if (!inside) {
    // Vertical fill: repeat the last visible row (still held in samples0..3
    // and inner_sum_lo/hi) down to block_height, updating the sum.
    for (int y = visible_height; y < block_height;
         luma_ptr += kCflLumaBufferStride, ++y) {
      sum = _mm_add_epi32(sum, inner_sum_lo);
      StoreUnaligned16(luma_ptr, samples0);
      sum = _mm_add_epi32(sum, inner_sum_hi);
      StoreUnaligned16(luma_ptr + 8, samples1);
      if (block_width == 32) {
        StoreUnaligned16(luma_ptr + 16, samples2);
        StoreUnaligned16(luma_ptr + 24, samples3);
      }
    }
  }

  // Reduce the four 32-bit partial sums to one and compute the average.
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  __m128i averages =
      RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2);
  averages = _mm_shuffle_epi8(averages, dup16);
  luma_ptr = luma[0];
  // Subtract the average so the stored luma is the zero-mean AC component.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    for (int x = 0; x < block_width; x += 8) {
      __m128i samples = LoadUnaligned16(&luma_ptr[x]);
      StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples, averages));
    }
  }
}
417 
418 template <int block_width_log2, int block_height_log2>
CflSubsampler444_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)419 void CflSubsampler444_SSE4_1(
420     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
421     const int max_luma_width, const int max_luma_height,
422     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
423   static_assert(block_width_log2 == 4 || block_width_log2 == 5, "");
424   static_assert(block_height_log2 <= 5, "");
425   assert(max_luma_width >= 4);
426   assert(max_luma_height >= 4);
427 
428   const int block_height = 1 << block_height_log2;
429   const int block_width = 1 << block_width_log2;
430   const int horz_inside = block_width <= max_luma_width;
431   const int vert_inside = block_height <= max_luma_height;
432   if (horz_inside && vert_inside) {
433     CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, true>(
434         luma, max_luma_width, max_luma_height, source, stride);
435   } else {
436     CflSubsampler444_SSE4_1<block_width_log2, block_height_log2, false>(
437         luma, max_luma_width, max_luma_height, source, stride);
438   }
439 }
440 
// 420 subsampling for 4-wide blocks: each output value is a doubled 2x2 luma
// sum (computed by StoreLumaResults4_420), with the block average subtracted
// at the end. Rows past the visible area repeat the last computed row.
template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint8_t*>(source);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  // Two source rows produce one output row.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = 0;
  do {
    // Each iteration consumes 8 source rows and produces 4 output rows.
    // Note that double sampling and converting to 16bit makes a row fill the
    // vector.
    const __m128i samples_row0 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row1 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);

    const __m128i samples_row2 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row3 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
    // Completes the horizontal add and writes two output rows.
    __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
    luma_ptr += kCflLumaBufferStride << 1;

    const __m128i samples_row4 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row5 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);

    const __m128i samples_row6 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i samples_row7 = _mm_cvtepu8_epi16(LoadLo8(src));
    src += stride;
    const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
    sum = _mm_add_epi16(
        sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
    luma_ptr += kCflLumaBufferStride << 1;

    // Widen the four-row partial sum to 32 bits and accumulate.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y += 4;
  } while (y < luma_height);
  // Duplicate the last computed output row into the remaining rows.
  const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
  const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
  for (; y < block_height; ++y) {
    StoreLo8(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;

    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Reduce the 32-bit partial sums to one total and compute the average.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_U32(
      final_sum, block_height_log2 + 2 /*log2 of width 4*/);

  averages = _mm_shufflelo_epi16(averages, 0);
  luma_ptr = luma[0];
  // Subtract the average so the stored luma is the zero-mean AC component.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadLo8(luma_ptr);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
510 
// 420 subsampling for 8-wide blocks. The template parameter |max_luma_width|
// (8 or 16) decides at compile time whether the right half of each 16-pixel
// source row is loaded or synthesized by repeating the last visible pair via
// LastRowSamples.
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint8_t*>(source);
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  int16_t* luma_ptr = luma[0];
  // Two source rows produce one output row.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = 0;

  do {
    // Each iteration consumes 8 source rows and writes 4 output rows.
    const __m128i samples_row00 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row01 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row00);
    src += stride;
    const __m128i samples_row10 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row11 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row10);
    src += stride;
    const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
    // Completes the horizontal add and writes one output row.
    __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row20 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row21 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row20);
    src += stride;
    const __m128i samples_row30 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row31 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row30);
    src += stride;
    const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
    const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row40 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row41 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row40);
    src += stride;
    const __m128i samples_row50 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row51 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row50);
    src += stride;
    const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
    const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row60 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row61 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row60);
    src += stride;
    const __m128i samples_row70 = _mm_cvtepu8_epi16(LoadLo8(src));
    const __m128i samples_row71 = (max_luma_width == 16)
                                      ? _mm_cvtepu8_epi16(LoadLo8(src + 8))
                                      : LastRowSamples(samples_row70);
    src += stride;
    const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
    const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    // Widen the four-row partial sum to 32 bits and accumulate.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y += 4;
  } while (y < luma_height);
  // Duplicate the final row downward to the end after max_luma_height.
  const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
  const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
  const __m128i final_fill_to_sum1 =
      _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
  const __m128i final_fill_to_sum =
      _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
  for (; y < block_height; ++y) {
    StoreUnaligned16(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;

    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Reduce the 32-bit partial sums to one total and compute the average.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_height_log2 + 3 /*log2 of width 8*/);

  // Broadcast the 16-bit average to all eight lanes.
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);
  luma_ptr = luma[0];
  // Subtract the average so the stored luma is the zero-mean AC component.
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
  }
}
619 
620 template <int block_height_log2>
CflSubsampler420_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)621 void CflSubsampler420_8xH_SSE4_1(
622     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
623     const int max_luma_width, const int max_luma_height,
624     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
625   if (max_luma_width == 8) {
626     CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(
627         luma, max_luma_width, max_luma_height, source, stride);
628   } else {
629     CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
630         luma, max_luma_width, max_luma_height, source, stride);
631   }
632 }
633 
// 8bpp 4:2:0 luma subsampler for 16- and 32-wide blocks. Each stored value is
// the sum of a 2x2 cluster of source pixels; the mean of all stored values is
// then subtracted so |luma| holds the zero-mean AC contribution used by the
// CfL predictor. |max_luma_width| is a template parameter (8, 16, 24 or 32)
// so replication of samples beyond the visible width resolves at compile
// time.
template <int block_width_log2, int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const auto* src = static_cast<const uint8_t*>(source);
  const __m128i zero = _mm_setzero_si128();
  // Running 32-bit total of every stored luma value, reduced to the block
  // average at the end.
  __m128i final_sum = zero;
  const int block_height = 1 << block_height_log2;
  // Each output row consumes two source rows, so at most half of
  // |max_luma_height| rows can be computed from real samples.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  static_assert(max_luma_width <= 32, "");

  int16_t* luma_ptr = luma[0];
  __m128i final_row_result;
  // Begin first y section, covering width up to 32.
  int y = 0;
  do {
    const uint8_t* src_next = src + stride;
    // Widen 32 pixels of each of two source rows to 16 bits. Columns at or
    // beyond |max_luma_width| repeat the last visible sample pair.
    const __m128i samples_row0_lo = LoadUnaligned16(src);
    const __m128i samples_row00 = _mm_cvtepu8_epi16(samples_row0_lo);
    const __m128i samples_row01 = (max_luma_width >= 16)
                                      ? _mm_unpackhi_epi8(samples_row0_lo, zero)
                                      : LastRowSamples(samples_row00);
    const __m128i samples_row0_hi = LoadUnaligned16(src + 16);
    const __m128i samples_row02 = (max_luma_width >= 24)
                                      ? _mm_cvtepu8_epi16(samples_row0_hi)
                                      : LastRowSamples(samples_row01);
    const __m128i samples_row03 = (max_luma_width == 32)
                                      ? _mm_unpackhi_epi8(samples_row0_hi, zero)
                                      : LastRowSamples(samples_row02);
    const __m128i samples_row1_lo = LoadUnaligned16(src_next);
    const __m128i samples_row10 = _mm_cvtepu8_epi16(samples_row1_lo);
    const __m128i samples_row11 = (max_luma_width >= 16)
                                      ? _mm_unpackhi_epi8(samples_row1_lo, zero)
                                      : LastRowSamples(samples_row10);
    const __m128i samples_row1_hi = LoadUnaligned16(src_next + 16);
    const __m128i samples_row12 = (max_luma_width >= 24)
                                      ? _mm_cvtepu8_epi16(samples_row1_hi)
                                      : LastRowSamples(samples_row11);
    const __m128i samples_row13 = (max_luma_width == 32)
                                      ? _mm_unpackhi_epi8(samples_row1_hi, zero)
                                      : LastRowSamples(samples_row12);
    // Vertical pair sums; StoreLumaResults8_420 completes the 2x2 sums and
    // stores one output row (8 values per call), returning the stored values
    // so they can be accumulated.
    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
    final_row_result =
        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
    sum = _mm_add_epi16(sum, final_row_result);
    if (block_width_log2 == 5) {
      // For 32-wide blocks, output columns 16..31 replicate the last computed
      // value; adding |wide_fill| twice accounts for those 16 extra values.
      const __m128i wide_fill = LastRowResult(final_row_result);
      sum = _mm_add_epi16(sum, wide_fill);
      sum = _mm_add_epi16(sum, wide_fill);
    }
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    src += stride << 1;
    luma_ptr += kCflLumaBufferStride;
  } while (++y < luma_height);

  // Begin second y section. Rows below the visible area repeat the last
  // computed row, and their contribution is added to |final_sum|.
  if (y < block_height) {
    const __m128i final_fill0 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
    const __m128i final_fill1 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
    __m128i wide_fill;

    if (block_width_log2 == 5) {
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      wide_fill =
          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
    }

    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
    const __m128i final_fill_to_sum =
        _mm_add_epi32(final_inner_sum0, final_inner_sum1);

    do {
      StoreUnaligned16(luma_ptr, final_fill0);
      StoreUnaligned16(luma_ptr + 8, final_fill1);
      if (block_width_log2 == 5) {
        final_sum = _mm_add_epi32(final_sum, wide_fill);
      }
      luma_ptr += kCflLumaBufferStride;

      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
    } while (++y < block_height);
  }  // End second y section.

  // Horizontal reduction of |final_sum| to a single 32-bit total.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  // Divide by the block area (with rounding) and broadcast the average to
  // every 16-bit lane.
  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_width_log2 + block_height_log2);
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);

  // Subtract the average from every stored value. For 32-wide blocks the
  // replicated columns 16..31 are written here as well.
  luma_ptr = luma[0];
  for (int y = 0; y < block_height; ++y, luma_ptr += kCflLumaBufferStride) {
    const __m128i samples0 = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
    final_row_result = _mm_sub_epi16(samples1, averages);
    StoreUnaligned16(luma_ptr + 8, final_row_result);
    if (block_width_log2 == 5) {
      const __m128i wide_fill = LastRowResult(final_row_result);
      StoreUnaligned16(luma_ptr + 16, wide_fill);
      StoreUnaligned16(luma_ptr + 24, wide_fill);
    }
  }
}
750 
751 template <int block_width_log2, int block_height_log2>
CflSubsampler420_WxH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)752 void CflSubsampler420_WxH_SSE4_1(
753     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
754     const int max_luma_width, const int max_luma_height,
755     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
756   switch (max_luma_width) {
757     case 8:
758       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
759           luma, max_luma_width, max_luma_height, source, stride);
760       return;
761     case 16:
762       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
763           luma, max_luma_width, max_luma_height, source, stride);
764       return;
765     case 24:
766       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
767           luma, max_luma_width, max_luma_height, source, stride);
768       return;
769     default:
770       assert(max_luma_width == 32);
771       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
772           luma, max_luma_width, max_luma_height, source, stride);
773       return;
774   }
775 }
776 
// Installs the SSE4.1 CfL subsamplers and intra predictors into the 8bpp dsp
// table. Each assignment is guarded by DSP_ENABLED_8BPP_SSE4_1 so an entry is
// only written when this SSE4.1 variant is the selected implementation for
// that transform size.
void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  // 4:2:0 chroma-from-luma subsamplers.
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 5>;
#endif

  // 4:4:4 chroma-from-luma subsamplers.
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
      CflSubsampler444_SSE4_1<5, 5>;
#endif
  // CfL intra predictors.
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor_SSE4_1<4, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x16] =
      CflIntraPredictor_SSE4_1<4, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor_SSE4_1<8, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor_SSE4_1<8, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x16] =
      CflIntraPredictor_SSE4_1<8, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x32] =
      CflIntraPredictor_SSE4_1<8, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x4] =
      CflIntraPredictor_SSE4_1<16, 4>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x8] =
      CflIntraPredictor_SSE4_1<16, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x16] =
      CflIntraPredictor_SSE4_1<16, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x32] =
      CflIntraPredictor_SSE4_1<16, 32>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x8] =
      CflIntraPredictor_SSE4_1<32, 8>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x16] =
      CflIntraPredictor_SSE4_1<32, 16>;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x32] =
      CflIntraPredictor_SSE4_1<32, 32>;
#endif
}
946 
947 }  // namespace
948 }  // namespace low_bitdepth
949 
950 #if LIBGAV1_MAX_BITDEPTH >= 10
951 namespace high_bitdepth {
952 namespace {
953 
954 //------------------------------------------------------------------------------
955 // CflIntraPredictor_10bpp_SSE4_1
956 
CflPredictUnclipped(const __m128i * input,__m128i alpha_q12,__m128i alpha_sign,__m128i dc_q0)957 inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
958                                    __m128i alpha_sign, __m128i dc_q0) {
959   const __m128i ac_q3 = LoadUnaligned16(input);
960   const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
961   __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
962   scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
963   return _mm_add_epi16(scaled_luma_q0, dc_q0);
964 }
965 
// Clamps every signed 16-bit lane of |x| to the inclusive range [min, max].
inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
  const __m128i bounded_above = _mm_min_epi16(x, max);
  return _mm_max_epi16(bounded_above, min);
}
969 
// 10bpp CfL intra predictor: for each pixel writes
// Clip(dc + alpha * luma_ac) to |dest|, clamped to [0, 1023].
// NOTE(review): the DC value is read from dst[0], so the caller is expected
// to have pre-filled |dest| with the DC prediction — confirm against callers.
template <int width, int height>
void CflIntraPredictor_10bpp_SSE4_1(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int alpha) {
  // Stride of |luma| expressed as log2, first in int16_t units (32 values)
  // and then in __m128i units (4 registers per row).
  constexpr int kCflLumaBufferStrideLog2_16i = 5;
  constexpr int kCflLumaBufferStrideLog2_128i =
      kCflLumaBufferStrideLog2_16i - 3;
  constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
  auto* dst = static_cast<uint16_t*>(dest);
  // |alpha_sign| carries alpha's sign in every lane; |alpha_q12| holds
  // |alpha| << 9 so that _mm_mulhrs_epi16 against the q3 AC values yields the
  // scaled-luma term (see CflPredictUnclipped).
  const __m128i alpha_sign = _mm_set1_epi16(alpha);
  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  // Walk the luma buffer as rows of __m128i registers.
  auto* row = reinterpret_cast<const __m128i*>(luma);
  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
  const __m128i dc_val = _mm_set1_epi16(dst[0]);
  // Clamp results to the valid 10-bit pixel range [0, 1023].
  const __m128i min = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);

  // Convert the byte stride to uint16_t units.
  stride >>= 1;

  do {
    // Each __m128i covers 8 pixels; wider blocks consume additional
    // registers from the same luma row.
    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
    res = ClipEpi16(res, min, max);
    if (width == 4) {
      StoreLo8(dst, res);
    } else if (width == 8) {
      StoreUnaligned16(dst, res);
    } else if (width == 16) {
      StoreUnaligned16(dst, res);
      const __m128i res_1 =
          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
    } else {
      StoreUnaligned16(dst, res);
      const __m128i res_1 =
          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
      const __m128i res_2 =
          CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
      const __m128i res_3 =
          CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
      StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
    }

    dst += stride;
  } while ((row += kRowIncr) < row_end);
}
1018 
// 10bpp 4:4:4 "subsampler" for 4-wide blocks: copies the source into |luma|
// scaled up by 8 (<< 3) and subtracts the block average. When |is_inside| is
// false the block extends below the visible luma area and the last visible
// row is replicated downward.
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  static_assert(block_height_log2 <= 4, "");
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  __m128i zero = _mm_setzero_si128();
  __m128i sum = zero;
  __m128i samples;
  int y = visible_height;

  // Sum two 4-sample rows per iteration (one row per 64-bit half).
  do {
    samples = LoadHi8(LoadLo8(src), src + src_stride);
    src += src_stride << 1;
    sum = _mm_add_epi16(sum, samples);
    y -= 2;
  } while (y != 0);

  if (!is_inside) {
    // Account for the rows below the visible area, which replicate the last
    // visible row (held in the upper half of |samples|).
    y = visible_height;
    samples = _mm_unpackhi_epi64(samples, samples);
    do {
      sum = _mm_add_epi16(sum, samples);
      y += 2;
    } while (y < block_height);
  }

  // Horizontal reduction of |sum| to a single 32-bit total.
  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  // Here the left shift by 3 (to increase precision) is nullified in right
  // shift ((log2 of width 4) + 1).
  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
  averages = _mm_shufflelo_epi16(averages, 0);
  src = static_cast<const uint16_t*>(source);
  luma_ptr = luma[0];
  y = visible_height;
  // Store (sample << 3) - average for each visible row.
  do {
    samples = LoadLo8(src);
    samples = _mm_slli_epi16(samples, 3);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
    src += src_stride;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // Replicate last line
    do {
      StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
      luma_ptr += kCflLumaBufferStride;
    } while (++y < block_height);
  }
}
1079 
1080 template <int block_height_log2>
CflSubsampler444_4xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1081 void CflSubsampler444_4xH_SSE4_1(
1082     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1083     const int max_luma_width, const int max_luma_height,
1084     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1085   static_cast<void>(max_luma_width);
1086   static_cast<void>(max_luma_height);
1087   static_assert(block_height_log2 <= 4, "");
1088   assert(max_luma_width >= 4);
1089   assert(max_luma_height >= 4);
1090   const int block_height = 1 << block_height_log2;
1091 
1092   if (block_height <= max_luma_height) {
1093     CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
1094                                                          source, stride);
1095   } else {
1096     CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
1097                                                           source, stride);
1098   }
1099 }
1100 
// 10bpp 4:4:4 "subsampler" for 8-wide blocks: copies the source into |luma|
// scaled up by 8 (<< 3) and subtracts the block average. When |is_inside| is
// false the last visible row is replicated to fill the block.
template <int block_height_log2, bool is_inside>
void CflSubsampler444_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  // Shuffle mask that broadcasts the low 16-bit lane to all eight lanes.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i sum = zero;
  __m128i samples;
  int y = visible_height;

  // Sum one 8-sample row per iteration.
  do {
    samples = LoadUnaligned16(src);
    src += src_stride;
    sum = _mm_add_epi16(sum, samples);
  } while (--y != 0);

  if (!is_inside) {
    // Rows below the visible area contribute copies of the last visible row,
    // still held in |samples|.
    y = visible_height;
    do {
      sum = _mm_add_epi16(sum, samples);
    } while (++y < block_height);
  }

  // Horizontal reduction of |sum| to a single 32-bit total.
  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  // Here the left shift by 3 (to increase precision) is nullified in right
  // shift (log2 of width 8).
  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
  averages = _mm_shuffle_epi8(averages, dup16);

  src = static_cast<const uint16_t*>(source);
  luma_ptr = luma[0];
  y = visible_height;
  // Store (sample << 3) - average for each visible row.
  do {
    samples = LoadUnaligned16(src);
    samples = _mm_slli_epi16(samples, 3);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
    src += src_stride;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // Replicate last line
    do {
      StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
      luma_ptr += kCflLumaBufferStride;
    } while (++y < block_height);
  }
}
1159 
1160 template <int block_height_log2>
CflSubsampler444_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1161 void CflSubsampler444_8xH_SSE4_1(
1162     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1163     const int max_luma_width, const int max_luma_height,
1164     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1165   static_cast<void>(max_luma_width);
1166   static_cast<void>(max_luma_height);
1167   static_assert(block_height_log2 <= 5, "");
1168   assert(max_luma_width >= 4);
1169   assert(max_luma_height >= 4);
1170   const int block_height = 1 << block_height_log2;
1171   const int block_width = 8;
1172 
1173   const int horz_inside = block_width <= max_luma_width;
1174   const int vert_inside = block_height <= max_luma_height;
1175   if (horz_inside && vert_inside) {
1176     CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
1177                                                          source, stride);
1178   } else {
1179     CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
1180                                                           source, stride);
1181   }
1182 }
1183 
// 10bpp 4:4:4 "subsampler" for 16- and 32-wide blocks: copies the source into
// |luma| scaled up by 8 (<< 3) and subtracts the block average. Columns
// beyond |max_luma_width| replicate the last visible value; when |is_inside|
// is false, rows below the visible area replicate the last visible row.
template <int block_width_log2, int block_height_log2, bool is_inside>
void CflSubsampler444_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_width, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const int visible_height = max_luma_height;
  const int block_width = 1 << block_width_log2;
  // Shuffle mask that broadcasts the low 16-bit lane to all eight lanes.
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i zero = _mm_setzero_si128();
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  __m128i sum = zero;
  __m128i inner_sum_lo, inner_sum_hi;
  __m128i samples[4];
  int y = visible_height;

  // First pass: accumulate the sum of all (horizontally replicated) samples.
  do {
    samples[0] = LoadUnaligned16(src);
    samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
                                        : LastRowResult(samples[0]);
    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
    if (block_width == 32) {
      samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
                                          : LastRowResult(samples[1]);
      samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
                                          : LastRowResult(samples[2]);

      inner_sum = _mm_add_epi16(samples[2], inner_sum);
      inner_sum = _mm_add_epi16(samples[3], inner_sum);
    }
    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
    sum = _mm_add_epi32(sum, inner_sum_lo);
    sum = _mm_add_epi32(sum, inner_sum_hi);
    src += src_stride;
  } while (--y != 0);

  if (!is_inside) {
    // Rows below the visible area contribute copies of the last visible row,
    // still held in |samples|.
    y = visible_height;
    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
    if (block_width == 32) {
      inner_sum = _mm_add_epi16(samples[2], inner_sum);
      inner_sum = _mm_add_epi16(samples[3], inner_sum);
    }
    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
    do {
      sum = _mm_add_epi32(sum, inner_sum_lo);
      sum = _mm_add_epi32(sum, inner_sum_hi);
    } while (++y < block_height);
  }

  // Horizontal reduction of |sum| to a single 32-bit total.
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));

  // Here the left shift by 3 (to increase precision) is subtracted in right
  // shift factor (block_width_log2 + block_height_log2 - 3).
  __m128i averages =
      RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
  averages = _mm_shuffle_epi8(averages, dup16);

  // Second pass: store (sample << 3) - average. |samples_ext| keeps the last
  // real (already shifted) vector so columns past |max_luma_width| can be
  // filled with its replicated tail.
  src = static_cast<const uint16_t*>(source);
  __m128i samples_ext = zero;
  luma_ptr = luma[0];
  y = visible_height;
  do {
    int idx = 0;
    for (int x = 0; x < block_width; x += 8) {
      if (max_luma_width > x) {
        samples[idx] = LoadUnaligned16(&src[x]);
        samples[idx] = _mm_slli_epi16(samples[idx], 3);
        samples_ext = samples[idx];
      } else {
        samples[idx] = LastRowResult(samples_ext);
      }
      StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
    }
    src += src_stride;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  if (!is_inside) {
    y = visible_height;
    // Replicate last line
    do {
      int idx = 0;
      for (int x = 0; x < block_width; x += 8) {
        StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
      }
      luma_ptr += kCflLumaBufferStride;
    } while (++y < block_height);
  }
}
1279 
1280 template <int block_width_log2, int block_height_log2>
CflSubsampler444_WxH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1281 void CflSubsampler444_WxH_SSE4_1(
1282     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1283     const int max_luma_width, const int max_luma_height,
1284     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1285   static_assert(block_width_log2 == 4 || block_width_log2 == 5,
1286                 "This function will only work for block_width 16 and 32.");
1287   static_assert(block_height_log2 <= 5, "");
1288   assert(max_luma_width >= 4);
1289   assert(max_luma_height >= 4);
1290 
1291   const int block_height = 1 << block_height_log2;
1292   const int vert_inside = block_height <= max_luma_height;
1293   if (vert_inside) {
1294     CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
1295         luma, max_luma_width, max_luma_height, source, stride);
1296   } else {
1297     CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
1298         luma, max_luma_width, max_luma_height, source, stride);
1299   }
1300 }
1301 
// 10bpp 4:2:0 luma subsampler for 4-wide blocks. Each stored value is the sum
// of a 2x2 cluster of source pixels; the mean of all stored values is then
// subtracted. Rows below the visible area repeat the last computed row.
template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int /*max_luma_width*/, const int max_luma_height,
    const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to uint16_t units.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  int16_t* luma_ptr = luma[0];
  const __m128i zero = _mm_setzero_si128();
  __m128i final_sum = zero;
  // Each output row consumes two source rows, so at most half of
  // |max_luma_height| rows can be computed from real samples.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = luma_height;

  // Process eight source rows (four output rows) per iteration.
  do {
    // Source rows 0-3: StoreLumaResults4_420 completes the computation for
    // two output rows from the two vertical pair sums.
    const __m128i samples_row0 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row1 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);

    const __m128i samples_row2 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row3 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
    __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
    luma_ptr += kCflLumaBufferStride << 1;

    // Source rows 4-7.
    const __m128i samples_row4 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row5 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);

    const __m128i samples_row6 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i samples_row7 = LoadUnaligned16(src);
    src += src_stride;
    const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
    sum = _mm_add_epi16(
        sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
    luma_ptr += kCflLumaBufferStride << 1;

    // Widen to 32 bits and accumulate toward the block average.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y -= 4;
  } while (y != 0);

  // Fill rows below the visible area with the last computed row, adding its
  // contribution to the running sum.
  const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
  const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
  for (y = luma_height; y < block_height; ++y) {
    StoreLo8(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;
    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Horizontal reduction of |final_sum| to a single 32-bit total.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  // Divide by the block area (with rounding) to get the average.
  __m128i averages = RightShiftWithRounding_U32(
      final_sum, block_height_log2 + 2 /*log2 of width 4*/);

  averages = _mm_shufflelo_epi16(averages, 0);
  // Subtract the average from every stored value.
  luma_ptr = luma[0];
  y = block_height;
  do {
    const __m128i samples = LoadLo8(luma_ptr);
    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);
}
1373 
// 4:2:0 luma subsampler for 8-wide blocks, 10 bpp. Fills |luma| with the
// subsampled luma samples and then removes their rounded mean, leaving a
// zero-mean buffer for the CfL predictor.
//
// |block_height_log2|: log2 of the number of output (chroma) rows.
// |max_luma_width|: 8 or 16 — the number of valid luma source samples per
//     row. When 8, the upper half of each 16-sample row is synthesized by
//     duplicating the last two valid samples (LastRowSamples).
// |max_luma_height|: number of valid luma source rows; output rows past
//     max_luma_height >> 1 are filled by repeating the last computed row.
// |source|/|stride|: luma plane pointer and its stride in bytes.
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  const int block_height = 1 << block_height_log2;
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to a uint16_t element stride.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  const __m128i zero = _mm_setzero_si128();
  // Running 32-bit sum of every output sample; later becomes the average.
  __m128i final_sum = zero;
  int16_t* luma_ptr = luma[0];
  // Vertical 2:1 subsampling: each output row consumes two source rows.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int y = luma_height;

  // Main loop: four output rows (eight source rows) per iteration.
  // |luma_height| is assumed to be a multiple of 4 here (block heights are
  // powers of two >= 4 — TODO confirm for clipped max_luma_height).
  do {
    const __m128i samples_row00 = LoadUnaligned16(src);
    const __m128i samples_row01 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row00);
    src += src_stride;
    const __m128i samples_row10 = LoadUnaligned16(src);
    const __m128i samples_row11 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row10);
    src += src_stride;
    // Vertical pair sums; StoreLumaResults8_420 finishes the horizontal
    // combine and writes one 8-entry output row.
    const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
    __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row20 = LoadUnaligned16(src);
    const __m128i samples_row21 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row20);
    src += src_stride;
    const __m128i samples_row30 = LoadUnaligned16(src);
    const __m128i samples_row31 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row30);
    src += src_stride;
    const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
    const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row40 = LoadUnaligned16(src);
    const __m128i samples_row41 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row40);
    src += src_stride;
    const __m128i samples_row50 = LoadUnaligned16(src);
    const __m128i samples_row51 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row50);
    src += src_stride;
    const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
    const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    const __m128i samples_row60 = LoadUnaligned16(src);
    const __m128i samples_row61 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row60);
    src += src_stride;
    const __m128i samples_row70 = LoadUnaligned16(src);
    const __m128i samples_row71 = (max_luma_width == 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row70);
    src += src_stride;
    const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
    const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
    sum = _mm_add_epi16(
        sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
    luma_ptr += kCflLumaBufferStride;

    // Widen the accumulated 16-bit row sums to 32 bits (low and high halves)
    // and add them into the running total.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
    y -= 4;
  } while (y != 0);

  // Duplicate the final row downward to the end after max_luma_height.
  const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
  // The stored sums are non-negative and well inside the positive int16
  // range for 10-bit input, so this signed widen is equivalent to the
  // unsigned widen used above.
  const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
  const __m128i final_fill_to_sum1 =
      _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
  const __m128i final_fill_to_sum =
      _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
  for (y = luma_height; y < block_height; ++y) {
    StoreUnaligned16(luma_ptr, final_fill);
    luma_ptr += kCflLumaBufferStride;
    // The duplicated row also contributes to the average.
    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
  }
  // Horizontal reduction of the four 32-bit lanes into lane 0.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  // Rounded divide by the sample count (8 * block_height).
  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_height_log2 + 3 /*log2 of width 8*/);

  // Broadcast the 16-bit average to all eight lanes.
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);
  luma_ptr = luma[0];
  y = block_height;
  // Second pass: subtract the average so the buffer is zero-mean.
  do {
    const __m128i samples = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);
}
1485 
1486 template <int block_height_log2>
CflSubsampler420_8xH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1487 void CflSubsampler420_8xH_SSE4_1(
1488     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1489     const int max_luma_width, const int max_luma_height,
1490     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1491   if (max_luma_width == 8) {
1492     CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
1493                                                           source, stride);
1494   } else {
1495     CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
1496         luma, max_luma_height, source, stride);
1497   }
1498 }
1499 
// 4:2:0 luma subsampler for 16- and 32-wide blocks, 10 bpp. Fills |luma|
// with the subsampled samples, then removes their rounded mean.
//
// |block_width_log2|: 4 or 5 — log2 of the output (chroma) block width.
//     Since max_luma_width is at most 32, a 32-wide block can have at most
//     16 distinct outputs per row; columns 16..31 are duplicates of the
//     last computed value.
// |block_height_log2|: log2 of the output block height.
// |max_luma_width|: template parameter in {8, 16, 24, 32}; loads past it
//     are replaced by duplicating the last valid pair (LastRowSamples).
// |max_luma_height|: valid source rows; outputs past max_luma_height >> 1
//     are filled by repeating the last computed row.
template <int block_width_log2, int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_WxH_SSE4_1(
    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
    const int max_luma_height, const void* LIBGAV1_RESTRICT const source,
    ptrdiff_t stride) {
  const auto* src = static_cast<const uint16_t*>(source);
  // |stride| is in bytes; convert to a uint16_t element stride.
  const ptrdiff_t src_stride = stride / sizeof(src[0]);
  const __m128i zero = _mm_setzero_si128();
  // Running 32-bit total of all output samples, reduced to the average later.
  __m128i final_sum = zero;
  const int block_height = 1 << block_height_log2;
  // Vertical 2:1 subsampling: one output row per two source rows.
  const int luma_height = std::min(block_height, max_luma_height >> 1);
  int16_t* luma_ptr = luma[0];
  __m128i final_row_result;
  // Begin first y section, covering width up to 32.
  int y = luma_height;

  // One output row (two source rows, up to 32 source samples) per iteration.
  do {
    const uint16_t* src_next = src + src_stride;
    const __m128i samples_row00 = LoadUnaligned16(src);
    const __m128i samples_row01 = (max_luma_width >= 16)
                                      ? LoadUnaligned16(src + 8)
                                      : LastRowSamples(samples_row00);
    const __m128i samples_row02 = (max_luma_width >= 24)
                                      ? LoadUnaligned16(src + 16)
                                      : LastRowSamples(samples_row01);
    const __m128i samples_row03 = (max_luma_width == 32)
                                      ? LoadUnaligned16(src + 24)
                                      : LastRowSamples(samples_row02);
    const __m128i samples_row10 = LoadUnaligned16(src_next);
    const __m128i samples_row11 = (max_luma_width >= 16)
                                      ? LoadUnaligned16(src_next + 8)
                                      : LastRowSamples(samples_row10);
    const __m128i samples_row12 = (max_luma_width >= 24)
                                      ? LoadUnaligned16(src_next + 16)
                                      : LastRowSamples(samples_row11);
    const __m128i samples_row13 = (max_luma_width == 32)
                                      ? LoadUnaligned16(src_next + 24)
                                      : LastRowSamples(samples_row12);
    // Vertical pair sums; StoreLumaResults8_420 finishes the horizontal
    // combine and writes 8 outputs each (16 per row in total).
    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
    final_row_result =
        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
    sum = _mm_add_epi16(sum, final_row_result);
    // Widen to 32 bits (low/high halves) and accumulate.
    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));

    // Because max_luma_width is at most 32, any values beyond x=16 will
    // necessarily be duplicated.
    if (block_width_log2 == 5) {
      const __m128i wide_fill = LastRowResult(final_row_result);
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      final_sum = _mm_add_epi32(
          final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
    }
    src += src_stride << 1;
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);

  // Begin second y section.
  // Replicate the last computed row down to block_height, keeping the sum
  // consistent with the duplicated samples.
  y = luma_height;
  if (y < block_height) {
    const __m128i final_fill0 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
    const __m128i final_fill1 =
        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
    // Only initialized and read when block_width_log2 == 5; both sites are
    // guarded by the same compile-time condition.
    __m128i wide_fill;
    if (block_width_log2 == 5) {
      // There are 16 16-bit fill values per row, shifting by 2 accounts for
      // the widening to 32-bit.
      wide_fill =
          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
    }
    // Per-row contribution of the 16 stored fill values, widened to 32 bits.
    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
    const __m128i final_fill_to_sum =
        _mm_add_epi32(final_inner_sum0, final_inner_sum1);

    do {
      StoreUnaligned16(luma_ptr, final_fill0);
      StoreUnaligned16(luma_ptr + 8, final_fill1);
      if (block_width_log2 == 5) {
        final_sum = _mm_add_epi32(final_sum, wide_fill);
      }
      luma_ptr += kCflLumaBufferStride;
      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
    } while (++y < block_height);
  }  // End second y section.

  // Horizontal reduction of the four 32-bit lanes into lane 0.
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));

  // Rounded divide by the sample count (width * height), then broadcast the
  // 16-bit average to all eight lanes.
  __m128i averages = RightShiftWithRounding_S32(
      final_sum, block_width_log2 + block_height_log2);
  averages = _mm_shufflelo_epi16(averages, 0);
  averages = _mm_shuffle_epi32(averages, 0);

  luma_ptr = luma[0];
  y = block_height;
  // Second pass: subtract the average. For 32-wide blocks, columns 16..31
  // are written here as duplicates of the last mean-removed value.
  do {
    const __m128i samples0 = LoadUnaligned16(luma_ptr);
    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
    final_row_result = _mm_sub_epi16(samples1, averages);
    StoreUnaligned16(luma_ptr + 8, final_row_result);

    if (block_width_log2 == 5) {
      const __m128i wide_fill = LastRowResult(final_row_result);
      StoreUnaligned16(luma_ptr + 16, wide_fill);
      StoreUnaligned16(luma_ptr + 24, wide_fill);
    }
    luma_ptr += kCflLumaBufferStride;
  } while (--y != 0);
}
1618 
1619 template <int block_width_log2, int block_height_log2>
CflSubsampler420_WxH_SSE4_1(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],const int max_luma_width,const int max_luma_height,const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride)1620 void CflSubsampler420_WxH_SSE4_1(
1621     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
1622     const int max_luma_width, const int max_luma_height,
1623     const void* LIBGAV1_RESTRICT const source, ptrdiff_t stride) {
1624   switch (max_luma_width) {
1625     case 8:
1626       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
1627           luma, max_luma_height, source, stride);
1628       return;
1629     case 16:
1630       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
1631           luma, max_luma_height, source, stride);
1632       return;
1633     case 24:
1634       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
1635           luma, max_luma_height, source, stride);
1636       return;
1637     default:
1638       assert(max_luma_width == 32);
1639       CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
1640           luma, max_luma_height, source, stride);
1641       return;
1642   }
1643 }
1644 
// Registers the SSE4.1 10-bpp CfL intra predictors and luma subsamplers in
// the writable dsp table. Each assignment is compiled only when the
// corresponding DSP_ENABLED_10BPP_SSE4_1 guard enables it.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);

  // CfL intra predictors, one per supported transform size (<width, height>).
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x4] =
      CflIntraPredictor_10bpp_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x8] =
      CflIntraPredictor_10bpp_SSE4_1<4, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize4x16] =
      CflIntraPredictor_10bpp_SSE4_1<4, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x4] =
      CflIntraPredictor_10bpp_SSE4_1<8, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x8] =
      CflIntraPredictor_10bpp_SSE4_1<8, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x16] =
      CflIntraPredictor_10bpp_SSE4_1<8, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize8x32] =
      CflIntraPredictor_10bpp_SSE4_1<8, 32>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x4] =
      CflIntraPredictor_10bpp_SSE4_1<16, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x8] =
      CflIntraPredictor_10bpp_SSE4_1<16, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x16] =
      CflIntraPredictor_10bpp_SSE4_1<16, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize16x32] =
      CflIntraPredictor_10bpp_SSE4_1<16, 32>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x8] =
      CflIntraPredictor_10bpp_SSE4_1<32, 8>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x16] =
      CflIntraPredictor_10bpp_SSE4_1<32, 16>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
  dsp->cfl_intra_predictors[kTransformSize32x32] =
      CflIntraPredictor_10bpp_SSE4_1<32, 32>;
#endif
  // 4:2:0 luma subsamplers. The 4xH/8xH variants take <log2(height)>; the
  // WxH variants take <log2(width), log2(height)>.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
      CflSubsampler420_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
      CflSubsampler420_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
      CflSubsampler420_WxH_SSE4_1<5, 5>;
#endif

  // 4:4:4 luma subsamplers, same template-argument convention as above.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
      CflSubsampler444_4xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
      CflSubsampler444_8xH_SSE4_1<5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 2>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<4, 5>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 3>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 4>;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
      CflSubsampler444_WxH_SSE4_1<5, 5>;
#endif
}
1819 
1820 }  // namespace
1821 }  // namespace high_bitdepth
1822 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
1823 
// Public entry point: installs the SSE4.1 CfL implementations for every
// bitdepth compiled into this build.
void IntraPredCflInit_SSE4_1() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  // The 10-bpp table is only populated when high-bitdepth support is built.
  high_bitdepth::Init10bpp();
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
}
1830 
1831 }  // namespace dsp
1832 }  // namespace libgav1
1833 
1834 #else  // !LIBGAV1_TARGETING_SSE4_1
1835 
1836 namespace libgav1 {
1837 namespace dsp {
1838 
// No-op stub used when this translation unit is built without SSE4.1
// targeting; the dsp table is left unchanged.
void IntraPredCflInit_SSE4_1() {}
1840 
1841 }  // namespace dsp
1842 }  // namespace libgav1
1843 
1844 #endif  // LIBGAV1_TARGETING_SSE4_1
1845