xref: /aosp_15_r20/external/libgav1/src/dsp/x86/intrapred_sse4.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/intrapred.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_TARGETING_SSE4_1
19 
20 #include <xmmintrin.h>
21 
22 #include <algorithm>
23 #include <cassert>
24 #include <cstddef>
25 #include <cstdint>
26 #include <cstring>
27 
28 #include "src/dsp/constants.h"
29 #include "src/dsp/dsp.h"
30 #include "src/dsp/x86/common_sse4.h"
31 #include "src/dsp/x86/transpose_sse4.h"
32 #include "src/utils/common.h"
33 #include "src/utils/constants.h"
34 
35 namespace libgav1 {
36 namespace dsp {
37 namespace {
38 
39 //------------------------------------------------------------------------------
40 // Utility Functions
41 
// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
// Divide by 2^k by right shifting by k, leaving the denominator 2^(n - k) + 1.
// In the block size cases, n - k is 1 or 2 (block is proportional to 1x2 or
// 1x4), so we use a multiplier that reflects division by 2+1=3 or 4+1=5 in the
// high bits.
constexpr int kThreeInverse = 0x5556;
constexpr int kFiveInverse = 0x3334;
template <int shiftk, int multiplier>
inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
  // Shift out the 2^k factor; the remaining divisor is 2^(n - k) + 1.
  const __m128i interm = _mm_srli_epi32(dividend, shiftk);
  // Fixed-point multiply: _mm_mulhi_epi16 keeps the high 16 bits of each
  // 16x16 product, i.e. (interm * multiplier) >> 16, which approximates the
  // division by 3 or 5. Sums here stay below 2^15 so epi16 math is safe --
  // see the rounder magnitudes in the Dc() callers.
  return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
}
54 
//------------------------------------------------------------------------------
// DcPredFuncs_SSE4_1

// Sums the reference pixels at |ref|; the (partial) sum ends up in the low
// lanes of the returned vector.
using DcSumFunc = __m128i (*)(const void* ref);
// Fills a block at |dest| with the DC value carried in |dc|.
using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
// Writes 4 rows at |dest|, replicating each 32-bit lane of |column| across
// its corresponding row.
using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
                                    const __m128i column);
// For copying an entire column across a block.
using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
                                 const void* column);
65 
// DC intra-predictors for non-square blocks.
// |shiftk| and |dc_mult| feed DivideByMultiplyShift_U32 for rectangular
// blocks; they are unused (0) for square blocks.
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
struct DcPredFuncs_SSE4_1 {
  DcPredFuncs_SSE4_1() = delete;

  // DC prediction using only the top row average.
  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
                    const void* left_column);
  // DC prediction using only the left column average.
  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
                     const void* left_column);
  // DC prediction using the combined top row and left column average.
  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
                 const void* left_column);
};
79 
// Directional intra-predictors for square blocks.
// |col_storefn| copies each left-column value across its row (used by
// Horizontal).
template <ColumnStoreFunc col_storefn>
struct DirectionalPredFuncs_SSE4_1 {
  DirectionalPredFuncs_SSE4_1() = delete;

  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
                       const void* left_column);
  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
                         const void* left_column);
};
90 
91 template <int width_log2, int height_log2, DcSumFunc top_sumfn,
92           DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
93 void DcPredFuncs_SSE4_1<
94     width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
DcTop(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void *)95     dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
96                     const void* LIBGAV1_RESTRICT const top_row,
97                     const void* /*left_column*/) {
98   const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
99   const __m128i sum = top_sumfn(top_row);
100   const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
101   storefn(dest, stride, dc);
102 }
103 
104 template <int width_log2, int height_log2, DcSumFunc top_sumfn,
105           DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
106 void DcPredFuncs_SSE4_1<
107     width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
DcLeft(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)108     dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
109                      const void* /*top_row*/,
110                      const void* LIBGAV1_RESTRICT const left_column) {
111   const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
112   const __m128i sum = left_sumfn(left_column);
113   const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
114   storefn(dest, stride, dc);
115 }
116 
117 template <int width_log2, int height_log2, DcSumFunc top_sumfn,
118           DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
119 void DcPredFuncs_SSE4_1<
120     width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
Dc(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)121     dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
122                  const void* LIBGAV1_RESTRICT const top_row,
123                  const void* LIBGAV1_RESTRICT const left_column) {
124   const __m128i rounder =
125       _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
126   const __m128i sum_top = top_sumfn(top_row);
127   const __m128i sum_left = left_sumfn(left_column);
128   const __m128i sum = _mm_add_epi32(sum_top, sum_left);
129   if (width_log2 == height_log2) {
130     const __m128i dc =
131         _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
132     storefn(dest, stride, dc);
133   } else {
134     const __m128i dc =
135         DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
136     storefn(dest, stride, dc);
137   }
138 }
139 
140 //------------------------------------------------------------------------------
141 // DcPredFuncs_SSE4_1 directional predictors
142 
template <ColumnStoreFunc col_storefn>
void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
  // Horizontal prediction copies each left-column pixel across its row;
  // delegate to the size-specific column store.
  col_storefn(dest, stride, left_column);
}
149 
150 }  // namespace
151 
152 //------------------------------------------------------------------------------
153 namespace low_bitdepth {
154 namespace {
155 
156 // |ref| points to 4 bytes containing 4 packed ints.
DcSum4_SSE4_1(const void * const ref)157 inline __m128i DcSum4_SSE4_1(const void* const ref) {
158   const __m128i vals = Load4(ref);
159   const __m128i zero = _mm_setzero_si128();
160   return _mm_sad_epu8(vals, zero);
161 }
162 
DcSum8_SSE4_1(const void * const ref)163 inline __m128i DcSum8_SSE4_1(const void* const ref) {
164   const __m128i vals = LoadLo8(ref);
165   const __m128i zero = _mm_setzero_si128();
166   return _mm_sad_epu8(vals, zero);
167 }
168 
DcSum16_SSE4_1(const void * const ref)169 inline __m128i DcSum16_SSE4_1(const void* const ref) {
170   const __m128i zero = _mm_setzero_si128();
171   const __m128i vals = LoadUnaligned16(ref);
172   const __m128i partial_sum = _mm_sad_epu8(vals, zero);
173   return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
174 }
175 
DcSum32_SSE4_1(const void * const ref)176 inline __m128i DcSum32_SSE4_1(const void* const ref) {
177   const __m128i zero = _mm_setzero_si128();
178   const __m128i vals1 = LoadUnaligned16(ref);
179   const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
180   const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
181   const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
182   const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
183   return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
184 }
185 
DcSum64_SSE4_1(const void * const ref)186 inline __m128i DcSum64_SSE4_1(const void* const ref) {
187   const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
188   const __m128i zero = _mm_setzero_si128();
189   const __m128i vals1 = LoadUnaligned16(ref_ptr);
190   const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
191   const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
192   const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
193   const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
194   const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
195   __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
196   const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
197   partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
198   const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
199   partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
200   return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
201 }
202 
203 template <int height>
DcStore4xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)204 inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
205                               const __m128i dc) {
206   const __m128i zero = _mm_setzero_si128();
207   const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
208   int y = height - 1;
209   auto* dst = static_cast<uint8_t*>(dest);
210   do {
211     Store4(dst, dc_dup);
212     dst += stride;
213   } while (--y != 0);
214   Store4(dst, dc_dup);
215 }
216 
217 template <int height>
DcStore8xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)218 inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
219                               const __m128i dc) {
220   const __m128i zero = _mm_setzero_si128();
221   const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
222   int y = height - 1;
223   auto* dst = static_cast<uint8_t*>(dest);
224   do {
225     StoreLo8(dst, dc_dup);
226     dst += stride;
227   } while (--y != 0);
228   StoreLo8(dst, dc_dup);
229 }
230 
231 template <int height>
DcStore16xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)232 inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
233                                const __m128i dc) {
234   const __m128i zero = _mm_setzero_si128();
235   const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
236   int y = height - 1;
237   auto* dst = static_cast<uint8_t*>(dest);
238   do {
239     StoreUnaligned16(dst, dc_dup);
240     dst += stride;
241   } while (--y != 0);
242   StoreUnaligned16(dst, dc_dup);
243 }
244 
245 template <int height>
DcStore32xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)246 inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
247                                const __m128i dc) {
248   const __m128i zero = _mm_setzero_si128();
249   const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
250   int y = height - 1;
251   auto* dst = static_cast<uint8_t*>(dest);
252   do {
253     StoreUnaligned16(dst, dc_dup);
254     StoreUnaligned16(dst + 16, dc_dup);
255     dst += stride;
256   } while (--y != 0);
257   StoreUnaligned16(dst, dc_dup);
258   StoreUnaligned16(dst + 16, dc_dup);
259 }
260 
261 template <int height>
DcStore64xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)262 inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
263                                const __m128i dc) {
264   const __m128i zero = _mm_setzero_si128();
265   const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
266   int y = height - 1;
267   auto* dst = static_cast<uint8_t*>(dest);
268   do {
269     StoreUnaligned16(dst, dc_dup);
270     StoreUnaligned16(dst + 16, dc_dup);
271     StoreUnaligned16(dst + 32, dc_dup);
272     StoreUnaligned16(dst + 48, dc_dup);
273     dst += stride;
274   } while (--y != 0);
275   StoreUnaligned16(dst, dc_dup);
276   StoreUnaligned16(dst + 16, dc_dup);
277   StoreUnaligned16(dst + 32, dc_dup);
278   StoreUnaligned16(dst + 48, dc_dup);
279 }
280 
281 // WriteDuplicateN assumes dup has 4 sets of 4 identical bytes that are meant to
282 // be copied for width N into dest.
WriteDuplicate4x4(void * const dest,ptrdiff_t stride,const __m128i dup32)283 inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
284                               const __m128i dup32) {
285   auto* dst = static_cast<uint8_t*>(dest);
286   Store4(dst, dup32);
287   dst += stride;
288   const int row1 = _mm_extract_epi32(dup32, 1);
289   memcpy(dst, &row1, 4);
290   dst += stride;
291   const int row2 = _mm_extract_epi32(dup32, 2);
292   memcpy(dst, &row2, 4);
293   dst += stride;
294   const int row3 = _mm_extract_epi32(dup32, 3);
295   memcpy(dst, &row3, 4);
296 }
297 
// Writes 4 rows of 8 bytes; each 32-bit lane of |dup32| fills one row.
inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  // Double each 32-bit lane to form four 8-byte rows: [d0 d0 d1 d1] and
  // [d2 d2 d3 d3].
  const __m128i rows01 = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i rows23 = _mm_unpackhi_epi32(dup32, dup32);
  auto* dst = static_cast<uint8_t*>(dest);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), rows01);
  dst += stride;
  // Shift the upper row down so a plain low-64 store can write it.
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), _mm_srli_si128(rows01, 8));
  dst += stride;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), rows23);
  dst += stride;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), _mm_srli_si128(rows23, 8));
}
311 
// Writes 4 rows of 16 bytes; each 32-bit lane of |dup32| fills one row.
inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  auto* dst = static_cast<uint8_t*>(dest);
  // Broadcast each 32-bit lane across a full 16-byte row.
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst),
                   _mm_shuffle_epi32(dup32, _MM_SHUFFLE(0, 0, 0, 0)));
  dst += stride;
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst),
                   _mm_shuffle_epi32(dup32, _MM_SHUFFLE(1, 1, 1, 1)));
  dst += stride;
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst),
                   _mm_shuffle_epi32(dup32, _MM_SHUFFLE(2, 2, 2, 2)));
  dst += stride;
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst),
                   _mm_shuffle_epi32(dup32, _MM_SHUFFLE(3, 3, 3, 3)));
}
330 
// Writes 4 rows of 32 bytes; each 32-bit lane of |dup32| fills one row.
inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  auto* dst = static_cast<uint8_t*>(dest);
  // Broadcast each 32-bit lane across a 16-byte register, stored twice per
  // row to cover the 32-byte width.
  const __m128i row0 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(0, 0, 0, 0));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), row0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), row0);
  dst += stride;
  const __m128i row1 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(1, 1, 1, 1));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), row1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), row1);
  dst += stride;
  const __m128i row2 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(2, 2, 2, 2));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), row2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), row2);
  dst += stride;
  const __m128i row3 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(3, 3, 3, 3));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), row3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), row3);
}
353 
// Writes 4 rows of 64 bytes; each 32-bit lane of |dup32| fills one row.
inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  auto* dst = static_cast<uint8_t*>(dest);
  // Broadcast each 32-bit lane across a 16-byte register, stored four times
  // per row to cover the 64-byte width.
  const __m128i row0 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(0, 0, 0, 0));
  for (int x = 0; x < 64; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), row0);
  }
  dst += stride;
  const __m128i row1 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(1, 1, 1, 1));
  for (int x = 0; x < 64; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), row1);
  }
  dst += stride;
  const __m128i row2 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(2, 2, 2, 2));
  for (int x = 0; x < 64; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), row2);
  }
  dst += stride;
  const __m128i row3 = _mm_shuffle_epi32(dup32, _MM_SHUFFLE(3, 3, 3, 3));
  for (int x = 0; x < 64; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), row3);
  }
}
384 
385 // ColStoreN<height> copies each of the |height| values in |column| across its
386 // corresponding in dest.
387 template <WriteDuplicateFunc writefn>
ColStore4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)388 inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
389                              ptrdiff_t stride,
390                              const void* LIBGAV1_RESTRICT const column) {
391   const __m128i col_data = Load4(column);
392   const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
393   const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
394   writefn(dest, stride, col_dup32);
395 }
396 
397 template <WriteDuplicateFunc writefn>
ColStore8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)398 inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
399                              ptrdiff_t stride,
400                              const void* LIBGAV1_RESTRICT const column) {
401   const ptrdiff_t stride4 = stride << 2;
402   const __m128i col_data = LoadLo8(column);
403   const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
404   const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
405   auto* dst = static_cast<uint8_t*>(dest);
406   writefn(dst, stride, col_dup32_lo);
407   dst += stride4;
408   const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
409   writefn(dst, stride, col_dup32_hi);
410 }
411 
412 template <WriteDuplicateFunc writefn>
ColStore16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)413 inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
414                               ptrdiff_t stride,
415                               const void* LIBGAV1_RESTRICT const column) {
416   const ptrdiff_t stride4 = stride << 2;
417   const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
418   const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
419   const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
420   const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
421   auto* dst = static_cast<uint8_t*>(dest);
422   writefn(dst, stride, col_dup32_lolo);
423   dst += stride4;
424   const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
425   writefn(dst, stride, col_dup32_lohi);
426   dst += stride4;
427   const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
428   writefn(dst, stride, col_dup32_hilo);
429   dst += stride4;
430   const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
431   writefn(dst, stride, col_dup32_hihi);
432 }
433 
434 template <WriteDuplicateFunc writefn>
ColStore32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)435 inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
436                               ptrdiff_t stride,
437                               const void* LIBGAV1_RESTRICT const column) {
438   const ptrdiff_t stride4 = stride << 2;
439   auto* dst = static_cast<uint8_t*>(dest);
440   for (int y = 0; y < 32; y += 16) {
441     const __m128i col_data =
442         LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
443     const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
444     const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
445     const __m128i col_dup32_lolo =
446         _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
447     writefn(dst, stride, col_dup32_lolo);
448     dst += stride4;
449     const __m128i col_dup32_lohi =
450         _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
451     writefn(dst, stride, col_dup32_lohi);
452     dst += stride4;
453     const __m128i col_dup32_hilo =
454         _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
455     writefn(dst, stride, col_dup32_hilo);
456     dst += stride4;
457     const __m128i col_dup32_hihi =
458         _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
459     writefn(dst, stride, col_dup32_hihi);
460     dst += stride4;
461   }
462 }
463 
464 template <WriteDuplicateFunc writefn>
ColStore64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)465 inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
466                               ptrdiff_t stride,
467                               const void* LIBGAV1_RESTRICT const column) {
468   const ptrdiff_t stride4 = stride << 2;
469   auto* dst = static_cast<uint8_t*>(dest);
470   for (int y = 0; y < 64; y += 16) {
471     const __m128i col_data =
472         LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
473     const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
474     const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
475     const __m128i col_dup32_lolo =
476         _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
477     writefn(dst, stride, col_dup32_lolo);
478     dst += stride4;
479     const __m128i col_dup32_lohi =
480         _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
481     writefn(dst, stride, col_dup32_lohi);
482     dst += stride4;
483     const __m128i col_dup32_hilo =
484         _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
485     writefn(dst, stride, col_dup32_hilo);
486     dst += stride4;
487     const __m128i col_dup32_hihi =
488         _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
489     writefn(dst, stride, col_dup32_hihi);
490     dst += stride4;
491   }
492 }
493 
// Instantiations of the DC predictor for every supported block size.
// Square blocks pass 0 for shiftk and dc_mult since they divide by a power of
// two directly.
struct DcDefs {
  DcDefs() = delete;

  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
                                  DcStore4xH_SSE4_1<4>, 0, 0>;
  // shiftk is the smaller of width_log2 and height_log2.
  // dc_mult corresponds to the ratio of the smaller block size to the larger.
  using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
                                  DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
  using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
                                   DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;

  using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
                                  DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
  using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
                                  DcStore8xH_SSE4_1<8>, 0, 0>;
  using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
                                   DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
  using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
                                   DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;

  using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
                                   DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
  using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
                                   DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
  using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
                                    DcStore16xH_SSE4_1<16>, 0, 0>;
  using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
                                    DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
  using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
                                    DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;

  using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
                                   DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
  using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
                                    DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
  using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
                                    DcStore32xH_SSE4_1<32>, 0, 0>;
  using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
                                    DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;

  using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
                                    DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
  using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
                                    DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
  using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
                                    DcStore64xH_SSE4_1<64>, 0, 0>;
};
542 
// Instantiations of the directional predictors, pairing a column store of the
// block's height with a row duplicator of its width.
struct DirDefs {
  DirDefs() = delete;

  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
  using _4x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
  using _8x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
  using _8x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
  using _16x4 =
      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
  using _16x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
  using _16x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
  using _16x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
  using _16x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
  using _32x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
  using _32x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
  using _32x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
  using _32x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
  using _64x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
  using _64x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
  using _64x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
};
581 
// Writes one 4-pixel row of Paeth prediction. |y_mask| selects (via
// _mm_shuffle_epi32) which 32-bit lane of |left| and |top_dists| belongs to
// this row.
template <int y_mask>
inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                            const __m128i& left, const __m128i& top_lefts,
                            const __m128i& top_dists, const __m128i& left_dists,
                            const __m128i& top_left_diffs) {
  // Broadcast this row's top-distance and left value across all 4 lanes.
  const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);

  const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
  // NOTE(review): |left| + |top_left_diffs|, taken as an absolute value,
  // yields the distance to the top-left reference; this presumes
  // |top_left_diffs| holds top - 2 * top_left -- confirm against the caller.
  const __m128i top_left_dists =
      _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));

  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
  // operation is unavailable, so the logic for selecting top, left, or
  // top_left is inverted.
  __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
  not_select_left =
      _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
  const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);

  // Keep the left value only in lanes where left wins.
  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);

  // Blend top vs. top_left by |not_select_top|, then clear lanes where left
  // wins so the final OR below is a clean three-way select.
  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);

  // The sequence of 32-bit packed operations was found (see CL via blame) to
  // outperform 16-bit operations, despite the availability of the packus
  // function, when tested on a Xeon E7 v3.
  // Gather byte 0 of each 32-bit lane into the low 4 bytes, then store them.
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  const __m128i pred = _mm_shuffle_epi8(
      _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
  Store4(dst, pred);
}
616 
// top_left_diffs is the only variable whose ints may exceed 8 bits. Otherwise
// we would be able to do all of these operations as epi8 for a 16-pixel version
// of this function. Still, since lefts_y is just a vector of duplicates, it
// could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
// for the blends.
//
// Writes one 8-pixel row of Paeth prediction using 16-bit lanes. |y_mask| is
// an _mm_shuffle_epi8 control that broadcasts this row's entry of |left| and
// |top_dists|.
template <int y_mask>
inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                            const __m128i& left, const __m128i& top_lefts,
                            const __m128i& top_dists, const __m128i& left_dists,
                            const __m128i& top_left_diffs) {
  // Broadcast this row's top-distance and left value across all 8 lanes.
  const __m128i select_y = _mm_set1_epi32(y_mask);
  const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);

  const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
  // NOTE(review): as in WritePaethLine4, |left| + |top_left_diffs| taken as
  // an absolute value yields the top-left distance -- confirm against caller.
  const __m128i top_left_dists =
      _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));

  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
  // operation is unavailable, so the logic for selecting top, left, or
  // top_left is inverted.
  __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
  not_select_left =
      _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
  const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);

  // Keep the left value only in lanes where left wins.
  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);

  // Blend top vs. top_left, then clear lanes where left wins so the final OR
  // is a clean three-way select.
  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);

  // Narrow the 16-bit results to bytes; the second argument is ignored since
  // only the low 8 bytes are stored.
  const __m128i pred = _mm_packus_epi16(
      _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
}
653 
// |top| is an epi8 of length 16
// |left| is epi8 of unknown length, as y_mask specifies access
// |top_lefts| is an epi8 of 16 duplicates
// |top_dists| is an epi8 of unknown length, as y_mask specifies access
// |left_dists| is an epi8 of length 16
// |left_dists_lo| is an epi16 of length 8
// |left_dists_hi| is an epi16 of length 8
// |top_left_diffs_lo| is an epi16 of length 8
// |top_left_diffs_hi| is an epi16 of length 8
// The latter two vectors are epi16 because their values may reach -510.
// |left_dists| is provided alongside its spread out version because it doesn't
// change between calls and interacts with both kinds of packing.
//
// Writes one 16-pixel row of the Paeth predictor. |y_mask| (e.g. 0x02020202)
// is a pshufb control pattern that broadcasts row y's byte from |left| and
// |top_dists| to every byte lane.
template <int y_mask>
inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                             const __m128i& left, const __m128i& top_lefts,
                             const __m128i& top_dists,
                             const __m128i& left_dists,
                             const __m128i& left_dists_lo,
                             const __m128i& left_dists_hi,
                             const __m128i& top_left_diffs_lo,
                             const __m128i& top_left_diffs_hi) {
  // Broadcast row y's left pixel and top distance, keeping both an epi8 copy
  // (for byte-wise compares and blends) and a zero-extended epi16 copy (for
  // arithmetic that may exceed 8 bits).
  const __m128i select_y = _mm_set1_epi32(y_mask);
  const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
  const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
  const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
  const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);

  // |top_left_diffs_*| hold top[x] - 2 * top_left (see callers); adding
  // left[y] yields base - top_left, whose absolute value is the top-left
  // distance.
  const __m128i top_left_dists_lo =
      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
  const __m128i top_left_dists_hi =
      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));

  // Narrow each epi16 compare mask to epi8 (signed saturation maps 0 -> 0 and
  // -1 -> -1, preserving the mask) and splice the lo/hi halves into one
  // 16-byte mask with alignr.
  const __m128i left_gt_top_left_lo = _mm_packs_epi16(
      _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
  const __m128i left_gt_top_left_hi =
      _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
                      /* unused second arg for pack */ left_dists_hi);
  const __m128i left_gt_top_left = _mm_alignr_epi8(
      left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);

  const __m128i not_select_top_lo =
      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
                      /* unused second arg for pack */ top_dists_y16);
  const __m128i not_select_top_hi =
      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
                      /* unused second arg for pack */ top_dists_y16);
  const __m128i not_select_top = _mm_alignr_epi8(
      not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);

  // left_dists <= top_dists, computed as left_dists == min(top, left) because
  // there is no byte-wise less-or-equal compare.
  const __m128i left_leq_top =
      _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
  const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);

  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
  // operation is unavailable, so the logic for selecting top, left, or
  // top_left is inverted.
  const __m128i left_out = _mm_and_si128(select_left, lefts_y8);

  // Blend top vs top_left under |not_select_top|, then clear the lanes where
  // the left pixel wins so the final OR combines disjoint lanes.
  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
  top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
  const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);

  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
}
720 
Paeth4x4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)721 void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
722                      const void* LIBGAV1_RESTRICT const top_row,
723                      const void* LIBGAV1_RESTRICT const left_column) {
724   const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
725   const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
726 
727   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
728   const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
729 
730   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
731   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
732   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
733   const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
734   const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
735 
736   const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
737   const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
738   auto* dst = static_cast<uint8_t*>(dest);
739   WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
740                      top_left_diff);
741   dst += stride;
742   WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
743                         top_left_diff);
744   dst += stride;
745   WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
746                         top_left_diff);
747   dst += stride;
748   WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
749                         top_left_diff);
750 }
751 
Paeth4x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)752 void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
753                      const void* LIBGAV1_RESTRICT const top_row,
754                      const void* LIBGAV1_RESTRICT const left_column) {
755   const __m128i left = LoadLo8(left_column);
756   const __m128i left_lo = _mm_cvtepu8_epi32(left);
757   const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
758 
759   const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
760   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
761   const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
762 
763   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
764   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
765   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
766   const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
767   const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
768   const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
769 
770   const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
771   const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
772   auto* dst = static_cast<uint8_t*>(dest);
773   WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
774                      top_left_diff);
775   dst += stride;
776   WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
777                         top_left_diff);
778   dst += stride;
779   WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
780                         top_left_diff);
781   dst += stride;
782   WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
783                         top_left_diff);
784   dst += stride;
785   WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
786                      top_left_diff);
787   dst += stride;
788   WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
789                         top_left_diff);
790   dst += stride;
791   WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
792                         top_left_diff);
793   dst += stride;
794   WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
795                         top_left_diff);
796 }
797 
Paeth4x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)798 void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
799                       const void* LIBGAV1_RESTRICT const top_row,
800                       const void* LIBGAV1_RESTRICT const left_column) {
801   const __m128i left = LoadUnaligned16(left_column);
802   const __m128i left_0 = _mm_cvtepu8_epi32(left);
803   const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
804   const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
805   const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));
806 
807   const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
808   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
809   const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
810 
811   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
812   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
813   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
814   const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
815   const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
816   const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
817   const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
818   const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));
819 
820   const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
821   const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
822 
823   auto* dst = static_cast<uint8_t*>(dest);
824   WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
825                      top_left_diff);
826   dst += stride;
827   WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
828                         top_left_diff);
829   dst += stride;
830   WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
831                         top_left_diff);
832   dst += stride;
833   WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
834                         top_left_diff);
835   dst += stride;
836   WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
837                      top_left_diff);
838   dst += stride;
839   WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
840                         top_left_diff);
841   dst += stride;
842   WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
843                         top_left_diff);
844   dst += stride;
845   WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
846                         top_left_diff);
847   dst += stride;
848   WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
849                      top_left_diff);
850   dst += stride;
851   WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
852                         top_left_diff);
853   dst += stride;
854   WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
855                         top_left_diff);
856   dst += stride;
857   WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
858                         top_left_diff);
859   dst += stride;
860   WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
861                      top_left_diff);
862   dst += stride;
863   WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
864                         top_left_diff);
865   dst += stride;
866   WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
867                         top_left_diff);
868   dst += stride;
869   WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
870                         top_left_diff);
871 }
872 
Paeth8x4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)873 void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
874                      const void* LIBGAV1_RESTRICT const top_row,
875                      const void* LIBGAV1_RESTRICT const left_column) {
876   const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
877   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
878   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
879   const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
880 
881   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
882   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
883   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
884   const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
885   const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
886 
887   const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
888   const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
889   auto* dst = static_cast<uint8_t*>(dest);
890   WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
891                               top_left_diff);
892   dst += stride;
893   WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
894                               top_left_diff);
895   dst += stride;
896   WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
897                               top_left_diff);
898   dst += stride;
899   WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
900                               top_left_diff);
901 }
902 
Paeth8x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)903 void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
904                      const void* LIBGAV1_RESTRICT const top_row,
905                      const void* LIBGAV1_RESTRICT const left_column) {
906   const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
907   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
908   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
909   const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
910 
911   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
912   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
913   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
914   const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
915   const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
916 
917   const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
918   const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
919   auto* dst = static_cast<uint8_t*>(dest);
920   WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
921                               top_left_diff);
922   dst += stride;
923   WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
924                               top_left_diff);
925   dst += stride;
926   WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
927                               top_left_diff);
928   dst += stride;
929   WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
930                               top_left_diff);
931   dst += stride;
932   WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
933                               top_left_diff);
934   dst += stride;
935   WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
936                               top_left_diff);
937   dst += stride;
938   WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
939                               top_left_diff);
940   dst += stride;
941   WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
942                               top_left_diff);
943 }
944 
Paeth8x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)945 void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
946                       const void* LIBGAV1_RESTRICT const top_row,
947                       const void* LIBGAV1_RESTRICT const left_column) {
948   const __m128i left = LoadUnaligned16(left_column);
949   const __m128i left_lo = _mm_cvtepu8_epi16(left);
950   const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
951   const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
952   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
953   const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
954 
955   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
956   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
957   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
958   const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
959   const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
960   const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));
961 
962   const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
963   const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
964   auto* dst = static_cast<uint8_t*>(dest);
965   WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
966                               left_dists, top_left_diff);
967   dst += stride;
968   WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
969                               left_dists, top_left_diff);
970   dst += stride;
971   WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
972                               left_dists, top_left_diff);
973   dst += stride;
974   WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
975                               left_dists, top_left_diff);
976   dst += stride;
977   WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
978                               left_dists, top_left_diff);
979   dst += stride;
980   WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
981                               left_dists, top_left_diff);
982   dst += stride;
983   WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
984                               left_dists, top_left_diff);
985   dst += stride;
986   WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
987                               left_dists, top_left_diff);
988   dst += stride;
989   WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
990                               left_dists, top_left_diff);
991   dst += stride;
992   WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
993                               left_dists, top_left_diff);
994   dst += stride;
995   WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
996                               left_dists, top_left_diff);
997   dst += stride;
998   WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
999                               left_dists, top_left_diff);
1000   dst += stride;
1001   WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
1002                               left_dists, top_left_diff);
1003   dst += stride;
1004   WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
1005                               left_dists, top_left_diff);
1006   dst += stride;
1007   WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
1008                               left_dists, top_left_diff);
1009   dst += stride;
1010   WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
1011                               left_dists, top_left_diff);
1012 }
1013 
Paeth8x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1014 void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1015                       const void* LIBGAV1_RESTRICT const top_row,
1016                       const void* LIBGAV1_RESTRICT const left_column) {
1017   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1018   auto* const dst = static_cast<uint8_t*>(dest);
1019   Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
1020   Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
1021 }
1022 
Paeth16x4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1023 void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1024                       const void* LIBGAV1_RESTRICT const top_row,
1025                       const void* LIBGAV1_RESTRICT const left_column) {
1026   const __m128i left = Load4(left_column);
1027   const __m128i top = LoadUnaligned16(top_row);
1028   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1029   const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1030 
1031   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1032   const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
1033   const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));
1034 
1035   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
1036   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
1037   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
1038 
1039   const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
1040                                           _mm_subs_epu8(top_lefts8, top));
1041   const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
1042   const __m128i left_dists_hi =
1043       _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
1044   const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
1045                                          _mm_subs_epu8(top_lefts8, left));
1046 
1047   const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
1048   const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
1049   const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
1050   auto* dst = static_cast<uint8_t*>(dest);
1051   WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
1052                       left_dists_lo, left_dists_hi, top_left_diff_lo,
1053                       top_left_diff_hi);
1054   dst += stride;
1055   WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
1056                                left_dists, left_dists_lo, left_dists_hi,
1057                                top_left_diff_lo, top_left_diff_hi);
1058   dst += stride;
1059   WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
1060                                left_dists, left_dists_lo, left_dists_hi,
1061                                top_left_diff_lo, top_left_diff_hi);
1062   dst += stride;
1063   WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
1064                                left_dists, left_dists_lo, left_dists_hi,
1065                                top_left_diff_lo, top_left_diff_hi);
1066 }
1067 
1068 // Inlined for calling with offsets in larger transform sizes, mainly to
1069 // preserve top_left.
WritePaeth16x8(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const uint8_t top_left,const __m128i top,const __m128i left)1070 inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1071                            const uint8_t top_left, const __m128i top,
1072                            const __m128i left) {
1073   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1074   const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1075 
1076   const __m128i top_lefts16 = _mm_set1_epi16(top_left);
1077   const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
1078 
1079   // Given that the spec defines "base" as top[x] + left[y] - top_left,
1080   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
1081   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
1082 
1083   const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
1084                                           _mm_subs_epu8(top_lefts8, top));
1085   const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
1086   const __m128i left_dists_hi =
1087       _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
1088   const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
1089                                          _mm_subs_epu8(top_lefts8, left));
1090 
1091   const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
1092   const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
1093   const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
1094   auto* dst = static_cast<uint8_t*>(dest);
1095   WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
1096                       left_dists_lo, left_dists_hi, top_left_diff_lo,
1097                       top_left_diff_hi);
1098   dst += stride;
1099   WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
1100                                left_dists, left_dists_lo, left_dists_hi,
1101                                top_left_diff_lo, top_left_diff_hi);
1102   dst += stride;
1103   WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
1104                                left_dists, left_dists_lo, left_dists_hi,
1105                                top_left_diff_lo, top_left_diff_hi);
1106   dst += stride;
1107   WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
1108                                left_dists, left_dists_lo, left_dists_hi,
1109                                top_left_diff_lo, top_left_diff_hi);
1110   dst += stride;
1111   WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
1112                                left_dists, left_dists_lo, left_dists_hi,
1113                                top_left_diff_lo, top_left_diff_hi);
1114   dst += stride;
1115   WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
1116                                left_dists, left_dists_lo, left_dists_hi,
1117                                top_left_diff_lo, top_left_diff_hi);
1118   dst += stride;
1119   WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
1120                                left_dists, left_dists_lo, left_dists_hi,
1121                                top_left_diff_lo, top_left_diff_hi);
1122   dst += stride;
1123   WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
1124                                left_dists, left_dists_lo, left_dists_hi,
1125                                top_left_diff_lo, top_left_diff_hi);
1126 }
1127 
Paeth16x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1128 void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1129                       const void* LIBGAV1_RESTRICT const top_row,
1130                       const void* LIBGAV1_RESTRICT const left_column) {
1131   const __m128i top = LoadUnaligned16(top_row);
1132   const __m128i left = LoadLo8(left_column);
1133   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1134   WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
1135 }
1136 
WritePaeth16x16(void * const dest,ptrdiff_t stride,const uint8_t top_left,const __m128i top,const __m128i left)1137 void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
1138                      const __m128i top, const __m128i left) {
1139   const __m128i top_lo = _mm_cvtepu8_epi16(top);
1140   const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1141 
1142   const __m128i top_lefts16 = _mm_set1_epi16(top_left);
1143   const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
1144 
1145   // Given that the spec defines "base" as top[x] + left[y] - top[-1],
1146   // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
1147   // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
1148 
1149   const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
1150                                           _mm_subs_epu8(top_lefts8, top));
1151   const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
1152   const __m128i left_dists_hi =
1153       _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
1154   const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
1155                                          _mm_subs_epu8(top_lefts8, left));
1156 
1157   const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
1158   const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
1159   const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
1160   auto* dst = static_cast<uint8_t*>(dest);
1161   WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
1162                       left_dists_lo, left_dists_hi, top_left_diff_lo,
1163                       top_left_diff_hi);
1164   dst += stride;
1165   WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
1166                                left_dists, left_dists_lo, left_dists_hi,
1167                                top_left_diff_lo, top_left_diff_hi);
1168   dst += stride;
1169   WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
1170                                left_dists, left_dists_lo, left_dists_hi,
1171                                top_left_diff_lo, top_left_diff_hi);
1172   dst += stride;
1173   WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
1174                                left_dists, left_dists_lo, left_dists_hi,
1175                                top_left_diff_lo, top_left_diff_hi);
1176   dst += stride;
1177   WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
1178                                left_dists, left_dists_lo, left_dists_hi,
1179                                top_left_diff_lo, top_left_diff_hi);
1180   dst += stride;
1181   WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
1182                                left_dists, left_dists_lo, left_dists_hi,
1183                                top_left_diff_lo, top_left_diff_hi);
1184   dst += stride;
1185   WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
1186                                left_dists, left_dists_lo, left_dists_hi,
1187                                top_left_diff_lo, top_left_diff_hi);
1188   dst += stride;
1189   WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
1190                                left_dists, left_dists_lo, left_dists_hi,
1191                                top_left_diff_lo, top_left_diff_hi);
1192   dst += stride;
1193   WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
1194                                left_dists, left_dists_lo, left_dists_hi,
1195                                top_left_diff_lo, top_left_diff_hi);
1196   dst += stride;
1197   WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
1198                                left_dists, left_dists_lo, left_dists_hi,
1199                                top_left_diff_lo, top_left_diff_hi);
1200   dst += stride;
1201   WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
1202                                left_dists, left_dists_lo, left_dists_hi,
1203                                top_left_diff_lo, top_left_diff_hi);
1204   dst += stride;
1205   WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
1206                                left_dists, left_dists_lo, left_dists_hi,
1207                                top_left_diff_lo, top_left_diff_hi);
1208   dst += stride;
1209   WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
1210                                left_dists, left_dists_lo, left_dists_hi,
1211                                top_left_diff_lo, top_left_diff_hi);
1212   dst += stride;
1213   WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
1214                                left_dists, left_dists_lo, left_dists_hi,
1215                                top_left_diff_lo, top_left_diff_hi);
1216   dst += stride;
1217   WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
1218                                left_dists, left_dists_lo, left_dists_hi,
1219                                top_left_diff_lo, top_left_diff_hi);
1220   dst += stride;
1221   WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
1222                                left_dists, left_dists_lo, left_dists_hi,
1223                                top_left_diff_lo, top_left_diff_hi);
1224 }
1225 
Paeth16x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1226 void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1227                        const void* LIBGAV1_RESTRICT const top_row,
1228                        const void* LIBGAV1_RESTRICT const left_column) {
1229   const __m128i left = LoadUnaligned16(left_column);
1230   const __m128i top = LoadUnaligned16(top_row);
1231   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1232   WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
1233 }
1234 
Paeth16x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1235 void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1236                        const void* LIBGAV1_RESTRICT const top_row,
1237                        const void* LIBGAV1_RESTRICT const left_column) {
1238   const __m128i left_0 = LoadUnaligned16(left_column);
1239   const __m128i top = LoadUnaligned16(top_row);
1240   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1241   const uint8_t top_left = top_ptr[-1];
1242   auto* const dst = static_cast<uint8_t*>(dest);
1243   WritePaeth16x16(dst, stride, top_left, top, left_0);
1244   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1245   const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1246   WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
1247 }
1248 
Paeth16x64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1249 void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1250                        const void* LIBGAV1_RESTRICT const top_row,
1251                        const void* LIBGAV1_RESTRICT const left_column) {
1252   const ptrdiff_t stride16 = stride << 4;
1253   const __m128i left_0 = LoadUnaligned16(left_column);
1254   const __m128i top = LoadUnaligned16(top_row);
1255   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1256   const uint8_t top_left = top_ptr[-1];
1257   auto* dst = static_cast<uint8_t*>(dest);
1258   WritePaeth16x16(dst, stride, top_left, top, left_0);
1259   dst += stride16;
1260   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1261   const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1262   WritePaeth16x16(dst, stride, top_left, top, left_1);
1263   dst += stride16;
1264   const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
1265   WritePaeth16x16(dst, stride, top_left, top, left_2);
1266   dst += stride16;
1267   const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
1268   WritePaeth16x16(dst, stride, top_left, top, left_3);
1269 }
1270 
Paeth32x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1271 void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1272                       const void* LIBGAV1_RESTRICT const top_row,
1273                       const void* LIBGAV1_RESTRICT const left_column) {
1274   const __m128i left = LoadLo8(left_column);
1275   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1276   const __m128i top_0 = LoadUnaligned16(top_row);
1277   const uint8_t top_left = top_ptr[-1];
1278   auto* const dst = static_cast<uint8_t*>(dest);
1279   WritePaeth16x8(dst, stride, top_left, top_0, left);
1280   const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1281   WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
1282 }
1283 
Paeth32x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1284 void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1285                        const void* LIBGAV1_RESTRICT const top_row,
1286                        const void* LIBGAV1_RESTRICT const left_column) {
1287   const __m128i left = LoadUnaligned16(left_column);
1288   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1289   const __m128i top_0 = LoadUnaligned16(top_row);
1290   const uint8_t top_left = top_ptr[-1];
1291   auto* const dst = static_cast<uint8_t*>(dest);
1292   WritePaeth16x16(dst, stride, top_left, top_0, left);
1293   const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1294   WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
1295 }
1296 
Paeth32x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1297 void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1298                        const void* LIBGAV1_RESTRICT const top_row,
1299                        const void* LIBGAV1_RESTRICT const left_column) {
1300   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1301   const __m128i left_0 = LoadUnaligned16(left_ptr);
1302   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1303   const __m128i top_0 = LoadUnaligned16(top_ptr);
1304   const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1305   const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1306   const uint8_t top_left = top_ptr[-1];
1307   auto* dst = static_cast<uint8_t*>(dest);
1308   WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1309   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1310   dst += (stride << 4);
1311   WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1312   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1313 }
1314 
Paeth32x64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1315 void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1316                        const void* LIBGAV1_RESTRICT const top_row,
1317                        const void* LIBGAV1_RESTRICT const left_column) {
1318   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1319   const __m128i left_0 = LoadUnaligned16(left_ptr);
1320   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1321   const __m128i top_0 = LoadUnaligned16(top_ptr);
1322   const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1323   const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1324   const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
1325   const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
1326   const uint8_t top_left = top_ptr[-1];
1327   auto* dst = static_cast<uint8_t*>(dest);
1328   WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1329   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1330   dst += (stride << 4);
1331   WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1332   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1333   dst += (stride << 4);
1334   WritePaeth16x16(dst, stride, top_left, top_0, left_2);
1335   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
1336   dst += (stride << 4);
1337   WritePaeth16x16(dst, stride, top_left, top_0, left_3);
1338   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
1339 }
1340 
Paeth64x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1341 void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1342                        const void* LIBGAV1_RESTRICT const top_row,
1343                        const void* LIBGAV1_RESTRICT const left_column) {
1344   const __m128i left = LoadUnaligned16(left_column);
1345   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1346   const __m128i top_0 = LoadUnaligned16(top_ptr);
1347   const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1348   const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
1349   const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
1350   const uint8_t top_left = top_ptr[-1];
1351   auto* dst = static_cast<uint8_t*>(dest);
1352   WritePaeth16x16(dst, stride, top_left, top_0, left);
1353   WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
1354   WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
1355   WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
1356 }
1357 
Paeth64x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1358 void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1359                        const void* LIBGAV1_RESTRICT const top_row,
1360                        const void* LIBGAV1_RESTRICT const left_column) {
1361   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1362   const __m128i left_0 = LoadUnaligned16(left_ptr);
1363   const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1364   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1365   const __m128i top_0 = LoadUnaligned16(top_ptr);
1366   const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1367   const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
1368   const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
1369   const uint8_t top_left = top_ptr[-1];
1370   auto* dst = static_cast<uint8_t*>(dest);
1371   WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1372   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1373   WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
1374   WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
1375   dst += (stride << 4);
1376   WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1377   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1378   WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
1379   WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
1380 }
1381 
Paeth64x64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1382 void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1383                        const void* LIBGAV1_RESTRICT const top_row,
1384                        const void* LIBGAV1_RESTRICT const left_column) {
1385   const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1386   const __m128i left_0 = LoadUnaligned16(left_ptr);
1387   const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1388   const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
1389   const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
1390   const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1391   const __m128i top_0 = LoadUnaligned16(top_ptr);
1392   const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1393   const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
1394   const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
1395   const uint8_t top_left = top_ptr[-1];
1396   auto* dst = static_cast<uint8_t*>(dest);
1397   WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1398   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1399   WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
1400   WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
1401   dst += (stride << 4);
1402   WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1403   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1404   WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
1405   WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
1406   dst += (stride << 4);
1407   WritePaeth16x16(dst, stride, top_left, top_0, left_2);
1408   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
1409   WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
1410   WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
1411   dst += (stride << 4);
1412   WritePaeth16x16(dst, stride, top_left, top_0, left_3);
1413   WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
1414   WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
1415   WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
1416 }
1417 
Init8bpp()1418 void Init8bpp() {
1419   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1420   assert(dsp != nullptr);
1421   static_cast<void>(dsp);
1422 // These guards check if this version of the function was not superseded by
1423 // a higher optimization level, such as AVX. The corresponding #define also
1424 // prevents the C version from being added to the table.
1425 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
1426   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
1427       DcDefs::_4x4::DcTop;
1428 #endif
1429 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
1430   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
1431       DcDefs::_4x8::DcTop;
1432 #endif
1433 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
1434   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
1435       DcDefs::_4x16::DcTop;
1436 #endif
1437 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
1438   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
1439       DcDefs::_8x4::DcTop;
1440 #endif
1441 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
1442   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
1443       DcDefs::_8x8::DcTop;
1444 #endif
1445 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
1446   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
1447       DcDefs::_8x16::DcTop;
1448 #endif
1449 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
1450   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
1451       DcDefs::_8x32::DcTop;
1452 #endif
1453 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
1454   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
1455       DcDefs::_16x4::DcTop;
1456 #endif
1457 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
1458   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
1459       DcDefs::_16x8::DcTop;
1460 #endif
1461 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
1462   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
1463       DcDefs::_16x16::DcTop;
1464 #endif
1465 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
1466   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
1467       DcDefs::_16x32::DcTop;
1468 #endif
1469 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
1470   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
1471       DcDefs::_16x64::DcTop;
1472 #endif
1473 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
1474   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
1475       DcDefs::_32x8::DcTop;
1476 #endif
1477 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
1478   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
1479       DcDefs::_32x16::DcTop;
1480 #endif
1481 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
1482   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
1483       DcDefs::_32x32::DcTop;
1484 #endif
1485 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
1486   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
1487       DcDefs::_32x64::DcTop;
1488 #endif
1489 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
1490   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
1491       DcDefs::_64x16::DcTop;
1492 #endif
1493 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
1494   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
1495       DcDefs::_64x32::DcTop;
1496 #endif
1497 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
1498   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
1499       DcDefs::_64x64::DcTop;
1500 #endif
1501 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
1502   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
1503       DcDefs::_4x4::DcLeft;
1504 #endif
1505 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
1506   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
1507       DcDefs::_4x8::DcLeft;
1508 #endif
1509 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
1510   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
1511       DcDefs::_4x16::DcLeft;
1512 #endif
1513 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
1514   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
1515       DcDefs::_8x4::DcLeft;
1516 #endif
1517 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
1518   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
1519       DcDefs::_8x8::DcLeft;
1520 #endif
1521 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
1522   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
1523       DcDefs::_8x16::DcLeft;
1524 #endif
1525 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
1526   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
1527       DcDefs::_8x32::DcLeft;
1528 #endif
1529 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
1530   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
1531       DcDefs::_16x4::DcLeft;
1532 #endif
1533 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
1534   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
1535       DcDefs::_16x8::DcLeft;
1536 #endif
1537 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
1538   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
1539       DcDefs::_16x16::DcLeft;
1540 #endif
1541 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
1542   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
1543       DcDefs::_16x32::DcLeft;
1544 #endif
1545 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
1546   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
1547       DcDefs::_16x64::DcLeft;
1548 #endif
1549 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
1550   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
1551       DcDefs::_32x8::DcLeft;
1552 #endif
1553 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
1554   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
1555       DcDefs::_32x16::DcLeft;
1556 #endif
1557 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
1558   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
1559       DcDefs::_32x32::DcLeft;
1560 #endif
1561 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
1562   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
1563       DcDefs::_32x64::DcLeft;
1564 #endif
1565 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
1566   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
1567       DcDefs::_64x16::DcLeft;
1568 #endif
1569 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
1570   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
1571       DcDefs::_64x32::DcLeft;
1572 #endif
1573 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
1574   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
1575       DcDefs::_64x64::DcLeft;
1576 #endif
1577 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
1578   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
1579       DcDefs::_4x4::Dc;
1580 #endif
1581 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
1582   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
1583       DcDefs::_4x8::Dc;
1584 #endif
1585 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
1586   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
1587       DcDefs::_4x16::Dc;
1588 #endif
1589 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
1590   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
1591       DcDefs::_8x4::Dc;
1592 #endif
1593 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
1594   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
1595       DcDefs::_8x8::Dc;
1596 #endif
1597 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
1598   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
1599       DcDefs::_8x16::Dc;
1600 #endif
1601 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
1602   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
1603       DcDefs::_8x32::Dc;
1604 #endif
1605 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
1606   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
1607       DcDefs::_16x4::Dc;
1608 #endif
1609 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
1610   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
1611       DcDefs::_16x8::Dc;
1612 #endif
1613 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
1614   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
1615       DcDefs::_16x16::Dc;
1616 #endif
1617 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
1618   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
1619       DcDefs::_16x32::Dc;
1620 #endif
1621 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
1622   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
1623       DcDefs::_16x64::Dc;
1624 #endif
1625 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
1626   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
1627       DcDefs::_32x8::Dc;
1628 #endif
1629 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
1630   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
1631       DcDefs::_32x16::Dc;
1632 #endif
1633 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
1634   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
1635       DcDefs::_32x32::Dc;
1636 #endif
1637 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
1638   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
1639       DcDefs::_32x64::Dc;
1640 #endif
1641 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
1642   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
1643       DcDefs::_64x16::Dc;
1644 #endif
1645 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
1646   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
1647       DcDefs::_64x32::Dc;
1648 #endif
1649 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
1650   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
1651       DcDefs::_64x64::Dc;
1652 #endif
1653 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
1654   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
1655       Paeth4x4_SSE4_1;
1656 #endif
1657 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
1658   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
1659       Paeth4x8_SSE4_1;
1660 #endif
1661 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
1662   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
1663       Paeth4x16_SSE4_1;
1664 #endif
1665 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
1666   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
1667       Paeth8x4_SSE4_1;
1668 #endif
1669 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
1670   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
1671       Paeth8x8_SSE4_1;
1672 #endif
1673 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
1674   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
1675       Paeth8x16_SSE4_1;
1676 #endif
1677 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
1678   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
1679       Paeth8x32_SSE4_1;
1680 #endif
1681 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
1682   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
1683       Paeth16x4_SSE4_1;
1684 #endif
1685 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
1686   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
1687       Paeth16x8_SSE4_1;
1688 #endif
1689 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
1690   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
1691       Paeth16x16_SSE4_1;
1692 #endif
1693 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
1694   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
1695       Paeth16x32_SSE4_1;
1696 #endif
1697 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
1698   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
1699       Paeth16x64_SSE4_1;
1700 #endif
1701 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
1702   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
1703       Paeth32x8_SSE4_1;
1704 #endif
1705 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
1706   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
1707       Paeth32x16_SSE4_1;
1708 #endif
1709 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
1710   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
1711       Paeth32x32_SSE4_1;
1712 #endif
1713 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
1714   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
1715       Paeth32x64_SSE4_1;
1716 #endif
1717 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
1718   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
1719       Paeth64x16_SSE4_1;
1720 #endif
1721 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
1722   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
1723       Paeth64x32_SSE4_1;
1724 #endif
1725 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
1726   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
1727       Paeth64x64_SSE4_1;
1728 #endif
1729 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
1730   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
1731       DirDefs::_4x4::Horizontal;
1732 #endif
1733 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
1734   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
1735       DirDefs::_4x8::Horizontal;
1736 #endif
1737 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
1738   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
1739       DirDefs::_4x16::Horizontal;
1740 #endif
1741 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
1742   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
1743       DirDefs::_8x4::Horizontal;
1744 #endif
1745 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
1746   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
1747       DirDefs::_8x8::Horizontal;
1748 #endif
1749 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
1750   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
1751       DirDefs::_8x16::Horizontal;
1752 #endif
1753 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
1754   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
1755       DirDefs::_8x32::Horizontal;
1756 #endif
1757 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
1758   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
1759       DirDefs::_16x4::Horizontal;
1760 #endif
1761 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
1762   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
1763       DirDefs::_16x8::Horizontal;
1764 #endif
1765 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
1766   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
1767       DirDefs::_16x16::Horizontal;
1768 #endif
1769 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
1770   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
1771       DirDefs::_16x32::Horizontal;
1772 #endif
1773 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
1774   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
1775       DirDefs::_16x64::Horizontal;
1776 #endif
1777 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
1778   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
1779       DirDefs::_32x8::Horizontal;
1780 #endif
1781 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
1782   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
1783       DirDefs::_32x16::Horizontal;
1784 #endif
1785 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
1786   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
1787       DirDefs::_32x32::Horizontal;
1788 #endif
1789 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
1790   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
1791       DirDefs::_32x64::Horizontal;
1792 #endif
1793 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
1794   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
1795       DirDefs::_64x16::Horizontal;
1796 #endif
1797 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
1798   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
1799       DirDefs::_64x32::Horizontal;
1800 #endif
1801 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
1802   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
1803       DirDefs::_64x64::Horizontal;
1804 #endif
1805 }  // NOLINT(readability/fn_size)
1806 
1807 }  // namespace
1808 }  // namespace low_bitdepth
1809 
1810 //------------------------------------------------------------------------------
1811 #if LIBGAV1_MAX_BITDEPTH >= 10
1812 namespace high_bitdepth {
1813 namespace {
1814 
1815 template <int height>
DcStore4xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)1816 inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
1817                               const __m128i dc) {
1818   const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
1819   int y = height - 1;
1820   auto* dst = static_cast<uint8_t*>(dest);
1821   do {
1822     StoreLo8(dst, dc_dup);
1823     dst += stride;
1824   } while (--y != 0);
1825   StoreLo8(dst, dc_dup);
1826 }
1827 
1828 // WriteDuplicateN assumes dup has 4 32-bit "units," each of which comprises 2
1829 // identical shorts that need N total copies written into dest. The unpacking
1830 // works the same as in the 8bpp case, except that each 32-bit unit needs twice
1831 // as many copies.
// Writes a 4x4 block of 16-bit pixels: each of the four 32-bit units of
// |dup32| (a pair of identical shorts) supplies one row of four pixels.
inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  auto* dst = static_cast<uint8_t*>(dest);
  // Expand so each 64-bit lane holds one complete 8-byte row.
  const __m128i rows01 = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i rows23 = _mm_unpackhi_epi32(dup32, dup32);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), rows01);
  dst += stride;
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(rows01));
  dst += stride;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), rows23);
  dst += stride;
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(rows23));
}
1845 
// Writes an 8x4 block of 16-bit pixels: each 32-bit unit of |dup32| is
// replicated across one 16-byte (8-pixel) row.
inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
  // One full-width register per output row.
  const __m128i rows[4] = {_mm_unpacklo_epi64(dup64_lo, dup64_lo),
                           _mm_unpackhi_epi64(dup64_lo, dup64_lo),
                           _mm_unpacklo_epi64(dup64_hi, dup64_hi),
                           _mm_unpackhi_epi64(dup64_hi, dup64_hi)};
  auto* dst = static_cast<uint8_t*>(dest);
  for (const __m128i& row : rows) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), row);
    dst += stride;
  }
}
1864 
// Writes a 16x4 block of 16-bit pixels: each 32-bit unit of |dup32| is
// replicated across one 32-byte (16-pixel) row via two 16-byte stores.
inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
  // One register per output row; each row takes two stores.
  const __m128i rows[4] = {_mm_unpacklo_epi64(dup64_lo, dup64_lo),
                           _mm_unpackhi_epi64(dup64_lo, dup64_lo),
                           _mm_unpacklo_epi64(dup64_hi, dup64_hi),
                           _mm_unpackhi_epi64(dup64_hi, dup64_hi)};
  auto* dst = static_cast<uint8_t*>(dest);
  for (const __m128i& row : rows) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), row);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), row);
    dst += stride;
  }
}
1887 
// Writes a 32x4 block of 16-bit pixels: each 32-bit unit of |dup32| is
// replicated across one 64-byte (32-pixel) row.
inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
  // One register per output row; four 16-byte stores cover a row.
  const __m128i rows[4] = {_mm_unpacklo_epi64(dup64_lo, dup64_lo),
                           _mm_unpackhi_epi64(dup64_lo, dup64_lo),
                           _mm_unpacklo_epi64(dup64_hi, dup64_hi),
                           _mm_unpackhi_epi64(dup64_hi, dup64_hi)};
  auto* dst = static_cast<uint8_t*>(dest);
  for (const __m128i& row : rows) {
    for (int x = 0; x < 64; x += 16) {
      _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), row);
    }
    dst += stride;
  }
}
1918 
// Writes a 64x4 block of 16-bit pixels: each 32-bit unit of |dup32| is
// replicated across one 128-byte (64-pixel) row.
inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
  // One register per output row; eight 16-byte stores cover a row.
  const __m128i rows[4] = {_mm_unpacklo_epi64(dup64_lo, dup64_lo),
                           _mm_unpackhi_epi64(dup64_lo, dup64_lo),
                           _mm_unpacklo_epi64(dup64_hi, dup64_hi),
                           _mm_unpackhi_epi64(dup64_hi, dup64_hi)};
  auto* dst = static_cast<uint8_t*>(dest);
  for (const __m128i& row : rows) {
    for (int x = 0; x < 128; x += 16) {
      _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), row);
    }
    dst += stride;
  }
}
1945 
1946 // ColStoreN<height> copies each of the |height| values in |column| across its
1947 // corresponding row in dest.
1948 template <WriteDuplicateFunc writefn>
ColStore4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1949 inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1950                              ptrdiff_t stride,
1951                              const void* LIBGAV1_RESTRICT const column) {
1952   const __m128i col_data = LoadLo8(column);
1953   const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
1954   writefn(dest, stride, col_dup32);
1955 }
1956 
1957 template <WriteDuplicateFunc writefn>
ColStore8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1958 inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1959                              ptrdiff_t stride,
1960                              const void* LIBGAV1_RESTRICT const column) {
1961   const __m128i col_data = LoadUnaligned16(column);
1962   const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
1963   const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
1964   auto* dst = static_cast<uint8_t*>(dest);
1965   writefn(dst, stride, col_dup32_lo);
1966   const ptrdiff_t stride4 = stride << 2;
1967   dst += stride4;
1968   writefn(dst, stride, col_dup32_hi);
1969 }
1970 
1971 template <WriteDuplicateFunc writefn>
ColStore16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1972 inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1973                               ptrdiff_t stride,
1974                               const void* LIBGAV1_RESTRICT const column) {
1975   const ptrdiff_t stride4 = stride << 2;
1976   auto* dst = static_cast<uint8_t*>(dest);
1977   for (int y = 0; y < 32; y += 16) {
1978     const __m128i col_data =
1979         LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
1980     const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
1981     const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
1982     writefn(dst, stride, col_dup32_lo);
1983     dst += stride4;
1984     writefn(dst, stride, col_dup32_hi);
1985     dst += stride4;
1986   }
1987 }
1988 
1989 template <WriteDuplicateFunc writefn>
ColStore32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1990 inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1991                               ptrdiff_t stride,
1992                               const void* LIBGAV1_RESTRICT const column) {
1993   const ptrdiff_t stride4 = stride << 2;
1994   auto* dst = static_cast<uint8_t*>(dest);
1995   for (int y = 0; y < 64; y += 16) {
1996     const __m128i col_data =
1997         LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
1998     const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
1999     const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
2000     writefn(dst, stride, col_dup32_lo);
2001     dst += stride4;
2002     writefn(dst, stride, col_dup32_hi);
2003     dst += stride4;
2004   }
2005 }
2006 
2007 template <WriteDuplicateFunc writefn>
ColStore64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)2008 inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
2009                               ptrdiff_t stride,
2010                               const void* LIBGAV1_RESTRICT const column) {
2011   const ptrdiff_t stride4 = stride << 2;
2012   auto* dst = static_cast<uint8_t*>(dest);
2013   for (int y = 0; y < 128; y += 16) {
2014     const __m128i col_data =
2015         LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
2016     const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
2017     const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
2018     writefn(dst, stride, col_dup32_lo);
2019     dst += stride4;
2020     writefn(dst, stride, col_dup32_hi);
2021     dst += stride4;
2022   }
2023 }
2024 
// Sums the 4 packed int16 values in the 8 bytes at |ref|; the total lands in
// the low 32-bit lane of the returned register.
inline __m128i DcSum4_SSE4_1(const void* ref) {
  const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
  // madd against 1s produces pairwise sums: lane0 = a0+a1, lane1 = a2+a3
  // (upper lanes are zero thanks to the 64-bit load).
  const __m128i pair_sums = _mm_madd_epi16(vals, _mm_set1_epi16(1));
  // Fold lane1 down onto lane0 to finish the reduction.
  return _mm_add_epi32(pair_sums, _mm_srli_si128(pair_sums, 4));
}
2037 
// Compile-time bundle of the 10bpp DC predictor entry points (only the 4x4
// specialization is provided here).
struct DcDefs {
  DcDefs() = delete;

  // NOTE(review): the leading "2, 2" arguments are presumably
  // log2(width)/log2(height), followed by the top/left row summers, the
  // store helper, and two shift parameters -- confirm against the
  // DcPredFuncs_SSE4_1 declaration, which is not visible in this file chunk.
  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
                                  DcStore4xH_SSE4_1<4>, 0, 0>;
};
2044 
// Compile-time bundle of the 10bpp directional (Horizontal) predictors: each
// WxH alias pairs the ColStore helper for H rows with the WriteDuplicate
// helper for W-wide rows.
struct DirDefs {
  DirDefs() = delete;

  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
  using _4x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
  using _8x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
  using _8x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
  using _16x4 =
      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
  using _16x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
  using _16x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
  using _16x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
  using _16x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
  using _32x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
  using _32x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
  using _32x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
  using _32x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
  using _64x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
  using _64x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
  using _64x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
};
2083 
// Registers the 10bpp SSE4.1 intra predictors in the writable dsp table.
// Every entry is individually gated by DSP_ENABLED_10BPP_SSE4_1 so builds can
// exclude specializations at compile time.
void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
  assert(dsp != nullptr);
  static_cast<void>(dsp);  // Quiets "unused" warnings if all gates are off.
  // DC family (4x4 only at 10bpp in this file).
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
      DcDefs::_4x4::DcTop;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
      DcDefs::_4x4::DcLeft;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
      DcDefs::_4x4::Dc;
#endif
  // Horizontal predictors for every supported transform size.
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
      DirDefs::_4x4::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
      DirDefs::_4x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
      DirDefs::_4x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
      DirDefs::_8x4::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
      DirDefs::_8x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
      DirDefs::_8x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
      DirDefs::_8x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
      DirDefs::_16x4::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
      DirDefs::_16x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
      DirDefs::_16x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
      DirDefs::_16x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
      DirDefs::_16x64::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
      DirDefs::_32x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
      DirDefs::_32x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
      DirDefs::_32x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
      DirDefs::_32x64::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
      DirDefs::_64x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
      DirDefs::_64x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
      DirDefs::_64x64::Horizontal;
#endif
}
2177 
2178 }  // namespace
2179 }  // namespace high_bitdepth
2180 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
2181 
// Public entry point for the SSE4.1 build: installs the 8bpp predictors and,
// when the build supports >= 10-bit, the 10bpp predictors as well.
void IntraPredInit_SSE4_1() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif
}
2188 
2189 }  // namespace dsp
2190 }  // namespace libgav1
2191 
2192 #else   // !LIBGAV1_TARGETING_SSE4_1
2193 namespace libgav1 {
2194 namespace dsp {
2195 
IntraPredInit_SSE4_1()2196 void IntraPredInit_SSE4_1() {}
2197 
2198 }  // namespace dsp
2199 }  // namespace libgav1
2200 #endif  // LIBGAV1_TARGETING_SSE4_1
2201