// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intrapred.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1

#include <smmintrin.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"

namespace libgav1 {
namespace dsp {
namespace {

//------------------------------------------------------------------------------
// Utility Functions

// This is a fast way to divide by a number of the form 2^n + 2^k, n > k.
// Divide by 2^k by right shifting by k, leaving the denominator 2^(n - k) + 1.
// In the block size cases, n - k is 1 or 2 (the block has a 1:2 or 1:4 aspect
// ratio), so we use a multiplier that reflects division by 2 + 1 = 3 or
// 4 + 1 = 5 in its high bits.
constexpr int kThreeInverse = 0x5556;
constexpr int kFiveInverse = 0x3334;
template <int shiftk, int multiplier>
inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
  const __m128i interm = _mm_srli_epi32(dividend, shiftk);
  return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
}
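
// For illustration (an editor's sketch, not part of the upstream file): a 4x8
// block averages 12 = 2^3 + 2^2 pixels, so with shiftk = 2 the problem
// becomes (sum >> 2) / 3, and multiplying by kThreeInverse = 0x5556
// (= ceil(2^16 / 3)) and keeping the high 16 bits performs the division by 3.
// The shifted sum always fits in 16 bits, which is why _mm_mulhi_epi16
// suffices. The bias of 6 below is the rounder Dc() uses for a 4x8 block.
static_assert(((((4 + 8) * 255 + 6) >> 2) * kThreeInverse) >> 16 ==
                  ((4 + 8) * 255 + 6) / 12,
              "kThreeInverse divide-by-12 example");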

//------------------------------------------------------------------------------
// DcPredFuncs_SSE4_1

using DcSumFunc = __m128i (*)(const void* ref);
using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const __m128i dc);
using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
                                    const __m128i column);
// For copying an entire column across a block.
using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
                                 const void* column);

// DC intra-predictors for both square and non-square blocks; |shiftk| and
// |dc_mult| are only used when the block is non-square.
template <int width_log2, int height_log2, DcSumFunc top_sumfn,
          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
struct DcPredFuncs_SSE4_1 {
  DcPredFuncs_SSE4_1() = delete;

  static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
                    const void* left_column);
  static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
                     const void* left_column);
  static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
                 const void* left_column);
};

// Directional intra-predictors. These are templated only on the column store
// function, so the same struct serves every block size.
template <ColumnStoreFunc col_storefn>
struct DirectionalPredFuncs_SSE4_1 {
  DirectionalPredFuncs_SSE4_1() = delete;

  static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
                       const void* left_column);
  static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
                         const void* left_column);
};

template <int width_log2, int height_log2, DcSumFunc top_sumfn,
          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
void DcPredFuncs_SSE4_1<
    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
    dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                    const void* LIBGAV1_RESTRICT const top_row,
                    const void* /*left_column*/) {
  const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
  const __m128i sum = top_sumfn(top_row);
  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
  storefn(dest, stride, dc);
}

template <int width_log2, int height_log2, DcSumFunc top_sumfn,
          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
void DcPredFuncs_SSE4_1<
    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
    dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                     const void* /*top_row*/,
                     const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
  const __m128i sum = left_sumfn(left_column);
  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
  storefn(dest, stride, dc);
}

template <int width_log2, int height_log2, DcSumFunc top_sumfn,
          DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
void DcPredFuncs_SSE4_1<
    width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
    dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                 const void* LIBGAV1_RESTRICT const top_row,
                 const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i rounder =
      _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
  const __m128i sum_top = top_sumfn(top_row);
  const __m128i sum_left = left_sumfn(left_column);
  const __m128i sum = _mm_add_epi32(sum_top, sum_left);
  if (width_log2 == height_log2) {
    const __m128i dc =
        _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
    storefn(dest, stride, dc);
  } else {
    const __m128i dc =
        DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
    storefn(dest, stride, dc);
  }
}

//------------------------------------------------------------------------------
// DirectionalPredFuncs_SSE4_1

template <ColumnStoreFunc col_storefn>
void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
    void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
    const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
  col_storefn(dest, stride, left_column);
}

}  // namespace

//------------------------------------------------------------------------------
namespace low_bitdepth {
namespace {

// |ref| points to 4 bytes containing 4 packed pixel values.
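// _mm_sad_epu8 against zero computes sum(|b_i - 0|) = sum(b_i) over each
// 64-bit half, so a single instruction yields the horizontal byte sum. The
// wider DcSum functions below fold their two halves with a shift-and-add.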
inline __m128i DcSum4_SSE4_1(const void* const ref) {
  const __m128i vals = Load4(ref);
  const __m128i zero = _mm_setzero_si128();
  return _mm_sad_epu8(vals, zero);
}

inline __m128i DcSum8_SSE4_1(const void* const ref) {
  const __m128i vals = LoadLo8(ref);
  const __m128i zero = _mm_setzero_si128();
  return _mm_sad_epu8(vals, zero);
}

inline __m128i DcSum16_SSE4_1(const void* const ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i vals = LoadUnaligned16(ref);
  const __m128i partial_sum = _mm_sad_epu8(vals, zero);
  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
}

inline __m128i DcSum32_SSE4_1(const void* const ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i vals1 = LoadUnaligned16(ref);
  const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
  const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
}

inline __m128i DcSum64_SSE4_1(const void* const ref) {
  const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
  const __m128i zero = _mm_setzero_si128();
  const __m128i vals1 = LoadUnaligned16(ref_ptr);
  const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
  const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
  const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
  const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
  const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
  __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
  const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
  partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
  const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
  partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
  return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
}

template <int height>
inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
                              const __m128i dc) {
  const __m128i zero = _mm_setzero_si128();
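  // Shuffling with an all-zero mask broadcasts the low byte of |dc| to every
  // lane; the other DcStore helpers below use the same trick.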
  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
  int y = height - 1;
  auto* dst = static_cast<uint8_t*>(dest);
  do {
    Store4(dst, dc_dup);
    dst += stride;
  } while (--y != 0);
  Store4(dst, dc_dup);
}

template <int height>
inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
                              const __m128i dc) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
  int y = height - 1;
  auto* dst = static_cast<uint8_t*>(dest);
  do {
    StoreLo8(dst, dc_dup);
    dst += stride;
  } while (--y != 0);
  StoreLo8(dst, dc_dup);
}

template <int height>
inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
                               const __m128i dc) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
  int y = height - 1;
  auto* dst = static_cast<uint8_t*>(dest);
  do {
    StoreUnaligned16(dst, dc_dup);
    dst += stride;
  } while (--y != 0);
  StoreUnaligned16(dst, dc_dup);
}

template <int height>
inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
                               const __m128i dc) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
  int y = height - 1;
  auto* dst = static_cast<uint8_t*>(dest);
  do {
    StoreUnaligned16(dst, dc_dup);
    StoreUnaligned16(dst + 16, dc_dup);
    dst += stride;
  } while (--y != 0);
  StoreUnaligned16(dst, dc_dup);
  StoreUnaligned16(dst + 16, dc_dup);
}

template <int height>
inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
                               const __m128i dc) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
  int y = height - 1;
  auto* dst = static_cast<uint8_t*>(dest);
  do {
    StoreUnaligned16(dst, dc_dup);
    StoreUnaligned16(dst + 16, dc_dup);
    StoreUnaligned16(dst + 32, dc_dup);
    StoreUnaligned16(dst + 48, dc_dup);
    dst += stride;
  } while (--y != 0);
  StoreUnaligned16(dst, dc_dup);
  StoreUnaligned16(dst + 16, dc_dup);
  StoreUnaligned16(dst + 32, dc_dup);
  StoreUnaligned16(dst + 48, dc_dup);
}

// WriteDuplicateN assumes |dup| has 4 sets of 4 identical bytes, one set per
// row, that are to be replicated across width N of |dest|.
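// For example (editor's sketch), ColStore4_SSE4_1 below produces
// dup32 = [c0 c0 c0 c0 | c1 c1 c1 c1 | c2 c2 c2 c2 | c3 c3 c3 c3], one 4-byte
// group per destination row.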
inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  auto* dst = static_cast<uint8_t*>(dest);
  Store4(dst, dup32);
  dst += stride;
  const int row1 = _mm_extract_epi32(dup32, 1);
  memcpy(dst, &row1, 4);
  dst += stride;
  const int row2 = _mm_extract_epi32(dup32, 2);
  memcpy(dst, &row2, 4);
  dst += stride;
  const int row3 = _mm_extract_epi32(dup32, 3);
  memcpy(dst, &row3, 4);
}

inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
  auto* dst = static_cast<uint8_t*>(dest);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
  dst += stride;
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
  dst += stride;
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
  dst += stride;
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
}

inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  dst += stride;
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  dst += stride;
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  dst += stride;
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
}

inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
  dst += stride;
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
  dst += stride;
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
  dst += stride;
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
}

inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
  dst += stride;
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
  dst += stride;
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
  dst += stride;
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
}

// ColStoreN<height> copies each of the |height| values in |column| across its
// corresponding row in |dest|.
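// Two rounds of unpacking a register against itself widen each byte into a
// 4-byte group:
//   [c0 c1 c2 c3 ...] -> [c0 c0 c1 c1 ...] -> [c0 c0 c0 c0 c1 c1 c1 c1 ...]
// which is exactly the layout WriteDuplicateN expects.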
template <WriteDuplicateFunc writefn>
inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                             ptrdiff_t stride,
                             const void* LIBGAV1_RESTRICT const column) {
  const __m128i col_data = Load4(column);
  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
  const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
  writefn(dest, stride, col_dup32);
}

template <WriteDuplicateFunc writefn>
inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                             ptrdiff_t stride,
                             const void* LIBGAV1_RESTRICT const column) {
  const ptrdiff_t stride4 = stride << 2;
  const __m128i col_data = LoadLo8(column);
  const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
  auto* dst = static_cast<uint8_t*>(dest);
  writefn(dst, stride, col_dup32_lo);
  dst += stride4;
  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
  writefn(dst, stride, col_dup32_hi);
}

template <WriteDuplicateFunc writefn>
inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const column) {
  const ptrdiff_t stride4 = stride << 2;
  const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
  const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
  const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
  const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
  auto* dst = static_cast<uint8_t*>(dest);
  writefn(dst, stride, col_dup32_lolo);
  dst += stride4;
  const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
  writefn(dst, stride, col_dup32_lohi);
  dst += stride4;
  const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
  writefn(dst, stride, col_dup32_hilo);
  dst += stride4;
  const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
  writefn(dst, stride, col_dup32_hihi);
}

template <WriteDuplicateFunc writefn>
inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const column) {
  const ptrdiff_t stride4 = stride << 2;
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 32; y += 16) {
    const __m128i col_data =
        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
    const __m128i col_dup32_lolo =
        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
    writefn(dst, stride, col_dup32_lolo);
    dst += stride4;
    const __m128i col_dup32_lohi =
        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
    writefn(dst, stride, col_dup32_lohi);
    dst += stride4;
    const __m128i col_dup32_hilo =
        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
    writefn(dst, stride, col_dup32_hilo);
    dst += stride4;
    const __m128i col_dup32_hihi =
        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
    writefn(dst, stride, col_dup32_hihi);
    dst += stride4;
  }
}

template <WriteDuplicateFunc writefn>
inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const column) {
  const ptrdiff_t stride4 = stride << 2;
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 64; y += 16) {
    const __m128i col_data =
        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
    const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
    const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
    const __m128i col_dup32_lolo =
        _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
    writefn(dst, stride, col_dup32_lolo);
    dst += stride4;
    const __m128i col_dup32_lohi =
        _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
    writefn(dst, stride, col_dup32_lohi);
    dst += stride4;
    const __m128i col_dup32_hilo =
        _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
    writefn(dst, stride, col_dup32_hilo);
    dst += stride4;
    const __m128i col_dup32_hihi =
        _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
    writefn(dst, stride, col_dup32_hihi);
    dst += stride4;
  }
}

struct DcDefs {
  DcDefs() = delete;

  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
                                  DcStore4xH_SSE4_1<4>, 0, 0>;
  // shiftk is the smaller of width_log2 and height_log2.
  // dc_mult corresponds to the ratio of the smaller block size to the larger.
  using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
                                  DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
  using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
                                   DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;

  using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
                                  DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
  using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
                                  DcStore8xH_SSE4_1<8>, 0, 0>;
  using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
                                   DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
  using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
                                   DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;

  using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
                                   DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
  using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
                                   DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
  using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
                                    DcStore16xH_SSE4_1<16>, 0, 0>;
  using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
                                    DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
  using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
                                    DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;

  using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
                                   DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
  using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
                                    DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
  using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
                                    DcStore32xH_SSE4_1<32>, 0, 0>;
  using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
                                    DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;

  using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
                                    DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
  using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
                                    DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
  using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
                                    DcStore64xH_SSE4_1<64>, 0, 0>;
};

struct DirDefs {
  DirDefs() = delete;

  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
  using _4x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
  using _8x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
  using _8x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
  using _16x4 =
      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
  using _16x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
  using _16x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
  using _16x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
  using _16x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
  using _32x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
  using _32x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
  using _32x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
  using _32x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
  using _64x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
  using _64x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
  using _64x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
};

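// In WritePaethLine4, |y_mask| is an _mm_shuffle_epi32 immediate (0, 0x55,
// 0xAA, or 0xFF) that broadcasts 32-bit lane y of |left| and |top_dists| to
// all four lanes.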
template <int y_mask>
inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                            const __m128i& left, const __m128i& top_lefts,
                            const __m128i& top_dists, const __m128i& left_dists,
                            const __m128i& top_left_diffs) {
  const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);

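  // |top_left_diffs| holds top[x] - 2 * top_left, so adding left[y] yields
  // base - top_left; its absolute value is the top-left distance.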
  const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
  const __m128i top_left_dists =
      _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));

  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
  // operation is unavailable, so the logic for selecting top, left, or
  // top_left is inverted.
  __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
  not_select_left =
      _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
  const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);

  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);

  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);

  // The sequence of 32-bit packed operations was found (see CL via blame) to
  // outperform 16-bit operations, despite the availability of the packus
  // function, when tested on a Xeon E7 v3.
  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
  const __m128i pred = _mm_shuffle_epi8(
      _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
  Store4(dst, pred);
}
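
// For reference, a scalar sketch of the selection rule implemented above
// (editor's note; cf. Section 7.11.2.2):
//   base = top + left - top_left;
//   p_left = abs(base - left);
//   p_top = abs(base - top);
//   p_top_left = abs(base - top_left);
//   pred = (p_left <= p_top && p_left <= p_top_left) ? left
//        : (p_top <= p_top_left) ? top
//                                : top_left;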

// top_left_diffs is the only variable whose ints may exceed 8 bits. Otherwise
// we would be able to do all of these operations as epi8 for a 16-pixel
// version of this function. Still, since lefts_y is just a vector of
// duplicates, it could pay off to accommodate top_left_dists for cmpgt, and
// repack into epi8 for the blends.
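// Here |y_mask| is a 4-byte _mm_shuffle_epi8 pattern (e.g. 0x03020302) that,
// once replicated by _mm_set1_epi32, broadcasts 16-bit lane y of |left| and
// |top_dists|.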
template <int y_mask>
inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                            const __m128i& left, const __m128i& top_lefts,
                            const __m128i& top_dists, const __m128i& left_dists,
                            const __m128i& top_left_diffs) {
  const __m128i select_y = _mm_set1_epi32(y_mask);
  const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);

  const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
  const __m128i top_left_dists =
      _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));

  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
  // operation is unavailable, so the logic for selecting top, left, or
  // top_left is inverted.
  __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
  not_select_left =
      _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
  const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);

  const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);

  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
  top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);

  const __m128i pred = _mm_packus_epi16(
      _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
}

// |top| is an epi8 of length 16
// |left| is epi8 of unknown length, as y_mask specifies access
// |top_lefts| is an epi8 of 16 duplicates
// |top_dists| is an epi8 of unknown length, as y_mask specifies access
// |left_dists| is an epi8 of length 16
// |left_dists_lo| is an epi16 of length 8
// |left_dists_hi| is an epi16 of length 8
// |top_left_diffs_lo| is an epi16 of length 8
// |top_left_diffs_hi| is an epi16 of length 8
// The latter two vectors are epi16 because their values may reach -510.
// |left_dists| is provided alongside its spread out version because it doesn't
// change between calls and interacts with both kinds of packing.
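// |y_mask| is a byte-broadcast _mm_shuffle_epi8 pattern (e.g. 0x02020202)
// selecting byte y of |left| and |top_dists|.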
template <int y_mask>
inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
                             const __m128i& left, const __m128i& top_lefts,
                             const __m128i& top_dists,
                             const __m128i& left_dists,
                             const __m128i& left_dists_lo,
                             const __m128i& left_dists_hi,
                             const __m128i& top_left_diffs_lo,
                             const __m128i& top_left_diffs_hi) {
  const __m128i select_y = _mm_set1_epi32(y_mask);
  const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
  const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
  const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
  const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);

  const __m128i top_left_dists_lo =
      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
  const __m128i top_left_dists_hi =
      _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));

  const __m128i left_gt_top_left_lo = _mm_packs_epi16(
      _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
  const __m128i left_gt_top_left_hi =
      _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
                      /* unused second arg for pack */ left_dists_hi);
  const __m128i left_gt_top_left = _mm_alignr_epi8(
      left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);

  const __m128i not_select_top_lo =
      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
                      /* unused second arg for pack */ top_dists_y16);
  const __m128i not_select_top_hi =
      _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
                      /* unused second arg for pack */ top_dists_y16);
  const __m128i not_select_top = _mm_alignr_epi8(
      not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);

  const __m128i left_leq_top =
      _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
  const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);

  // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
  // operation is unavailable, so the logic for selecting top, left, or
  // top_left is inverted.
  const __m128i left_out = _mm_and_si128(select_left, lefts_y8);

  const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
  __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
  top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
  top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
  const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);

  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
}

void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                     const void* LIBGAV1_RESTRICT const top_row,
                     const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));

  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);

  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
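  // (Substituting base and cancelling left[y], respectively top[x], gives the
  // right-hand sides.)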
  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
  const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));

  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
                     top_left_diff);
  dst += stride;
  WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
                        top_left_diff);
}

void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                     const void* LIBGAV1_RESTRICT const top_row,
                     const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = LoadLo8(left_column);
  const __m128i left_lo = _mm_cvtepu8_epi32(left);
  const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));

  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);

  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
  const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
  const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));

  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
                     top_left_diff);
  dst += stride;
  WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
                     top_left_diff);
  dst += stride;
  WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
                        top_left_diff);
}

void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT const top_row,
                      const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = LoadUnaligned16(left_column);
  const __m128i left_0 = _mm_cvtepu8_epi32(left);
  const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
  const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
  const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));

  const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);

  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
  const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
  const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
  const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
  const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
  const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));

  const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
  const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);

  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
                     top_left_diff);
  dst += stride;
  WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
                     top_left_diff);
  dst += stride;
  WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
                     top_left_diff);
  dst += stride;
  WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
                     top_left_diff);
  dst += stride;
  WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
                        top_left_diff);
  dst += stride;
  WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
                        top_left_diff);
}

void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                     const void* LIBGAV1_RESTRICT const top_row,
                     const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);

  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
  const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
  const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));

  const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
  const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
}

void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                     const void* LIBGAV1_RESTRICT const top_row,
                     const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);

  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
  const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
  const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));

  const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
  const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
  dst += stride;
  WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
                              top_left_diff);
}

void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT const top_row,
                      const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = LoadUnaligned16(left_column);
  const __m128i left_lo = _mm_cvtepu8_epi16(left);
  const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
  const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);

  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
  const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
  const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
  const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));

  const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
  const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
  dst += stride;
  WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
                              left_dists, top_left_diff);
}

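// Larger Paeth blocks are stitched from 16-row pieces: the second call below
// starts 16 rows down in |dest| (stride << 4) and 16 pixels into the left
// column.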
void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT const top_row,
                      const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  auto* const dst = static_cast<uint8_t*>(dest);
  Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
  Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
}

void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT const top_row,
                      const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = Load4(left_column);
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));

  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
  const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));

  // Given that the spec defines "base" as top[x] + left[y] - top[-1],
  // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
  // pTop = abs(base - top[x]) = abs(left[y] - top[-1])

  const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
                                          _mm_subs_epu8(top_lefts8, top));
  const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
  const __m128i left_dists_hi =
      _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
  const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
                                         _mm_subs_epu8(top_lefts8, left));

  const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
  const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
  const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
                      left_dists_lo, left_dists_hi, top_left_diff_lo,
                      top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
}

// Inlined for calling with offsets in larger transform sizes, mainly to
// preserve top_left.
inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                           const uint8_t top_left, const __m128i top,
                           const __m128i left) {
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));

  const __m128i top_lefts16 = _mm_set1_epi16(top_left);
  const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));

  // Given that the spec defines "base" as top[x] + left[y] - top_left,
  // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
  // pTop = abs(base - top[x]) = abs(left[y] - top_left)

  const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
                                          _mm_subs_epu8(top_lefts8, top));
  const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
  const __m128i left_dists_hi =
      _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
  const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
                                         _mm_subs_epu8(top_lefts8, left));

  const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
  const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
  const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
                      left_dists_lo, left_dists_hi, top_left_diff_lo,
                      top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
}

void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT const top_row,
                      const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i top = LoadUnaligned16(top_row);
  const __m128i left = LoadLo8(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
}

void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
                     const __m128i top, const __m128i left) {
  const __m128i top_lo = _mm_cvtepu8_epi16(top);
  const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));

  const __m128i top_lefts16 = _mm_set1_epi16(top_left);
  const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));

  // Given that the spec defines "base" as top[x] + left[y] - top_left,
  // pLeft = abs(base - left[y]) = abs(top[x] - top_left)
  // pTop = abs(base - top[x]) = abs(left[y] - top_left)

  const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
                                          _mm_subs_epu8(top_lefts8, top));
  const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
  const __m128i left_dists_hi =
      _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
  const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
                                         _mm_subs_epu8(top_lefts8, left));

  const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
  const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
  const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
                      left_dists_lo, left_dists_hi, top_left_diff_lo,
                      top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
  dst += stride;
  WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
                               left_dists, left_dists_lo, left_dists_hi,
                               top_left_diff_lo, top_left_diff_hi);
}
1225
Paeth16x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1226 void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1227 const void* LIBGAV1_RESTRICT const top_row,
1228 const void* LIBGAV1_RESTRICT const left_column) {
1229 const __m128i left = LoadUnaligned16(left_column);
1230 const __m128i top = LoadUnaligned16(top_row);
1231 const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1232 WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
1233 }
1234
Paeth16x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1235 void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1236 const void* LIBGAV1_RESTRICT const top_row,
1237 const void* LIBGAV1_RESTRICT const left_column) {
1238 const __m128i left_0 = LoadUnaligned16(left_column);
1239 const __m128i top = LoadUnaligned16(top_row);
1240 const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1241 const uint8_t top_left = top_ptr[-1];
1242 auto* const dst = static_cast<uint8_t*>(dest);
1243 WritePaeth16x16(dst, stride, top_left, top, left_0);
1244 const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1245 const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1246 WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
1247 }
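
// Note: in the multi-block Paeth variants above and below, (stride << 4)
// advances |dst| by 16 rows, i.e. to the start of the next 16x16 sub-block.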

void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT const top_row,
                       const void* LIBGAV1_RESTRICT const left_column) {
  const ptrdiff_t stride16 = stride << 4;
  const __m128i left_0 = LoadUnaligned16(left_column);
  const __m128i top = LoadUnaligned16(top_row);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const uint8_t top_left = top_ptr[-1];
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaeth16x16(dst, stride, top_left, top, left_0);
  dst += stride16;
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
  WritePaeth16x16(dst, stride, top_left, top, left_1);
  dst += stride16;
  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
  WritePaeth16x16(dst, stride, top_left, top, left_2);
  dst += stride16;
  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
  WritePaeth16x16(dst, stride, top_left, top, left_3);
}

void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                      const void* LIBGAV1_RESTRICT const top_row,
                      const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = LoadLo8(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_0 = LoadUnaligned16(top_row);
  const uint8_t top_left = top_ptr[-1];
  auto* const dst = static_cast<uint8_t*>(dest);
  WritePaeth16x8(dst, stride, top_left, top_0, left);
  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
  WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
}

void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT const top_row,
                       const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = LoadUnaligned16(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_0 = LoadUnaligned16(top_row);
  const uint8_t top_left = top_ptr[-1];
  auto* const dst = static_cast<uint8_t*>(dest);
  WritePaeth16x16(dst, stride, top_left, top_0, left);
  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
}

void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT const top_row,
                       const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left_0 = LoadUnaligned16(left_ptr);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_0 = LoadUnaligned16(top_ptr);
  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
  const uint8_t top_left = top_ptr[-1];
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
}

void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT const top_row,
                       const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left_0 = LoadUnaligned16(left_ptr);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_0 = LoadUnaligned16(top_ptr);
  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
  const uint8_t top_left = top_ptr[-1];
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_2);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_3);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
}

void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT const top_row,
                       const void* LIBGAV1_RESTRICT const left_column) {
  const __m128i left = LoadUnaligned16(left_column);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_0 = LoadUnaligned16(top_ptr);
  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
  const uint8_t top_left = top_ptr[-1];
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaeth16x16(dst, stride, top_left, top_0, left);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
  WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
  WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
}

void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT const top_row,
                       const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left_0 = LoadUnaligned16(left_ptr);
  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_0 = LoadUnaligned16(top_ptr);
  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
  const uint8_t top_left = top_ptr[-1];
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
}

void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
                       const void* LIBGAV1_RESTRICT const top_row,
                       const void* LIBGAV1_RESTRICT const left_column) {
  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
  const __m128i left_0 = LoadUnaligned16(left_ptr);
  const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
  const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
  const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
  const __m128i top_0 = LoadUnaligned16(top_ptr);
  const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
  const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
  const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
  const uint8_t top_left = top_ptr[-1];
  auto* dst = static_cast<uint8_t*>(dest);
  WritePaeth16x16(dst, stride, top_left, top_0, left_0);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_1);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_2);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
  dst += (stride << 4);
  WritePaeth16x16(dst, stride, top_left, top_0, left_3);
  WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
  WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
  WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  static_cast<void>(dsp);
  // These guards check that this version of the function has not been
  // superseded by a higher optimization level, such as AVX. The corresponding
  // #define also prevents the C version from being added to the table.
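  //
  // A minimal sketch of how a populated entry is dispatched at runtime,
  // assuming the GetDspTable() accessor from dsp.h; this call site is
  // illustrative only, not code from this file:
  //   const Dsp* const table = GetDspTable(kBitdepth8);
  //   table->intra_predictors[kTransformSize4x4][kIntraPredictorDc](
  //       dest, stride, top_row, left_column);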
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
      DcDefs::_4x4::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
      DcDefs::_4x8::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
      DcDefs::_4x16::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
      DcDefs::_8x4::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
      DcDefs::_8x8::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
      DcDefs::_8x16::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
      DcDefs::_8x32::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
      DcDefs::_16x4::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
      DcDefs::_16x8::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
      DcDefs::_16x16::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
      DcDefs::_16x32::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
      DcDefs::_16x64::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
      DcDefs::_32x8::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
      DcDefs::_32x16::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
      DcDefs::_32x32::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
      DcDefs::_32x64::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
      DcDefs::_64x16::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
      DcDefs::_64x32::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
      DcDefs::_64x64::DcTop;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
      DcDefs::_4x4::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
      DcDefs::_4x8::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
      DcDefs::_4x16::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
      DcDefs::_8x4::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
      DcDefs::_8x8::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
      DcDefs::_8x16::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
      DcDefs::_8x32::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
      DcDefs::_16x4::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
      DcDefs::_16x8::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
      DcDefs::_16x16::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
      DcDefs::_16x32::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
      DcDefs::_16x64::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
      DcDefs::_32x8::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
      DcDefs::_32x16::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
      DcDefs::_32x32::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
      DcDefs::_32x64::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
      DcDefs::_64x16::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
      DcDefs::_64x32::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
      DcDefs::_64x64::DcLeft;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
      DcDefs::_4x4::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
      DcDefs::_4x8::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
      DcDefs::_4x16::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
      DcDefs::_8x4::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
      DcDefs::_8x8::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
      DcDefs::_8x16::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
      DcDefs::_8x32::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
      DcDefs::_16x4::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
      DcDefs::_16x8::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
      DcDefs::_16x16::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
      DcDefs::_16x32::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
      DcDefs::_16x64::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
      DcDefs::_32x8::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
      DcDefs::_32x16::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
      DcDefs::_32x32::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
      DcDefs::_32x64::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
      DcDefs::_64x16::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
      DcDefs::_64x32::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
      DcDefs::_64x64::Dc;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
      Paeth4x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
      Paeth4x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
      Paeth4x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
      Paeth8x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
      Paeth8x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
      Paeth8x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
      Paeth8x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
      Paeth16x4_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
      Paeth16x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
      Paeth16x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
      Paeth16x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
      Paeth16x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
      Paeth32x8_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
      Paeth32x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
      Paeth32x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
      Paeth32x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
      Paeth64x16_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
      Paeth64x32_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
      Paeth64x64_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
      DirDefs::_4x4::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
      DirDefs::_4x8::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
      DirDefs::_4x16::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
      DirDefs::_8x4::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
      DirDefs::_8x8::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
      DirDefs::_8x16::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
      DirDefs::_8x32::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
      DirDefs::_16x4::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
      DirDefs::_16x8::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
      DirDefs::_16x16::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
      DirDefs::_16x32::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
      DirDefs::_16x64::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
      DirDefs::_32x8::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
      DirDefs::_32x16::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
      DirDefs::_32x32::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
      DirDefs::_32x64::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
      DirDefs::_64x16::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
      DirDefs::_64x32::Horizontal;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
      DirDefs::_64x64::Horizontal;
#endif
}  // NOLINT(readability/fn_size)

}  // namespace
}  // namespace low_bitdepth

//------------------------------------------------------------------------------
#if LIBGAV1_MAX_BITDEPTH >= 10
namespace high_bitdepth {
namespace {

template <int height>
inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
                              const __m128i dc) {
  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
  int y = height - 1;
  auto* dst = static_cast<uint8_t*>(dest);
  do {
    StoreLo8(dst, dc_dup);
    dst += stride;
  } while (--y != 0);
  StoreLo8(dst, dc_dup);
}

// WriteDuplicateN assumes dup has 4 32-bit "units," each of which comprises 2
// identical shorts that need N total copies written into dest. The unpacking
// works the same as in the 8bpp case, except that each 32-bit unit needs twice
// as many copies.
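// For example (16-bit lanes, low lane first), if dup32 = [a a b b c c d d]:
//   _mm_unpacklo_epi32(dup32, dup32) -> [a a a a b b b b]
//   _mm_unpackhi_epi32(dup32, dup32) -> [c c c c d d d d]
// and a further epi64 unpack of either result against itself yields a
// register of 8 identical 16-bit values, i.e. one full 8-pixel row segment.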
inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  auto* dst = static_cast<uint8_t*>(dest);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
  dst += stride;
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
  dst += stride;
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
  dst += stride;
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
}

inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  dst += stride;
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  dst += stride;
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  dst += stride;
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
}

inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
  dst += stride;
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
  dst += stride;
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
  dst += stride;
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
}

inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
  dst += stride;
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
  dst += stride;
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
  dst += stride;
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
}

inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  for (int x = 0; x < 128; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
  }
  dst += stride;
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  for (int x = 0; x < 128; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
  }
  dst += stride;
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  for (int x = 0; x < 128; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
  }
  dst += stride;
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  for (int x = 0; x < 128; x += 16) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
  }
}

// ColStoreN<height> copies each of the |height| values in |column| across its
// corresponding row in dest.
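// For example, with height 4 and |column| holding {c0, c1, c2, c3}, row 0 of
// dest is filled with c0, row 1 with c1, and so on.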
template <WriteDuplicateFunc writefn>
inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                             ptrdiff_t stride,
                             const void* LIBGAV1_RESTRICT const column) {
  const __m128i col_data = LoadLo8(column);
  const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
  writefn(dest, stride, col_dup32);
}

template <WriteDuplicateFunc writefn>
inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                             ptrdiff_t stride,
                             const void* LIBGAV1_RESTRICT const column) {
  const __m128i col_data = LoadUnaligned16(column);
  const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
  const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
  auto* dst = static_cast<uint8_t*>(dest);
  writefn(dst, stride, col_dup32_lo);
  const ptrdiff_t stride4 = stride << 2;
  dst += stride4;
  writefn(dst, stride, col_dup32_hi);
}

template <WriteDuplicateFunc writefn>
inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const column) {
  const ptrdiff_t stride4 = stride << 2;
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 32; y += 16) {
    const __m128i col_data =
        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
    writefn(dst, stride, col_dup32_lo);
    dst += stride4;
    writefn(dst, stride, col_dup32_hi);
    dst += stride4;
  }
}

template <WriteDuplicateFunc writefn>
inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const column) {
  const ptrdiff_t stride4 = stride << 2;
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 64; y += 16) {
    const __m128i col_data =
        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
    writefn(dst, stride, col_dup32_lo);
    dst += stride4;
    writefn(dst, stride, col_dup32_hi);
    dst += stride4;
  }
}

template <WriteDuplicateFunc writefn>
inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
                              ptrdiff_t stride,
                              const void* LIBGAV1_RESTRICT const column) {
  const ptrdiff_t stride4 = stride << 2;
  auto* dst = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 128; y += 16) {
    const __m128i col_data =
        LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
    const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
    const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
    writefn(dst, stride, col_dup32_lo);
    dst += stride4;
    writefn(dst, stride, col_dup32_hi);
    dst += stride4;
  }
}

// |ref| points to 8 bytes containing 4 packed int16 values.
inline __m128i DcSum4_SSE4_1(const void* ref) {
  const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
  const __m128i ones = _mm_set1_epi16(1);

  // half_sum[31:0] = a1+a2
  // half_sum[63:32] = a3+a4
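  //
  // An illustrative trace (not from this file): vals holding [3 1 4 1]
  // yields half_sum dwords [4, 5, 0, 0] (the upper lanes are zero because
  // _mm_loadl_epi64 zeroes bits 127:64); the 4-byte shift below moves the 5
  // into the low dword, and the add leaves 3+1+4+1 = 9 there.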
  const __m128i half_sum = _mm_madd_epi16(vals, ones);
  // Place half_sum[63:32] in shift_sum[31:0].
  const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
  return _mm_add_epi32(half_sum, shift_sum);
}

struct DcDefs {
  DcDefs() = delete;

  using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
                                  DcStore4xH_SSE4_1<4>, 0, 0>;
};

struct DirDefs {
  DirDefs() = delete;

  using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
  using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
  using _4x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
  using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
  using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
  using _8x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
  using _8x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
  using _16x4 =
      DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
  using _16x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
  using _16x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
  using _16x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
  using _16x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
  using _32x8 =
      DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
  using _32x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
  using _32x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
  using _32x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
  using _64x16 =
      DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
  using _64x32 =
      DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
  using _64x64 =
      DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
};

void Init10bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
  assert(dsp != nullptr);
  static_cast<void>(dsp);
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
      DcDefs::_4x4::DcTop;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
      DcDefs::_4x4::DcLeft;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
      DcDefs::_4x4::Dc;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
      DirDefs::_4x4::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
      DirDefs::_4x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
      DirDefs::_4x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
      DirDefs::_8x4::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
      DirDefs::_8x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
      DirDefs::_8x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
      DirDefs::_8x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
      DirDefs::_16x4::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
      DirDefs::_16x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
      DirDefs::_16x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
      DirDefs::_16x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
      DirDefs::_16x64::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
      DirDefs::_32x8::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
      DirDefs::_32x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
      DirDefs::_32x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
      DirDefs::_32x64::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
      DirDefs::_64x16::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
      DirDefs::_64x32::Horizontal;
#endif
#if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
      DirDefs::_64x64::Horizontal;
#endif
}

}  // namespace
}  // namespace high_bitdepth
#endif  // LIBGAV1_MAX_BITDEPTH >= 10

void IntraPredInit_SSE4_1() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif
}

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {

void IntraPredInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1