1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop // http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/intrapred.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop
18*09537850SAkhilesh Sanikop #if LIBGAV1_TARGETING_SSE4_1
19*09537850SAkhilesh Sanikop
20*09537850SAkhilesh Sanikop #include <xmmintrin.h>
21*09537850SAkhilesh Sanikop
22*09537850SAkhilesh Sanikop #include <algorithm>
23*09537850SAkhilesh Sanikop #include <cassert>
24*09537850SAkhilesh Sanikop #include <cstddef>
25*09537850SAkhilesh Sanikop #include <cstdint>
26*09537850SAkhilesh Sanikop #include <cstring>
27*09537850SAkhilesh Sanikop
28*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
29*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
30*09537850SAkhilesh Sanikop #include "src/dsp/x86/common_sse4.h"
31*09537850SAkhilesh Sanikop #include "src/dsp/x86/transpose_sse4.h"
32*09537850SAkhilesh Sanikop #include "src/utils/common.h"
33*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
34*09537850SAkhilesh Sanikop
35*09537850SAkhilesh Sanikop namespace libgav1 {
36*09537850SAkhilesh Sanikop namespace dsp {
37*09537850SAkhilesh Sanikop namespace {
38*09537850SAkhilesh Sanikop
39*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
40*09537850SAkhilesh Sanikop // Utility Functions
41*09537850SAkhilesh Sanikop
// Fast division by a value of the form 2^n + 2^k with n > k.
// The dividend is first shifted right by k, which leaves a division by
// 2^(n - k) + 1. For the block sizes handled here n - k is 1 or 2 (the block is
// proportional to 1x2 or 1x4), so the remaining divisor is 2+1=3 or 4+1=5.
// That last division is performed by multiplying with a 16-bit fixed-point
// reciprocal and keeping the high 16 bits of the product.
constexpr int kThreeInverse = 0x5556;  // 2^16 / 3, rounded up.
constexpr int kFiveInverse = 0x3334;   // 2^16 / 5, rounded up.

// Returns ((dividend >> shiftk) * multiplier) >> 16.
// |multiplier| is moved into the low 32 bits of an otherwise zero vector and
// the multiply is a signed 16-bit multiply-high, so the quotient is produced in
// the lowest 16-bit lane and the pre-shifted dividend has to fit in 15 bits.
template <int shiftk, int multiplier>
inline __m128i DivideByMultiplyShift_U32(const __m128i dividend) {
  const __m128i reciprocal = _mm_cvtsi32_si128(multiplier);
  const __m128i prescaled = _mm_srli_epi32(dividend, shiftk);
  return _mm_mulhi_epi16(prescaled, reciprocal);
}
54*09537850SAkhilesh Sanikop
55*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
56*09537850SAkhilesh Sanikop // DcPredFuncs_SSE4_1
57*09537850SAkhilesh Sanikop
// Reduces the reference pixels at |ref| to a vector; the DC predictors in this
// file use the returned value as the pixel sum.
using DcSumFunc = __m128i (*)(const void* ref);

// Writes the prediction value carried by |dc| to the block at |dest|, whose
// rows are |stride| bytes apart.
using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, __m128i dc);

// Writes the already duplicated column values in |column| to |dest|.
using WriteDuplicateFunc = void (*)(void* dest, ptrdiff_t stride,
                                    __m128i column);

// For copying an entire column across a block. |column| points to the column
// pixels in memory.
using ColumnStoreFunc = void (*)(void* dest, ptrdiff_t stride,
                                 const void* column);
65*09537850SAkhilesh Sanikop
66*09537850SAkhilesh Sanikop // DC intra-predictors for non-square blocks.
67*09537850SAkhilesh Sanikop template <int width_log2, int height_log2, DcSumFunc top_sumfn,
68*09537850SAkhilesh Sanikop DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
69*09537850SAkhilesh Sanikop struct DcPredFuncs_SSE4_1 {
70*09537850SAkhilesh Sanikop DcPredFuncs_SSE4_1() = delete;
71*09537850SAkhilesh Sanikop
72*09537850SAkhilesh Sanikop static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
73*09537850SAkhilesh Sanikop const void* left_column);
74*09537850SAkhilesh Sanikop static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
75*09537850SAkhilesh Sanikop const void* left_column);
76*09537850SAkhilesh Sanikop static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
77*09537850SAkhilesh Sanikop const void* left_column);
78*09537850SAkhilesh Sanikop };
79*09537850SAkhilesh Sanikop
80*09537850SAkhilesh Sanikop // Directional intra-predictors for square blocks.
81*09537850SAkhilesh Sanikop template <ColumnStoreFunc col_storefn>
82*09537850SAkhilesh Sanikop struct DirectionalPredFuncs_SSE4_1 {
83*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1() = delete;
84*09537850SAkhilesh Sanikop
85*09537850SAkhilesh Sanikop static void Vertical(void* dest, ptrdiff_t stride, const void* top_row,
86*09537850SAkhilesh Sanikop const void* left_column);
87*09537850SAkhilesh Sanikop static void Horizontal(void* dest, ptrdiff_t stride, const void* top_row,
88*09537850SAkhilesh Sanikop const void* left_column);
89*09537850SAkhilesh Sanikop };
90*09537850SAkhilesh Sanikop
91*09537850SAkhilesh Sanikop template <int width_log2, int height_log2, DcSumFunc top_sumfn,
92*09537850SAkhilesh Sanikop DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
93*09537850SAkhilesh Sanikop void DcPredFuncs_SSE4_1<
94*09537850SAkhilesh Sanikop width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
DcTop(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void *)95*09537850SAkhilesh Sanikop dc_mult>::DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
96*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
97*09537850SAkhilesh Sanikop const void* /*left_column*/) {
98*09537850SAkhilesh Sanikop const __m128i rounder = _mm_set1_epi32(1 << (width_log2 - 1));
99*09537850SAkhilesh Sanikop const __m128i sum = top_sumfn(top_row);
100*09537850SAkhilesh Sanikop const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2);
101*09537850SAkhilesh Sanikop storefn(dest, stride, dc);
102*09537850SAkhilesh Sanikop }
103*09537850SAkhilesh Sanikop
104*09537850SAkhilesh Sanikop template <int width_log2, int height_log2, DcSumFunc top_sumfn,
105*09537850SAkhilesh Sanikop DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
106*09537850SAkhilesh Sanikop void DcPredFuncs_SSE4_1<
107*09537850SAkhilesh Sanikop width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
DcLeft(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)108*09537850SAkhilesh Sanikop dc_mult>::DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
109*09537850SAkhilesh Sanikop const void* /*top_row*/,
110*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
111*09537850SAkhilesh Sanikop const __m128i rounder = _mm_set1_epi32(1 << (height_log2 - 1));
112*09537850SAkhilesh Sanikop const __m128i sum = left_sumfn(left_column);
113*09537850SAkhilesh Sanikop const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, rounder), height_log2);
114*09537850SAkhilesh Sanikop storefn(dest, stride, dc);
115*09537850SAkhilesh Sanikop }
116*09537850SAkhilesh Sanikop
117*09537850SAkhilesh Sanikop template <int width_log2, int height_log2, DcSumFunc top_sumfn,
118*09537850SAkhilesh Sanikop DcSumFunc left_sumfn, DcStoreFunc storefn, int shiftk, int dc_mult>
119*09537850SAkhilesh Sanikop void DcPredFuncs_SSE4_1<
120*09537850SAkhilesh Sanikop width_log2, height_log2, top_sumfn, left_sumfn, storefn, shiftk,
Dc(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)121*09537850SAkhilesh Sanikop dc_mult>::Dc(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
122*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
123*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
124*09537850SAkhilesh Sanikop const __m128i rounder =
125*09537850SAkhilesh Sanikop _mm_set1_epi32((1 << (width_log2 - 1)) + (1 << (height_log2 - 1)));
126*09537850SAkhilesh Sanikop const __m128i sum_top = top_sumfn(top_row);
127*09537850SAkhilesh Sanikop const __m128i sum_left = left_sumfn(left_column);
128*09537850SAkhilesh Sanikop const __m128i sum = _mm_add_epi32(sum_top, sum_left);
129*09537850SAkhilesh Sanikop if (width_log2 == height_log2) {
130*09537850SAkhilesh Sanikop const __m128i dc =
131*09537850SAkhilesh Sanikop _mm_srli_epi32(_mm_add_epi32(sum, rounder), width_log2 + 1);
132*09537850SAkhilesh Sanikop storefn(dest, stride, dc);
133*09537850SAkhilesh Sanikop } else {
134*09537850SAkhilesh Sanikop const __m128i dc =
135*09537850SAkhilesh Sanikop DivideByMultiplyShift_U32<shiftk, dc_mult>(_mm_add_epi32(sum, rounder));
136*09537850SAkhilesh Sanikop storefn(dest, stride, dc);
137*09537850SAkhilesh Sanikop }
138*09537850SAkhilesh Sanikop }
139*09537850SAkhilesh Sanikop
140*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
// DirectionalPredFuncs_SSE4_1 directional predictors
142*09537850SAkhilesh Sanikop
143*09537850SAkhilesh Sanikop template <ColumnStoreFunc col_storefn>
Horizontal(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)144*09537850SAkhilesh Sanikop void DirectionalPredFuncs_SSE4_1<col_storefn>::Horizontal(
145*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
146*09537850SAkhilesh Sanikop const void* /*top_row*/, const void* LIBGAV1_RESTRICT const left_column) {
147*09537850SAkhilesh Sanikop col_storefn(dest, stride, left_column);
148*09537850SAkhilesh Sanikop }
149*09537850SAkhilesh Sanikop
150*09537850SAkhilesh Sanikop } // namespace
151*09537850SAkhilesh Sanikop
152*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
153*09537850SAkhilesh Sanikop namespace low_bitdepth {
154*09537850SAkhilesh Sanikop namespace {
155*09537850SAkhilesh Sanikop
156*09537850SAkhilesh Sanikop // |ref| points to 4 bytes containing 4 packed ints.
DcSum4_SSE4_1(const void * const ref)157*09537850SAkhilesh Sanikop inline __m128i DcSum4_SSE4_1(const void* const ref) {
158*09537850SAkhilesh Sanikop const __m128i vals = Load4(ref);
159*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
160*09537850SAkhilesh Sanikop return _mm_sad_epu8(vals, zero);
161*09537850SAkhilesh Sanikop }
162*09537850SAkhilesh Sanikop
DcSum8_SSE4_1(const void * const ref)163*09537850SAkhilesh Sanikop inline __m128i DcSum8_SSE4_1(const void* const ref) {
164*09537850SAkhilesh Sanikop const __m128i vals = LoadLo8(ref);
165*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
166*09537850SAkhilesh Sanikop return _mm_sad_epu8(vals, zero);
167*09537850SAkhilesh Sanikop }
168*09537850SAkhilesh Sanikop
DcSum16_SSE4_1(const void * const ref)169*09537850SAkhilesh Sanikop inline __m128i DcSum16_SSE4_1(const void* const ref) {
170*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
171*09537850SAkhilesh Sanikop const __m128i vals = LoadUnaligned16(ref);
172*09537850SAkhilesh Sanikop const __m128i partial_sum = _mm_sad_epu8(vals, zero);
173*09537850SAkhilesh Sanikop return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
174*09537850SAkhilesh Sanikop }
175*09537850SAkhilesh Sanikop
DcSum32_SSE4_1(const void * const ref)176*09537850SAkhilesh Sanikop inline __m128i DcSum32_SSE4_1(const void* const ref) {
177*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
178*09537850SAkhilesh Sanikop const __m128i vals1 = LoadUnaligned16(ref);
179*09537850SAkhilesh Sanikop const __m128i vals2 = LoadUnaligned16(static_cast<const uint8_t*>(ref) + 16);
180*09537850SAkhilesh Sanikop const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
181*09537850SAkhilesh Sanikop const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
182*09537850SAkhilesh Sanikop const __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
183*09537850SAkhilesh Sanikop return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
184*09537850SAkhilesh Sanikop }
185*09537850SAkhilesh Sanikop
DcSum64_SSE4_1(const void * const ref)186*09537850SAkhilesh Sanikop inline __m128i DcSum64_SSE4_1(const void* const ref) {
187*09537850SAkhilesh Sanikop const auto* const ref_ptr = static_cast<const uint8_t*>(ref);
188*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
189*09537850SAkhilesh Sanikop const __m128i vals1 = LoadUnaligned16(ref_ptr);
190*09537850SAkhilesh Sanikop const __m128i vals2 = LoadUnaligned16(ref_ptr + 16);
191*09537850SAkhilesh Sanikop const __m128i vals3 = LoadUnaligned16(ref_ptr + 32);
192*09537850SAkhilesh Sanikop const __m128i vals4 = LoadUnaligned16(ref_ptr + 48);
193*09537850SAkhilesh Sanikop const __m128i partial_sum1 = _mm_sad_epu8(vals1, zero);
194*09537850SAkhilesh Sanikop const __m128i partial_sum2 = _mm_sad_epu8(vals2, zero);
195*09537850SAkhilesh Sanikop __m128i partial_sum = _mm_add_epi16(partial_sum1, partial_sum2);
196*09537850SAkhilesh Sanikop const __m128i partial_sum3 = _mm_sad_epu8(vals3, zero);
197*09537850SAkhilesh Sanikop partial_sum = _mm_add_epi16(partial_sum, partial_sum3);
198*09537850SAkhilesh Sanikop const __m128i partial_sum4 = _mm_sad_epu8(vals4, zero);
199*09537850SAkhilesh Sanikop partial_sum = _mm_add_epi16(partial_sum, partial_sum4);
200*09537850SAkhilesh Sanikop return _mm_add_epi16(partial_sum, _mm_srli_si128(partial_sum, 8));
201*09537850SAkhilesh Sanikop }
202*09537850SAkhilesh Sanikop
203*09537850SAkhilesh Sanikop template <int height>
DcStore4xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)204*09537850SAkhilesh Sanikop inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
205*09537850SAkhilesh Sanikop const __m128i dc) {
206*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
207*09537850SAkhilesh Sanikop const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
208*09537850SAkhilesh Sanikop int y = height - 1;
209*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
210*09537850SAkhilesh Sanikop do {
211*09537850SAkhilesh Sanikop Store4(dst, dc_dup);
212*09537850SAkhilesh Sanikop dst += stride;
213*09537850SAkhilesh Sanikop } while (--y != 0);
214*09537850SAkhilesh Sanikop Store4(dst, dc_dup);
215*09537850SAkhilesh Sanikop }
216*09537850SAkhilesh Sanikop
217*09537850SAkhilesh Sanikop template <int height>
DcStore8xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)218*09537850SAkhilesh Sanikop inline void DcStore8xH_SSE4_1(void* const dest, ptrdiff_t stride,
219*09537850SAkhilesh Sanikop const __m128i dc) {
220*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
221*09537850SAkhilesh Sanikop const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
222*09537850SAkhilesh Sanikop int y = height - 1;
223*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
224*09537850SAkhilesh Sanikop do {
225*09537850SAkhilesh Sanikop StoreLo8(dst, dc_dup);
226*09537850SAkhilesh Sanikop dst += stride;
227*09537850SAkhilesh Sanikop } while (--y != 0);
228*09537850SAkhilesh Sanikop StoreLo8(dst, dc_dup);
229*09537850SAkhilesh Sanikop }
230*09537850SAkhilesh Sanikop
231*09537850SAkhilesh Sanikop template <int height>
DcStore16xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)232*09537850SAkhilesh Sanikop inline void DcStore16xH_SSE4_1(void* const dest, ptrdiff_t stride,
233*09537850SAkhilesh Sanikop const __m128i dc) {
234*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
235*09537850SAkhilesh Sanikop const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
236*09537850SAkhilesh Sanikop int y = height - 1;
237*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
238*09537850SAkhilesh Sanikop do {
239*09537850SAkhilesh Sanikop StoreUnaligned16(dst, dc_dup);
240*09537850SAkhilesh Sanikop dst += stride;
241*09537850SAkhilesh Sanikop } while (--y != 0);
242*09537850SAkhilesh Sanikop StoreUnaligned16(dst, dc_dup);
243*09537850SAkhilesh Sanikop }
244*09537850SAkhilesh Sanikop
245*09537850SAkhilesh Sanikop template <int height>
DcStore32xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)246*09537850SAkhilesh Sanikop inline void DcStore32xH_SSE4_1(void* const dest, ptrdiff_t stride,
247*09537850SAkhilesh Sanikop const __m128i dc) {
248*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
249*09537850SAkhilesh Sanikop const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
250*09537850SAkhilesh Sanikop int y = height - 1;
251*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
252*09537850SAkhilesh Sanikop do {
253*09537850SAkhilesh Sanikop StoreUnaligned16(dst, dc_dup);
254*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 16, dc_dup);
255*09537850SAkhilesh Sanikop dst += stride;
256*09537850SAkhilesh Sanikop } while (--y != 0);
257*09537850SAkhilesh Sanikop StoreUnaligned16(dst, dc_dup);
258*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 16, dc_dup);
259*09537850SAkhilesh Sanikop }
260*09537850SAkhilesh Sanikop
261*09537850SAkhilesh Sanikop template <int height>
DcStore64xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)262*09537850SAkhilesh Sanikop inline void DcStore64xH_SSE4_1(void* const dest, ptrdiff_t stride,
263*09537850SAkhilesh Sanikop const __m128i dc) {
264*09537850SAkhilesh Sanikop const __m128i zero = _mm_setzero_si128();
265*09537850SAkhilesh Sanikop const __m128i dc_dup = _mm_shuffle_epi8(dc, zero);
266*09537850SAkhilesh Sanikop int y = height - 1;
267*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
268*09537850SAkhilesh Sanikop do {
269*09537850SAkhilesh Sanikop StoreUnaligned16(dst, dc_dup);
270*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 16, dc_dup);
271*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 32, dc_dup);
272*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 48, dc_dup);
273*09537850SAkhilesh Sanikop dst += stride;
274*09537850SAkhilesh Sanikop } while (--y != 0);
275*09537850SAkhilesh Sanikop StoreUnaligned16(dst, dc_dup);
276*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 16, dc_dup);
277*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 32, dc_dup);
278*09537850SAkhilesh Sanikop StoreUnaligned16(dst + 48, dc_dup);
279*09537850SAkhilesh Sanikop }
280*09537850SAkhilesh Sanikop
281*09537850SAkhilesh Sanikop // WriteDuplicateN assumes dup has 4 sets of 4 identical bytes that are meant to
282*09537850SAkhilesh Sanikop // be copied for width N into dest.
WriteDuplicate4x4(void * const dest,ptrdiff_t stride,const __m128i dup32)283*09537850SAkhilesh Sanikop inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
284*09537850SAkhilesh Sanikop const __m128i dup32) {
285*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
286*09537850SAkhilesh Sanikop Store4(dst, dup32);
287*09537850SAkhilesh Sanikop dst += stride;
288*09537850SAkhilesh Sanikop const int row1 = _mm_extract_epi32(dup32, 1);
289*09537850SAkhilesh Sanikop memcpy(dst, &row1, 4);
290*09537850SAkhilesh Sanikop dst += stride;
291*09537850SAkhilesh Sanikop const int row2 = _mm_extract_epi32(dup32, 2);
292*09537850SAkhilesh Sanikop memcpy(dst, &row2, 4);
293*09537850SAkhilesh Sanikop dst += stride;
294*09537850SAkhilesh Sanikop const int row3 = _mm_extract_epi32(dup32, 3);
295*09537850SAkhilesh Sanikop memcpy(dst, &row3, 4);
296*09537850SAkhilesh Sanikop }
297*09537850SAkhilesh Sanikop
// Width 8: writes 4 rows of 8 bytes. Row i is filled with the byte that is
// replicated in 32-bit lane i of |dup32|.
inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  auto* const row0 = static_cast<uint8_t*>(dest);
  uint8_t* const row1 = row0 + stride;
  uint8_t* const row2 = row1 + stride;
  uint8_t* const row3 = row2 + stride;

  // Each vector holds two rows: the low 8 bytes and the high 8 bytes.
  const __m128i rows01 = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i rows23 = _mm_unpackhi_epi32(dup32, dup32);

  _mm_storel_epi64(reinterpret_cast<__m128i*>(row0), rows01);
  _mm_storeh_pi(reinterpret_cast<__m64*>(row1), _mm_castsi128_ps(rows01));
  _mm_storel_epi64(reinterpret_cast<__m128i*>(row2), rows23);
  _mm_storeh_pi(reinterpret_cast<__m64*>(row3), _mm_castsi128_ps(rows23));
}
311*09537850SAkhilesh Sanikop
// Width 16: writes 4 rows of 16 bytes. Row i is filled with the byte that is
// replicated in 32-bit lane i of |dup32|.
inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  auto* const row0 = static_cast<uint8_t*>(dest);
  uint8_t* const row1 = row0 + stride;
  uint8_t* const row2 = row1 + stride;
  uint8_t* const row3 = row2 + stride;

  // Broadcast lane i of |dup32| to all four lanes to form row i.
  _mm_storeu_si128(reinterpret_cast<__m128i*>(row0),
                   _mm_shuffle_epi32(dup32, 0x00));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(row1),
                   _mm_shuffle_epi32(dup32, 0x55));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(row2),
                   _mm_shuffle_epi32(dup32, 0xAA));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(row3),
                   _mm_shuffle_epi32(dup32, 0xFF));
}
330*09537850SAkhilesh Sanikop
// Width 32: writes 4 rows of 32 bytes. Row i is filled with the byte that is
// replicated in 32-bit lane i of |dup32|.
inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  // rows[i] has lane i of |dup32| broadcast to all four lanes.
  const __m128i rows[4] = {
      _mm_shuffle_epi32(dup32, 0x00), _mm_shuffle_epi32(dup32, 0x55),
      _mm_shuffle_epi32(dup32, 0xAA), _mm_shuffle_epi32(dup32, 0xFF)};

  auto* const base = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 4; ++y) {
    uint8_t* const row = base + y * stride;
    _mm_storeu_si128(reinterpret_cast<__m128i*>(row), rows[y]);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(row + 16), rows[y]);
  }
}
353*09537850SAkhilesh Sanikop
// Width 64: writes 4 rows of 64 bytes. Row i is filled with the byte that is
// replicated in 32-bit lane i of |dup32|.
inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  // rows[i] has lane i of |dup32| broadcast to all four lanes.
  const __m128i rows[4] = {
      _mm_shuffle_epi32(dup32, 0x00), _mm_shuffle_epi32(dup32, 0x55),
      _mm_shuffle_epi32(dup32, 0xAA), _mm_shuffle_epi32(dup32, 0xFF)};

  auto* const base = static_cast<uint8_t*>(dest);
  for (int y = 0; y < 4; ++y) {
    uint8_t* const row = base + y * stride;
    for (int x = 0; x < 64; x += 16) {
      _mm_storeu_si128(reinterpret_cast<__m128i*>(row + x), rows[y]);
    }
  }
}
384*09537850SAkhilesh Sanikop
385*09537850SAkhilesh Sanikop // ColStoreN<height> copies each of the |height| values in |column| across its
386*09537850SAkhilesh Sanikop // corresponding in dest.
387*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)388*09537850SAkhilesh Sanikop inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
389*09537850SAkhilesh Sanikop ptrdiff_t stride,
390*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
391*09537850SAkhilesh Sanikop const __m128i col_data = Load4(column);
392*09537850SAkhilesh Sanikop const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
393*09537850SAkhilesh Sanikop const __m128i col_dup32 = _mm_unpacklo_epi16(col_dup16, col_dup16);
394*09537850SAkhilesh Sanikop writefn(dest, stride, col_dup32);
395*09537850SAkhilesh Sanikop }
396*09537850SAkhilesh Sanikop
397*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)398*09537850SAkhilesh Sanikop inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
399*09537850SAkhilesh Sanikop ptrdiff_t stride,
400*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
401*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
402*09537850SAkhilesh Sanikop const __m128i col_data = LoadLo8(column);
403*09537850SAkhilesh Sanikop const __m128i col_dup16 = _mm_unpacklo_epi8(col_data, col_data);
404*09537850SAkhilesh Sanikop const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_dup16, col_dup16);
405*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
406*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lo);
407*09537850SAkhilesh Sanikop dst += stride4;
408*09537850SAkhilesh Sanikop const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_dup16, col_dup16);
409*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hi);
410*09537850SAkhilesh Sanikop }
411*09537850SAkhilesh Sanikop
412*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)413*09537850SAkhilesh Sanikop inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
414*09537850SAkhilesh Sanikop ptrdiff_t stride,
415*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
416*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
417*09537850SAkhilesh Sanikop const __m128i col_data = _mm_loadu_si128(static_cast<const __m128i*>(column));
418*09537850SAkhilesh Sanikop const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
419*09537850SAkhilesh Sanikop const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
420*09537850SAkhilesh Sanikop const __m128i col_dup32_lolo = _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
421*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
422*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lolo);
423*09537850SAkhilesh Sanikop dst += stride4;
424*09537850SAkhilesh Sanikop const __m128i col_dup32_lohi = _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
425*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lohi);
426*09537850SAkhilesh Sanikop dst += stride4;
427*09537850SAkhilesh Sanikop const __m128i col_dup32_hilo = _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
428*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hilo);
429*09537850SAkhilesh Sanikop dst += stride4;
430*09537850SAkhilesh Sanikop const __m128i col_dup32_hihi = _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
431*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hihi);
432*09537850SAkhilesh Sanikop }
433*09537850SAkhilesh Sanikop
434*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)435*09537850SAkhilesh Sanikop inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
436*09537850SAkhilesh Sanikop ptrdiff_t stride,
437*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
438*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
439*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
440*09537850SAkhilesh Sanikop for (int y = 0; y < 32; y += 16) {
441*09537850SAkhilesh Sanikop const __m128i col_data =
442*09537850SAkhilesh Sanikop LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
443*09537850SAkhilesh Sanikop const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
444*09537850SAkhilesh Sanikop const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
445*09537850SAkhilesh Sanikop const __m128i col_dup32_lolo =
446*09537850SAkhilesh Sanikop _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
447*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lolo);
448*09537850SAkhilesh Sanikop dst += stride4;
449*09537850SAkhilesh Sanikop const __m128i col_dup32_lohi =
450*09537850SAkhilesh Sanikop _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
451*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lohi);
452*09537850SAkhilesh Sanikop dst += stride4;
453*09537850SAkhilesh Sanikop const __m128i col_dup32_hilo =
454*09537850SAkhilesh Sanikop _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
455*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hilo);
456*09537850SAkhilesh Sanikop dst += stride4;
457*09537850SAkhilesh Sanikop const __m128i col_dup32_hihi =
458*09537850SAkhilesh Sanikop _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
459*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hihi);
460*09537850SAkhilesh Sanikop dst += stride4;
461*09537850SAkhilesh Sanikop }
462*09537850SAkhilesh Sanikop }
463*09537850SAkhilesh Sanikop
464*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)465*09537850SAkhilesh Sanikop inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
466*09537850SAkhilesh Sanikop ptrdiff_t stride,
467*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
468*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
469*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
470*09537850SAkhilesh Sanikop for (int y = 0; y < 64; y += 16) {
471*09537850SAkhilesh Sanikop const __m128i col_data =
472*09537850SAkhilesh Sanikop LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
473*09537850SAkhilesh Sanikop const __m128i col_dup16_lo = _mm_unpacklo_epi8(col_data, col_data);
474*09537850SAkhilesh Sanikop const __m128i col_dup16_hi = _mm_unpackhi_epi8(col_data, col_data);
475*09537850SAkhilesh Sanikop const __m128i col_dup32_lolo =
476*09537850SAkhilesh Sanikop _mm_unpacklo_epi16(col_dup16_lo, col_dup16_lo);
477*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lolo);
478*09537850SAkhilesh Sanikop dst += stride4;
479*09537850SAkhilesh Sanikop const __m128i col_dup32_lohi =
480*09537850SAkhilesh Sanikop _mm_unpackhi_epi16(col_dup16_lo, col_dup16_lo);
481*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lohi);
482*09537850SAkhilesh Sanikop dst += stride4;
483*09537850SAkhilesh Sanikop const __m128i col_dup32_hilo =
484*09537850SAkhilesh Sanikop _mm_unpacklo_epi16(col_dup16_hi, col_dup16_hi);
485*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hilo);
486*09537850SAkhilesh Sanikop dst += stride4;
487*09537850SAkhilesh Sanikop const __m128i col_dup32_hihi =
488*09537850SAkhilesh Sanikop _mm_unpackhi_epi16(col_dup16_hi, col_dup16_hi);
489*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hihi);
490*09537850SAkhilesh Sanikop dst += stride4;
491*09537850SAkhilesh Sanikop }
492*09537850SAkhilesh Sanikop }
493*09537850SAkhilesh Sanikop
494*09537850SAkhilesh Sanikop struct DcDefs {
495*09537850SAkhilesh Sanikop DcDefs() = delete;
496*09537850SAkhilesh Sanikop
497*09537850SAkhilesh Sanikop using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
498*09537850SAkhilesh Sanikop DcStore4xH_SSE4_1<4>, 0, 0>;
499*09537850SAkhilesh Sanikop // shiftk is the smaller of width_log2 and height_log2.
500*09537850SAkhilesh Sanikop // dc_mult corresponds to the ratio of the smaller block size to the larger.
501*09537850SAkhilesh Sanikop using _4x8 = DcPredFuncs_SSE4_1<2, 3, DcSum4_SSE4_1, DcSum8_SSE4_1,
502*09537850SAkhilesh Sanikop DcStore4xH_SSE4_1<8>, 2, kThreeInverse>;
503*09537850SAkhilesh Sanikop using _4x16 = DcPredFuncs_SSE4_1<2, 4, DcSum4_SSE4_1, DcSum16_SSE4_1,
504*09537850SAkhilesh Sanikop DcStore4xH_SSE4_1<16>, 2, kFiveInverse>;
505*09537850SAkhilesh Sanikop
506*09537850SAkhilesh Sanikop using _8x4 = DcPredFuncs_SSE4_1<3, 2, DcSum8_SSE4_1, DcSum4_SSE4_1,
507*09537850SAkhilesh Sanikop DcStore8xH_SSE4_1<4>, 2, kThreeInverse>;
508*09537850SAkhilesh Sanikop using _8x8 = DcPredFuncs_SSE4_1<3, 3, DcSum8_SSE4_1, DcSum8_SSE4_1,
509*09537850SAkhilesh Sanikop DcStore8xH_SSE4_1<8>, 0, 0>;
510*09537850SAkhilesh Sanikop using _8x16 = DcPredFuncs_SSE4_1<3, 4, DcSum8_SSE4_1, DcSum16_SSE4_1,
511*09537850SAkhilesh Sanikop DcStore8xH_SSE4_1<16>, 3, kThreeInverse>;
512*09537850SAkhilesh Sanikop using _8x32 = DcPredFuncs_SSE4_1<3, 5, DcSum8_SSE4_1, DcSum32_SSE4_1,
513*09537850SAkhilesh Sanikop DcStore8xH_SSE4_1<32>, 3, kFiveInverse>;
514*09537850SAkhilesh Sanikop
515*09537850SAkhilesh Sanikop using _16x4 = DcPredFuncs_SSE4_1<4, 2, DcSum16_SSE4_1, DcSum4_SSE4_1,
516*09537850SAkhilesh Sanikop DcStore16xH_SSE4_1<4>, 2, kFiveInverse>;
517*09537850SAkhilesh Sanikop using _16x8 = DcPredFuncs_SSE4_1<4, 3, DcSum16_SSE4_1, DcSum8_SSE4_1,
518*09537850SAkhilesh Sanikop DcStore16xH_SSE4_1<8>, 3, kThreeInverse>;
519*09537850SAkhilesh Sanikop using _16x16 = DcPredFuncs_SSE4_1<4, 4, DcSum16_SSE4_1, DcSum16_SSE4_1,
520*09537850SAkhilesh Sanikop DcStore16xH_SSE4_1<16>, 0, 0>;
521*09537850SAkhilesh Sanikop using _16x32 = DcPredFuncs_SSE4_1<4, 5, DcSum16_SSE4_1, DcSum32_SSE4_1,
522*09537850SAkhilesh Sanikop DcStore16xH_SSE4_1<32>, 4, kThreeInverse>;
523*09537850SAkhilesh Sanikop using _16x64 = DcPredFuncs_SSE4_1<4, 6, DcSum16_SSE4_1, DcSum64_SSE4_1,
524*09537850SAkhilesh Sanikop DcStore16xH_SSE4_1<64>, 4, kFiveInverse>;
525*09537850SAkhilesh Sanikop
526*09537850SAkhilesh Sanikop using _32x8 = DcPredFuncs_SSE4_1<5, 3, DcSum32_SSE4_1, DcSum8_SSE4_1,
527*09537850SAkhilesh Sanikop DcStore32xH_SSE4_1<8>, 3, kFiveInverse>;
528*09537850SAkhilesh Sanikop using _32x16 = DcPredFuncs_SSE4_1<5, 4, DcSum32_SSE4_1, DcSum16_SSE4_1,
529*09537850SAkhilesh Sanikop DcStore32xH_SSE4_1<16>, 4, kThreeInverse>;
530*09537850SAkhilesh Sanikop using _32x32 = DcPredFuncs_SSE4_1<5, 5, DcSum32_SSE4_1, DcSum32_SSE4_1,
531*09537850SAkhilesh Sanikop DcStore32xH_SSE4_1<32>, 0, 0>;
532*09537850SAkhilesh Sanikop using _32x64 = DcPredFuncs_SSE4_1<5, 6, DcSum32_SSE4_1, DcSum64_SSE4_1,
533*09537850SAkhilesh Sanikop DcStore32xH_SSE4_1<64>, 5, kThreeInverse>;
534*09537850SAkhilesh Sanikop
535*09537850SAkhilesh Sanikop using _64x16 = DcPredFuncs_SSE4_1<6, 4, DcSum64_SSE4_1, DcSum16_SSE4_1,
536*09537850SAkhilesh Sanikop DcStore64xH_SSE4_1<16>, 4, kFiveInverse>;
537*09537850SAkhilesh Sanikop using _64x32 = DcPredFuncs_SSE4_1<6, 5, DcSum64_SSE4_1, DcSum32_SSE4_1,
538*09537850SAkhilesh Sanikop DcStore64xH_SSE4_1<32>, 5, kThreeInverse>;
539*09537850SAkhilesh Sanikop using _64x64 = DcPredFuncs_SSE4_1<6, 6, DcSum64_SSE4_1, DcSum64_SSE4_1,
540*09537850SAkhilesh Sanikop DcStore64xH_SSE4_1<64>, 0, 0>;
541*09537850SAkhilesh Sanikop };
542*09537850SAkhilesh Sanikop
543*09537850SAkhilesh Sanikop struct DirDefs {
544*09537850SAkhilesh Sanikop DirDefs() = delete;
545*09537850SAkhilesh Sanikop
546*09537850SAkhilesh Sanikop using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
547*09537850SAkhilesh Sanikop using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
548*09537850SAkhilesh Sanikop using _4x16 =
549*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
550*09537850SAkhilesh Sanikop using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
551*09537850SAkhilesh Sanikop using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
552*09537850SAkhilesh Sanikop using _8x16 =
553*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
554*09537850SAkhilesh Sanikop using _8x32 =
555*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
556*09537850SAkhilesh Sanikop using _16x4 =
557*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
558*09537850SAkhilesh Sanikop using _16x8 =
559*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
560*09537850SAkhilesh Sanikop using _16x16 =
561*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
562*09537850SAkhilesh Sanikop using _16x32 =
563*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
564*09537850SAkhilesh Sanikop using _16x64 =
565*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
566*09537850SAkhilesh Sanikop using _32x8 =
567*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
568*09537850SAkhilesh Sanikop using _32x16 =
569*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
570*09537850SAkhilesh Sanikop using _32x32 =
571*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
572*09537850SAkhilesh Sanikop using _32x64 =
573*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
574*09537850SAkhilesh Sanikop using _64x16 =
575*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
576*09537850SAkhilesh Sanikop using _64x32 =
577*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
578*09537850SAkhilesh Sanikop using _64x64 =
579*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
580*09537850SAkhilesh Sanikop };
581*09537850SAkhilesh Sanikop
582*09537850SAkhilesh Sanikop template <int y_mask>
WritePaethLine4(uint8_t * LIBGAV1_RESTRICT dst,const __m128i & top,const __m128i & left,const __m128i & top_lefts,const __m128i & top_dists,const __m128i & left_dists,const __m128i & top_left_diffs)583*09537850SAkhilesh Sanikop inline void WritePaethLine4(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
584*09537850SAkhilesh Sanikop const __m128i& left, const __m128i& top_lefts,
585*09537850SAkhilesh Sanikop const __m128i& top_dists, const __m128i& left_dists,
586*09537850SAkhilesh Sanikop const __m128i& top_left_diffs) {
587*09537850SAkhilesh Sanikop const __m128i top_dists_y = _mm_shuffle_epi32(top_dists, y_mask);
588*09537850SAkhilesh Sanikop
589*09537850SAkhilesh Sanikop const __m128i lefts_y = _mm_shuffle_epi32(left, y_mask);
590*09537850SAkhilesh Sanikop const __m128i top_left_dists =
591*09537850SAkhilesh Sanikop _mm_abs_epi32(_mm_add_epi32(lefts_y, top_left_diffs));
592*09537850SAkhilesh Sanikop
593*09537850SAkhilesh Sanikop // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
594*09537850SAkhilesh Sanikop // operation is unavailable, so the logic for selecting top, left, or
595*09537850SAkhilesh Sanikop // top_left is inverted.
596*09537850SAkhilesh Sanikop __m128i not_select_left = _mm_cmpgt_epi32(left_dists, top_left_dists);
597*09537850SAkhilesh Sanikop not_select_left =
598*09537850SAkhilesh Sanikop _mm_or_si128(not_select_left, _mm_cmpgt_epi32(left_dists, top_dists_y));
599*09537850SAkhilesh Sanikop const __m128i not_select_top = _mm_cmpgt_epi32(top_dists_y, top_left_dists);
600*09537850SAkhilesh Sanikop
601*09537850SAkhilesh Sanikop const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
602*09537850SAkhilesh Sanikop
603*09537850SAkhilesh Sanikop const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
604*09537850SAkhilesh Sanikop __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
605*09537850SAkhilesh Sanikop top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
606*09537850SAkhilesh Sanikop top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
607*09537850SAkhilesh Sanikop
608*09537850SAkhilesh Sanikop // The sequence of 32-bit packed operations was found (see CL via blame) to
609*09537850SAkhilesh Sanikop // outperform 16-bit operations, despite the availability of the packus
610*09537850SAkhilesh Sanikop // function, when tested on a Xeon E7 v3.
611*09537850SAkhilesh Sanikop const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
612*09537850SAkhilesh Sanikop const __m128i pred = _mm_shuffle_epi8(
613*09537850SAkhilesh Sanikop _mm_or_si128(left_out, top_or_top_left_out), cvtepi32_epi8);
614*09537850SAkhilesh Sanikop Store4(dst, pred);
615*09537850SAkhilesh Sanikop }
616*09537850SAkhilesh Sanikop
617*09537850SAkhilesh Sanikop // top_left_diffs is the only variable whose ints may exceed 8 bits. Otherwise
618*09537850SAkhilesh Sanikop // we would be able to do all of these operations as epi8 for a 16-pixel version
619*09537850SAkhilesh Sanikop // of this function. Still, since lefts_y is just a vector of duplicates, it
620*09537850SAkhilesh Sanikop // could pay off to accommodate top_left_dists for cmpgt, and repack into epi8
621*09537850SAkhilesh Sanikop // for the blends.
622*09537850SAkhilesh Sanikop template <int y_mask>
WritePaethLine8(uint8_t * LIBGAV1_RESTRICT dst,const __m128i & top,const __m128i & left,const __m128i & top_lefts,const __m128i & top_dists,const __m128i & left_dists,const __m128i & top_left_diffs)623*09537850SAkhilesh Sanikop inline void WritePaethLine8(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
624*09537850SAkhilesh Sanikop const __m128i& left, const __m128i& top_lefts,
625*09537850SAkhilesh Sanikop const __m128i& top_dists, const __m128i& left_dists,
626*09537850SAkhilesh Sanikop const __m128i& top_left_diffs) {
627*09537850SAkhilesh Sanikop const __m128i select_y = _mm_set1_epi32(y_mask);
628*09537850SAkhilesh Sanikop const __m128i top_dists_y = _mm_shuffle_epi8(top_dists, select_y);
629*09537850SAkhilesh Sanikop
630*09537850SAkhilesh Sanikop const __m128i lefts_y = _mm_shuffle_epi8(left, select_y);
631*09537850SAkhilesh Sanikop const __m128i top_left_dists =
632*09537850SAkhilesh Sanikop _mm_abs_epi16(_mm_add_epi16(lefts_y, top_left_diffs));
633*09537850SAkhilesh Sanikop
634*09537850SAkhilesh Sanikop // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
635*09537850SAkhilesh Sanikop // operation is unavailable, so the logic for selecting top, left, or
636*09537850SAkhilesh Sanikop // top_left is inverted.
637*09537850SAkhilesh Sanikop __m128i not_select_left = _mm_cmpgt_epi16(left_dists, top_left_dists);
638*09537850SAkhilesh Sanikop not_select_left =
639*09537850SAkhilesh Sanikop _mm_or_si128(not_select_left, _mm_cmpgt_epi16(left_dists, top_dists_y));
640*09537850SAkhilesh Sanikop const __m128i not_select_top = _mm_cmpgt_epi16(top_dists_y, top_left_dists);
641*09537850SAkhilesh Sanikop
642*09537850SAkhilesh Sanikop const __m128i left_out = _mm_andnot_si128(not_select_left, lefts_y);
643*09537850SAkhilesh Sanikop
644*09537850SAkhilesh Sanikop const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
645*09537850SAkhilesh Sanikop __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
646*09537850SAkhilesh Sanikop top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
647*09537850SAkhilesh Sanikop top_or_top_left_out = _mm_and_si128(not_select_left, top_or_top_left_out);
648*09537850SAkhilesh Sanikop
649*09537850SAkhilesh Sanikop const __m128i pred = _mm_packus_epi16(
650*09537850SAkhilesh Sanikop _mm_or_si128(left_out, top_or_top_left_out), /* unused */ left_out);
651*09537850SAkhilesh Sanikop _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), pred);
652*09537850SAkhilesh Sanikop }
653*09537850SAkhilesh Sanikop
654*09537850SAkhilesh Sanikop // |top| is an epi8 of length 16
655*09537850SAkhilesh Sanikop // |left| is epi8 of unknown length, as y_mask specifies access
656*09537850SAkhilesh Sanikop // |top_lefts| is an epi8 of 16 duplicates
657*09537850SAkhilesh Sanikop // |top_dists| is an epi8 of unknown length, as y_mask specifies access
658*09537850SAkhilesh Sanikop // |left_dists| is an epi8 of length 16
659*09537850SAkhilesh Sanikop // |left_dists_lo| is an epi16 of length 8
660*09537850SAkhilesh Sanikop // |left_dists_hi| is an epi16 of length 8
661*09537850SAkhilesh Sanikop // |top_left_diffs_lo| is an epi16 of length 8
662*09537850SAkhilesh Sanikop // |top_left_diffs_hi| is an epi16 of length 8
663*09537850SAkhilesh Sanikop // The latter two vectors are epi16 because their values may reach -510.
664*09537850SAkhilesh Sanikop // |left_dists| is provided alongside its spread out version because it doesn't
665*09537850SAkhilesh Sanikop // change between calls and interacts with both kinds of packing.
666*09537850SAkhilesh Sanikop template <int y_mask>
WritePaethLine16(uint8_t * LIBGAV1_RESTRICT dst,const __m128i & top,const __m128i & left,const __m128i & top_lefts,const __m128i & top_dists,const __m128i & left_dists,const __m128i & left_dists_lo,const __m128i & left_dists_hi,const __m128i & top_left_diffs_lo,const __m128i & top_left_diffs_hi)667*09537850SAkhilesh Sanikop inline void WritePaethLine16(uint8_t* LIBGAV1_RESTRICT dst, const __m128i& top,
668*09537850SAkhilesh Sanikop const __m128i& left, const __m128i& top_lefts,
669*09537850SAkhilesh Sanikop const __m128i& top_dists,
670*09537850SAkhilesh Sanikop const __m128i& left_dists,
671*09537850SAkhilesh Sanikop const __m128i& left_dists_lo,
672*09537850SAkhilesh Sanikop const __m128i& left_dists_hi,
673*09537850SAkhilesh Sanikop const __m128i& top_left_diffs_lo,
674*09537850SAkhilesh Sanikop const __m128i& top_left_diffs_hi) {
675*09537850SAkhilesh Sanikop const __m128i select_y = _mm_set1_epi32(y_mask);
676*09537850SAkhilesh Sanikop const __m128i top_dists_y8 = _mm_shuffle_epi8(top_dists, select_y);
677*09537850SAkhilesh Sanikop const __m128i top_dists_y16 = _mm_cvtepu8_epi16(top_dists_y8);
678*09537850SAkhilesh Sanikop const __m128i lefts_y8 = _mm_shuffle_epi8(left, select_y);
679*09537850SAkhilesh Sanikop const __m128i lefts_y16 = _mm_cvtepu8_epi16(lefts_y8);
680*09537850SAkhilesh Sanikop
681*09537850SAkhilesh Sanikop const __m128i top_left_dists_lo =
682*09537850SAkhilesh Sanikop _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_lo));
683*09537850SAkhilesh Sanikop const __m128i top_left_dists_hi =
684*09537850SAkhilesh Sanikop _mm_abs_epi16(_mm_add_epi16(lefts_y16, top_left_diffs_hi));
685*09537850SAkhilesh Sanikop
686*09537850SAkhilesh Sanikop const __m128i left_gt_top_left_lo = _mm_packs_epi16(
687*09537850SAkhilesh Sanikop _mm_cmpgt_epi16(left_dists_lo, top_left_dists_lo), left_dists_lo);
688*09537850SAkhilesh Sanikop const __m128i left_gt_top_left_hi =
689*09537850SAkhilesh Sanikop _mm_packs_epi16(_mm_cmpgt_epi16(left_dists_hi, top_left_dists_hi),
690*09537850SAkhilesh Sanikop /* unused second arg for pack */ left_dists_hi);
691*09537850SAkhilesh Sanikop const __m128i left_gt_top_left = _mm_alignr_epi8(
692*09537850SAkhilesh Sanikop left_gt_top_left_hi, _mm_slli_si128(left_gt_top_left_lo, 8), 8);
693*09537850SAkhilesh Sanikop
694*09537850SAkhilesh Sanikop const __m128i not_select_top_lo =
695*09537850SAkhilesh Sanikop _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_lo),
696*09537850SAkhilesh Sanikop /* unused second arg for pack */ top_dists_y16);
697*09537850SAkhilesh Sanikop const __m128i not_select_top_hi =
698*09537850SAkhilesh Sanikop _mm_packs_epi16(_mm_cmpgt_epi16(top_dists_y16, top_left_dists_hi),
699*09537850SAkhilesh Sanikop /* unused second arg for pack */ top_dists_y16);
700*09537850SAkhilesh Sanikop const __m128i not_select_top = _mm_alignr_epi8(
701*09537850SAkhilesh Sanikop not_select_top_hi, _mm_slli_si128(not_select_top_lo, 8), 8);
702*09537850SAkhilesh Sanikop
703*09537850SAkhilesh Sanikop const __m128i left_leq_top =
704*09537850SAkhilesh Sanikop _mm_cmpeq_epi8(left_dists, _mm_min_epu8(top_dists_y8, left_dists));
705*09537850SAkhilesh Sanikop const __m128i select_left = _mm_andnot_si128(left_gt_top_left, left_leq_top);
706*09537850SAkhilesh Sanikop
707*09537850SAkhilesh Sanikop // Section 7.11.2.2 specifies the logic and terms here. The less-or-equal
708*09537850SAkhilesh Sanikop // operation is unavailable, so the logic for selecting top, left, or
709*09537850SAkhilesh Sanikop // top_left is inverted.
710*09537850SAkhilesh Sanikop const __m128i left_out = _mm_and_si128(select_left, lefts_y8);
711*09537850SAkhilesh Sanikop
712*09537850SAkhilesh Sanikop const __m128i top_left_out = _mm_and_si128(not_select_top, top_lefts);
713*09537850SAkhilesh Sanikop __m128i top_or_top_left_out = _mm_andnot_si128(not_select_top, top);
714*09537850SAkhilesh Sanikop top_or_top_left_out = _mm_or_si128(top_or_top_left_out, top_left_out);
715*09537850SAkhilesh Sanikop top_or_top_left_out = _mm_andnot_si128(select_left, top_or_top_left_out);
716*09537850SAkhilesh Sanikop const __m128i pred = _mm_or_si128(left_out, top_or_top_left_out);
717*09537850SAkhilesh Sanikop
718*09537850SAkhilesh Sanikop _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), pred);
719*09537850SAkhilesh Sanikop }
720*09537850SAkhilesh Sanikop
Paeth4x4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)721*09537850SAkhilesh Sanikop void Paeth4x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
722*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
723*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
724*09537850SAkhilesh Sanikop const __m128i left = _mm_cvtepu8_epi32(Load4(left_column));
725*09537850SAkhilesh Sanikop const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
726*09537850SAkhilesh Sanikop
727*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
728*09537850SAkhilesh Sanikop const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
729*09537850SAkhilesh Sanikop
730*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
731*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
732*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
733*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
734*09537850SAkhilesh Sanikop const __m128i top_dists = _mm_abs_epi32(_mm_sub_epi32(left, top_lefts));
735*09537850SAkhilesh Sanikop
736*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
737*09537850SAkhilesh Sanikop const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
738*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
739*09537850SAkhilesh Sanikop WritePaethLine4<0>(dst, top, left, top_lefts, top_dists, left_dists,
740*09537850SAkhilesh Sanikop top_left_diff);
741*09537850SAkhilesh Sanikop dst += stride;
742*09537850SAkhilesh Sanikop WritePaethLine4<0x55>(dst, top, left, top_lefts, top_dists, left_dists,
743*09537850SAkhilesh Sanikop top_left_diff);
744*09537850SAkhilesh Sanikop dst += stride;
745*09537850SAkhilesh Sanikop WritePaethLine4<0xAA>(dst, top, left, top_lefts, top_dists, left_dists,
746*09537850SAkhilesh Sanikop top_left_diff);
747*09537850SAkhilesh Sanikop dst += stride;
748*09537850SAkhilesh Sanikop WritePaethLine4<0xFF>(dst, top, left, top_lefts, top_dists, left_dists,
749*09537850SAkhilesh Sanikop top_left_diff);
750*09537850SAkhilesh Sanikop }
751*09537850SAkhilesh Sanikop
Paeth4x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)752*09537850SAkhilesh Sanikop void Paeth4x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
753*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
754*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
755*09537850SAkhilesh Sanikop const __m128i left = LoadLo8(left_column);
756*09537850SAkhilesh Sanikop const __m128i left_lo = _mm_cvtepu8_epi32(left);
757*09537850SAkhilesh Sanikop const __m128i left_hi = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
758*09537850SAkhilesh Sanikop
759*09537850SAkhilesh Sanikop const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
760*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
761*09537850SAkhilesh Sanikop const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
762*09537850SAkhilesh Sanikop
763*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
764*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
765*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
766*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
767*09537850SAkhilesh Sanikop const __m128i top_dists_lo = _mm_abs_epi32(_mm_sub_epi32(left_lo, top_lefts));
768*09537850SAkhilesh Sanikop const __m128i top_dists_hi = _mm_abs_epi32(_mm_sub_epi32(left_hi, top_lefts));
769*09537850SAkhilesh Sanikop
770*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
771*09537850SAkhilesh Sanikop const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
772*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
773*09537850SAkhilesh Sanikop WritePaethLine4<0>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
774*09537850SAkhilesh Sanikop top_left_diff);
775*09537850SAkhilesh Sanikop dst += stride;
776*09537850SAkhilesh Sanikop WritePaethLine4<0x55>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
777*09537850SAkhilesh Sanikop top_left_diff);
778*09537850SAkhilesh Sanikop dst += stride;
779*09537850SAkhilesh Sanikop WritePaethLine4<0xAA>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
780*09537850SAkhilesh Sanikop top_left_diff);
781*09537850SAkhilesh Sanikop dst += stride;
782*09537850SAkhilesh Sanikop WritePaethLine4<0xFF>(dst, top, left_lo, top_lefts, top_dists_lo, left_dists,
783*09537850SAkhilesh Sanikop top_left_diff);
784*09537850SAkhilesh Sanikop dst += stride;
785*09537850SAkhilesh Sanikop WritePaethLine4<0>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
786*09537850SAkhilesh Sanikop top_left_diff);
787*09537850SAkhilesh Sanikop dst += stride;
788*09537850SAkhilesh Sanikop WritePaethLine4<0x55>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
789*09537850SAkhilesh Sanikop top_left_diff);
790*09537850SAkhilesh Sanikop dst += stride;
791*09537850SAkhilesh Sanikop WritePaethLine4<0xAA>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
792*09537850SAkhilesh Sanikop top_left_diff);
793*09537850SAkhilesh Sanikop dst += stride;
794*09537850SAkhilesh Sanikop WritePaethLine4<0xFF>(dst, top, left_hi, top_lefts, top_dists_hi, left_dists,
795*09537850SAkhilesh Sanikop top_left_diff);
796*09537850SAkhilesh Sanikop }
797*09537850SAkhilesh Sanikop
Paeth4x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)798*09537850SAkhilesh Sanikop void Paeth4x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
799*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
800*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
801*09537850SAkhilesh Sanikop const __m128i left = LoadUnaligned16(left_column);
802*09537850SAkhilesh Sanikop const __m128i left_0 = _mm_cvtepu8_epi32(left);
803*09537850SAkhilesh Sanikop const __m128i left_1 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 4));
804*09537850SAkhilesh Sanikop const __m128i left_2 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 8));
805*09537850SAkhilesh Sanikop const __m128i left_3 = _mm_cvtepu8_epi32(_mm_srli_si128(left, 12));
806*09537850SAkhilesh Sanikop
807*09537850SAkhilesh Sanikop const __m128i top = _mm_cvtepu8_epi32(Load4(top_row));
808*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
809*09537850SAkhilesh Sanikop const __m128i top_lefts = _mm_set1_epi32(top_ptr[-1]);
810*09537850SAkhilesh Sanikop
811*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
812*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
813*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
814*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_abs_epi32(_mm_sub_epi32(top, top_lefts));
815*09537850SAkhilesh Sanikop const __m128i top_dists_0 = _mm_abs_epi32(_mm_sub_epi32(left_0, top_lefts));
816*09537850SAkhilesh Sanikop const __m128i top_dists_1 = _mm_abs_epi32(_mm_sub_epi32(left_1, top_lefts));
817*09537850SAkhilesh Sanikop const __m128i top_dists_2 = _mm_abs_epi32(_mm_sub_epi32(left_2, top_lefts));
818*09537850SAkhilesh Sanikop const __m128i top_dists_3 = _mm_abs_epi32(_mm_sub_epi32(left_3, top_lefts));
819*09537850SAkhilesh Sanikop
820*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi32(top_lefts, top_lefts);
821*09537850SAkhilesh Sanikop const __m128i top_left_diff = _mm_sub_epi32(top, top_left_x2);
822*09537850SAkhilesh Sanikop
823*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
824*09537850SAkhilesh Sanikop WritePaethLine4<0>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
825*09537850SAkhilesh Sanikop top_left_diff);
826*09537850SAkhilesh Sanikop dst += stride;
827*09537850SAkhilesh Sanikop WritePaethLine4<0x55>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
828*09537850SAkhilesh Sanikop top_left_diff);
829*09537850SAkhilesh Sanikop dst += stride;
830*09537850SAkhilesh Sanikop WritePaethLine4<0xAA>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
831*09537850SAkhilesh Sanikop top_left_diff);
832*09537850SAkhilesh Sanikop dst += stride;
833*09537850SAkhilesh Sanikop WritePaethLine4<0xFF>(dst, top, left_0, top_lefts, top_dists_0, left_dists,
834*09537850SAkhilesh Sanikop top_left_diff);
835*09537850SAkhilesh Sanikop dst += stride;
836*09537850SAkhilesh Sanikop WritePaethLine4<0>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
837*09537850SAkhilesh Sanikop top_left_diff);
838*09537850SAkhilesh Sanikop dst += stride;
839*09537850SAkhilesh Sanikop WritePaethLine4<0x55>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
840*09537850SAkhilesh Sanikop top_left_diff);
841*09537850SAkhilesh Sanikop dst += stride;
842*09537850SAkhilesh Sanikop WritePaethLine4<0xAA>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
843*09537850SAkhilesh Sanikop top_left_diff);
844*09537850SAkhilesh Sanikop dst += stride;
845*09537850SAkhilesh Sanikop WritePaethLine4<0xFF>(dst, top, left_1, top_lefts, top_dists_1, left_dists,
846*09537850SAkhilesh Sanikop top_left_diff);
847*09537850SAkhilesh Sanikop dst += stride;
848*09537850SAkhilesh Sanikop WritePaethLine4<0>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
849*09537850SAkhilesh Sanikop top_left_diff);
850*09537850SAkhilesh Sanikop dst += stride;
851*09537850SAkhilesh Sanikop WritePaethLine4<0x55>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
852*09537850SAkhilesh Sanikop top_left_diff);
853*09537850SAkhilesh Sanikop dst += stride;
854*09537850SAkhilesh Sanikop WritePaethLine4<0xAA>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
855*09537850SAkhilesh Sanikop top_left_diff);
856*09537850SAkhilesh Sanikop dst += stride;
857*09537850SAkhilesh Sanikop WritePaethLine4<0xFF>(dst, top, left_2, top_lefts, top_dists_2, left_dists,
858*09537850SAkhilesh Sanikop top_left_diff);
859*09537850SAkhilesh Sanikop dst += stride;
860*09537850SAkhilesh Sanikop WritePaethLine4<0>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
861*09537850SAkhilesh Sanikop top_left_diff);
862*09537850SAkhilesh Sanikop dst += stride;
863*09537850SAkhilesh Sanikop WritePaethLine4<0x55>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
864*09537850SAkhilesh Sanikop top_left_diff);
865*09537850SAkhilesh Sanikop dst += stride;
866*09537850SAkhilesh Sanikop WritePaethLine4<0xAA>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
867*09537850SAkhilesh Sanikop top_left_diff);
868*09537850SAkhilesh Sanikop dst += stride;
869*09537850SAkhilesh Sanikop WritePaethLine4<0xFF>(dst, top, left_3, top_lefts, top_dists_3, left_dists,
870*09537850SAkhilesh Sanikop top_left_diff);
871*09537850SAkhilesh Sanikop }
872*09537850SAkhilesh Sanikop
Paeth8x4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)873*09537850SAkhilesh Sanikop void Paeth8x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
874*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
875*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
876*09537850SAkhilesh Sanikop const __m128i left = _mm_cvtepu8_epi16(Load4(left_column));
877*09537850SAkhilesh Sanikop const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
878*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
879*09537850SAkhilesh Sanikop const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
880*09537850SAkhilesh Sanikop
881*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
882*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
883*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
884*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
885*09537850SAkhilesh Sanikop const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
886*09537850SAkhilesh Sanikop
887*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
888*09537850SAkhilesh Sanikop const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
889*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
890*09537850SAkhilesh Sanikop WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
891*09537850SAkhilesh Sanikop top_left_diff);
892*09537850SAkhilesh Sanikop dst += stride;
893*09537850SAkhilesh Sanikop WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
894*09537850SAkhilesh Sanikop top_left_diff);
895*09537850SAkhilesh Sanikop dst += stride;
896*09537850SAkhilesh Sanikop WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
897*09537850SAkhilesh Sanikop top_left_diff);
898*09537850SAkhilesh Sanikop dst += stride;
899*09537850SAkhilesh Sanikop WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
900*09537850SAkhilesh Sanikop top_left_diff);
901*09537850SAkhilesh Sanikop }
902*09537850SAkhilesh Sanikop
Paeth8x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)903*09537850SAkhilesh Sanikop void Paeth8x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
904*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
905*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
906*09537850SAkhilesh Sanikop const __m128i left = _mm_cvtepu8_epi16(LoadLo8(left_column));
907*09537850SAkhilesh Sanikop const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
908*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
909*09537850SAkhilesh Sanikop const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
910*09537850SAkhilesh Sanikop
911*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
912*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
913*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
914*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
915*09537850SAkhilesh Sanikop const __m128i top_dists = _mm_abs_epi16(_mm_sub_epi16(left, top_lefts));
916*09537850SAkhilesh Sanikop
917*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
918*09537850SAkhilesh Sanikop const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
919*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
920*09537850SAkhilesh Sanikop WritePaethLine8<0x01000100>(dst, top, left, top_lefts, top_dists, left_dists,
921*09537850SAkhilesh Sanikop top_left_diff);
922*09537850SAkhilesh Sanikop dst += stride;
923*09537850SAkhilesh Sanikop WritePaethLine8<0x03020302>(dst, top, left, top_lefts, top_dists, left_dists,
924*09537850SAkhilesh Sanikop top_left_diff);
925*09537850SAkhilesh Sanikop dst += stride;
926*09537850SAkhilesh Sanikop WritePaethLine8<0x05040504>(dst, top, left, top_lefts, top_dists, left_dists,
927*09537850SAkhilesh Sanikop top_left_diff);
928*09537850SAkhilesh Sanikop dst += stride;
929*09537850SAkhilesh Sanikop WritePaethLine8<0x07060706>(dst, top, left, top_lefts, top_dists, left_dists,
930*09537850SAkhilesh Sanikop top_left_diff);
931*09537850SAkhilesh Sanikop dst += stride;
932*09537850SAkhilesh Sanikop WritePaethLine8<0x09080908>(dst, top, left, top_lefts, top_dists, left_dists,
933*09537850SAkhilesh Sanikop top_left_diff);
934*09537850SAkhilesh Sanikop dst += stride;
935*09537850SAkhilesh Sanikop WritePaethLine8<0x0B0A0B0A>(dst, top, left, top_lefts, top_dists, left_dists,
936*09537850SAkhilesh Sanikop top_left_diff);
937*09537850SAkhilesh Sanikop dst += stride;
938*09537850SAkhilesh Sanikop WritePaethLine8<0x0D0C0D0C>(dst, top, left, top_lefts, top_dists, left_dists,
939*09537850SAkhilesh Sanikop top_left_diff);
940*09537850SAkhilesh Sanikop dst += stride;
941*09537850SAkhilesh Sanikop WritePaethLine8<0x0F0E0F0E>(dst, top, left, top_lefts, top_dists, left_dists,
942*09537850SAkhilesh Sanikop top_left_diff);
943*09537850SAkhilesh Sanikop }
944*09537850SAkhilesh Sanikop
Paeth8x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)945*09537850SAkhilesh Sanikop void Paeth8x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
946*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
947*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
948*09537850SAkhilesh Sanikop const __m128i left = LoadUnaligned16(left_column);
949*09537850SAkhilesh Sanikop const __m128i left_lo = _mm_cvtepu8_epi16(left);
950*09537850SAkhilesh Sanikop const __m128i left_hi = _mm_cvtepu8_epi16(_mm_srli_si128(left, 8));
951*09537850SAkhilesh Sanikop const __m128i top = _mm_cvtepu8_epi16(LoadLo8(top_row));
952*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
953*09537850SAkhilesh Sanikop const __m128i top_lefts = _mm_set1_epi16(top_ptr[-1]);
954*09537850SAkhilesh Sanikop
955*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
956*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
957*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
958*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_abs_epi16(_mm_sub_epi16(top, top_lefts));
959*09537850SAkhilesh Sanikop const __m128i top_dists_lo = _mm_abs_epi16(_mm_sub_epi16(left_lo, top_lefts));
960*09537850SAkhilesh Sanikop const __m128i top_dists_hi = _mm_abs_epi16(_mm_sub_epi16(left_hi, top_lefts));
961*09537850SAkhilesh Sanikop
962*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi16(top_lefts, top_lefts);
963*09537850SAkhilesh Sanikop const __m128i top_left_diff = _mm_sub_epi16(top, top_left_x2);
964*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
965*09537850SAkhilesh Sanikop WritePaethLine8<0x01000100>(dst, top, left_lo, top_lefts, top_dists_lo,
966*09537850SAkhilesh Sanikop left_dists, top_left_diff);
967*09537850SAkhilesh Sanikop dst += stride;
968*09537850SAkhilesh Sanikop WritePaethLine8<0x03020302>(dst, top, left_lo, top_lefts, top_dists_lo,
969*09537850SAkhilesh Sanikop left_dists, top_left_diff);
970*09537850SAkhilesh Sanikop dst += stride;
971*09537850SAkhilesh Sanikop WritePaethLine8<0x05040504>(dst, top, left_lo, top_lefts, top_dists_lo,
972*09537850SAkhilesh Sanikop left_dists, top_left_diff);
973*09537850SAkhilesh Sanikop dst += stride;
974*09537850SAkhilesh Sanikop WritePaethLine8<0x07060706>(dst, top, left_lo, top_lefts, top_dists_lo,
975*09537850SAkhilesh Sanikop left_dists, top_left_diff);
976*09537850SAkhilesh Sanikop dst += stride;
977*09537850SAkhilesh Sanikop WritePaethLine8<0x09080908>(dst, top, left_lo, top_lefts, top_dists_lo,
978*09537850SAkhilesh Sanikop left_dists, top_left_diff);
979*09537850SAkhilesh Sanikop dst += stride;
980*09537850SAkhilesh Sanikop WritePaethLine8<0x0B0A0B0A>(dst, top, left_lo, top_lefts, top_dists_lo,
981*09537850SAkhilesh Sanikop left_dists, top_left_diff);
982*09537850SAkhilesh Sanikop dst += stride;
983*09537850SAkhilesh Sanikop WritePaethLine8<0x0D0C0D0C>(dst, top, left_lo, top_lefts, top_dists_lo,
984*09537850SAkhilesh Sanikop left_dists, top_left_diff);
985*09537850SAkhilesh Sanikop dst += stride;
986*09537850SAkhilesh Sanikop WritePaethLine8<0x0F0E0F0E>(dst, top, left_lo, top_lefts, top_dists_lo,
987*09537850SAkhilesh Sanikop left_dists, top_left_diff);
988*09537850SAkhilesh Sanikop dst += stride;
989*09537850SAkhilesh Sanikop WritePaethLine8<0x01000100>(dst, top, left_hi, top_lefts, top_dists_hi,
990*09537850SAkhilesh Sanikop left_dists, top_left_diff);
991*09537850SAkhilesh Sanikop dst += stride;
992*09537850SAkhilesh Sanikop WritePaethLine8<0x03020302>(dst, top, left_hi, top_lefts, top_dists_hi,
993*09537850SAkhilesh Sanikop left_dists, top_left_diff);
994*09537850SAkhilesh Sanikop dst += stride;
995*09537850SAkhilesh Sanikop WritePaethLine8<0x05040504>(dst, top, left_hi, top_lefts, top_dists_hi,
996*09537850SAkhilesh Sanikop left_dists, top_left_diff);
997*09537850SAkhilesh Sanikop dst += stride;
998*09537850SAkhilesh Sanikop WritePaethLine8<0x07060706>(dst, top, left_hi, top_lefts, top_dists_hi,
999*09537850SAkhilesh Sanikop left_dists, top_left_diff);
1000*09537850SAkhilesh Sanikop dst += stride;
1001*09537850SAkhilesh Sanikop WritePaethLine8<0x09080908>(dst, top, left_hi, top_lefts, top_dists_hi,
1002*09537850SAkhilesh Sanikop left_dists, top_left_diff);
1003*09537850SAkhilesh Sanikop dst += stride;
1004*09537850SAkhilesh Sanikop WritePaethLine8<0x0B0A0B0A>(dst, top, left_hi, top_lefts, top_dists_hi,
1005*09537850SAkhilesh Sanikop left_dists, top_left_diff);
1006*09537850SAkhilesh Sanikop dst += stride;
1007*09537850SAkhilesh Sanikop WritePaethLine8<0x0D0C0D0C>(dst, top, left_hi, top_lefts, top_dists_hi,
1008*09537850SAkhilesh Sanikop left_dists, top_left_diff);
1009*09537850SAkhilesh Sanikop dst += stride;
1010*09537850SAkhilesh Sanikop WritePaethLine8<0x0F0E0F0E>(dst, top, left_hi, top_lefts, top_dists_hi,
1011*09537850SAkhilesh Sanikop left_dists, top_left_diff);
1012*09537850SAkhilesh Sanikop }
1013*09537850SAkhilesh Sanikop
Paeth8x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1014*09537850SAkhilesh Sanikop void Paeth8x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1015*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1016*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1017*09537850SAkhilesh Sanikop const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1018*09537850SAkhilesh Sanikop auto* const dst = static_cast<uint8_t*>(dest);
1019*09537850SAkhilesh Sanikop Paeth8x16_SSE4_1(dst, stride, top_row, left_column);
1020*09537850SAkhilesh Sanikop Paeth8x16_SSE4_1(dst + (stride << 4), stride, top_row, left_ptr + 16);
1021*09537850SAkhilesh Sanikop }
1022*09537850SAkhilesh Sanikop
Paeth16x4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1023*09537850SAkhilesh Sanikop void Paeth16x4_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1024*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1025*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1026*09537850SAkhilesh Sanikop const __m128i left = Load4(left_column);
1027*09537850SAkhilesh Sanikop const __m128i top = LoadUnaligned16(top_row);
1028*09537850SAkhilesh Sanikop const __m128i top_lo = _mm_cvtepu8_epi16(top);
1029*09537850SAkhilesh Sanikop const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1030*09537850SAkhilesh Sanikop
1031*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1032*09537850SAkhilesh Sanikop const __m128i top_lefts16 = _mm_set1_epi16(top_ptr[-1]);
1033*09537850SAkhilesh Sanikop const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_ptr[-1]));
1034*09537850SAkhilesh Sanikop
1035*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
1036*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
1037*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
1038*09537850SAkhilesh Sanikop
1039*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
1040*09537850SAkhilesh Sanikop _mm_subs_epu8(top_lefts8, top));
1041*09537850SAkhilesh Sanikop const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
1042*09537850SAkhilesh Sanikop const __m128i left_dists_hi =
1043*09537850SAkhilesh Sanikop _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
1044*09537850SAkhilesh Sanikop const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
1045*09537850SAkhilesh Sanikop _mm_subs_epu8(top_lefts8, left));
1046*09537850SAkhilesh Sanikop
1047*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
1048*09537850SAkhilesh Sanikop const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
1049*09537850SAkhilesh Sanikop const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
1050*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1051*09537850SAkhilesh Sanikop WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
1052*09537850SAkhilesh Sanikop left_dists_lo, left_dists_hi, top_left_diff_lo,
1053*09537850SAkhilesh Sanikop top_left_diff_hi);
1054*09537850SAkhilesh Sanikop dst += stride;
1055*09537850SAkhilesh Sanikop WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
1056*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1057*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1058*09537850SAkhilesh Sanikop dst += stride;
1059*09537850SAkhilesh Sanikop WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
1060*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1061*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1062*09537850SAkhilesh Sanikop dst += stride;
1063*09537850SAkhilesh Sanikop WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
1064*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1065*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1066*09537850SAkhilesh Sanikop }
1067*09537850SAkhilesh Sanikop
1068*09537850SAkhilesh Sanikop // Inlined for calling with offsets in larger transform sizes, mainly to
1069*09537850SAkhilesh Sanikop // preserve top_left.
WritePaeth16x8(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const uint8_t top_left,const __m128i top,const __m128i left)1070*09537850SAkhilesh Sanikop inline void WritePaeth16x8(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1071*09537850SAkhilesh Sanikop const uint8_t top_left, const __m128i top,
1072*09537850SAkhilesh Sanikop const __m128i left) {
1073*09537850SAkhilesh Sanikop const __m128i top_lo = _mm_cvtepu8_epi16(top);
1074*09537850SAkhilesh Sanikop const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1075*09537850SAkhilesh Sanikop
1076*09537850SAkhilesh Sanikop const __m128i top_lefts16 = _mm_set1_epi16(top_left);
1077*09537850SAkhilesh Sanikop const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
1078*09537850SAkhilesh Sanikop
1079*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top_left,
1080*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
1081*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
1082*09537850SAkhilesh Sanikop
1083*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
1084*09537850SAkhilesh Sanikop _mm_subs_epu8(top_lefts8, top));
1085*09537850SAkhilesh Sanikop const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
1086*09537850SAkhilesh Sanikop const __m128i left_dists_hi =
1087*09537850SAkhilesh Sanikop _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
1088*09537850SAkhilesh Sanikop const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
1089*09537850SAkhilesh Sanikop _mm_subs_epu8(top_lefts8, left));
1090*09537850SAkhilesh Sanikop
1091*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
1092*09537850SAkhilesh Sanikop const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
1093*09537850SAkhilesh Sanikop const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
1094*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1095*09537850SAkhilesh Sanikop WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
1096*09537850SAkhilesh Sanikop left_dists_lo, left_dists_hi, top_left_diff_lo,
1097*09537850SAkhilesh Sanikop top_left_diff_hi);
1098*09537850SAkhilesh Sanikop dst += stride;
1099*09537850SAkhilesh Sanikop WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
1100*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1101*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1102*09537850SAkhilesh Sanikop dst += stride;
1103*09537850SAkhilesh Sanikop WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
1104*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1105*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1106*09537850SAkhilesh Sanikop dst += stride;
1107*09537850SAkhilesh Sanikop WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
1108*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1109*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1110*09537850SAkhilesh Sanikop dst += stride;
1111*09537850SAkhilesh Sanikop WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
1112*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1113*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1114*09537850SAkhilesh Sanikop dst += stride;
1115*09537850SAkhilesh Sanikop WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
1116*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1117*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1118*09537850SAkhilesh Sanikop dst += stride;
1119*09537850SAkhilesh Sanikop WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
1120*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1121*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1122*09537850SAkhilesh Sanikop dst += stride;
1123*09537850SAkhilesh Sanikop WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
1124*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1125*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1126*09537850SAkhilesh Sanikop }
1127*09537850SAkhilesh Sanikop
Paeth16x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1128*09537850SAkhilesh Sanikop void Paeth16x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1129*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1130*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1131*09537850SAkhilesh Sanikop const __m128i top = LoadUnaligned16(top_row);
1132*09537850SAkhilesh Sanikop const __m128i left = LoadLo8(left_column);
1133*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1134*09537850SAkhilesh Sanikop WritePaeth16x8(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
1135*09537850SAkhilesh Sanikop }
1136*09537850SAkhilesh Sanikop
WritePaeth16x16(void * const dest,ptrdiff_t stride,const uint8_t top_left,const __m128i top,const __m128i left)1137*09537850SAkhilesh Sanikop void WritePaeth16x16(void* const dest, ptrdiff_t stride, const uint8_t top_left,
1138*09537850SAkhilesh Sanikop const __m128i top, const __m128i left) {
1139*09537850SAkhilesh Sanikop const __m128i top_lo = _mm_cvtepu8_epi16(top);
1140*09537850SAkhilesh Sanikop const __m128i top_hi = _mm_cvtepu8_epi16(_mm_srli_si128(top, 8));
1141*09537850SAkhilesh Sanikop
1142*09537850SAkhilesh Sanikop const __m128i top_lefts16 = _mm_set1_epi16(top_left);
1143*09537850SAkhilesh Sanikop const __m128i top_lefts8 = _mm_set1_epi8(static_cast<int8_t>(top_left));
1144*09537850SAkhilesh Sanikop
1145*09537850SAkhilesh Sanikop // Given that the spec defines "base" as top[x] + left[y] - top[-1],
1146*09537850SAkhilesh Sanikop // pLeft = abs(base - left[y]) = abs(top[x] - top[-1])
1147*09537850SAkhilesh Sanikop // pTop = abs(base - top[x]) = abs(left[y] - top[-1])
1148*09537850SAkhilesh Sanikop
1149*09537850SAkhilesh Sanikop const __m128i left_dists = _mm_or_si128(_mm_subs_epu8(top, top_lefts8),
1150*09537850SAkhilesh Sanikop _mm_subs_epu8(top_lefts8, top));
1151*09537850SAkhilesh Sanikop const __m128i left_dists_lo = _mm_cvtepu8_epi16(left_dists);
1152*09537850SAkhilesh Sanikop const __m128i left_dists_hi =
1153*09537850SAkhilesh Sanikop _mm_cvtepu8_epi16(_mm_srli_si128(left_dists, 8));
1154*09537850SAkhilesh Sanikop const __m128i top_dists = _mm_or_si128(_mm_subs_epu8(left, top_lefts8),
1155*09537850SAkhilesh Sanikop _mm_subs_epu8(top_lefts8, left));
1156*09537850SAkhilesh Sanikop
1157*09537850SAkhilesh Sanikop const __m128i top_left_x2 = _mm_add_epi16(top_lefts16, top_lefts16);
1158*09537850SAkhilesh Sanikop const __m128i top_left_diff_lo = _mm_sub_epi16(top_lo, top_left_x2);
1159*09537850SAkhilesh Sanikop const __m128i top_left_diff_hi = _mm_sub_epi16(top_hi, top_left_x2);
1160*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1161*09537850SAkhilesh Sanikop WritePaethLine16<0>(dst, top, left, top_lefts8, top_dists, left_dists,
1162*09537850SAkhilesh Sanikop left_dists_lo, left_dists_hi, top_left_diff_lo,
1163*09537850SAkhilesh Sanikop top_left_diff_hi);
1164*09537850SAkhilesh Sanikop dst += stride;
1165*09537850SAkhilesh Sanikop WritePaethLine16<0x01010101>(dst, top, left, top_lefts8, top_dists,
1166*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1167*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1168*09537850SAkhilesh Sanikop dst += stride;
1169*09537850SAkhilesh Sanikop WritePaethLine16<0x02020202>(dst, top, left, top_lefts8, top_dists,
1170*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1171*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1172*09537850SAkhilesh Sanikop dst += stride;
1173*09537850SAkhilesh Sanikop WritePaethLine16<0x03030303>(dst, top, left, top_lefts8, top_dists,
1174*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1175*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1176*09537850SAkhilesh Sanikop dst += stride;
1177*09537850SAkhilesh Sanikop WritePaethLine16<0x04040404>(dst, top, left, top_lefts8, top_dists,
1178*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1179*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1180*09537850SAkhilesh Sanikop dst += stride;
1181*09537850SAkhilesh Sanikop WritePaethLine16<0x05050505>(dst, top, left, top_lefts8, top_dists,
1182*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1183*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1184*09537850SAkhilesh Sanikop dst += stride;
1185*09537850SAkhilesh Sanikop WritePaethLine16<0x06060606>(dst, top, left, top_lefts8, top_dists,
1186*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1187*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1188*09537850SAkhilesh Sanikop dst += stride;
1189*09537850SAkhilesh Sanikop WritePaethLine16<0x07070707>(dst, top, left, top_lefts8, top_dists,
1190*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1191*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1192*09537850SAkhilesh Sanikop dst += stride;
1193*09537850SAkhilesh Sanikop WritePaethLine16<0x08080808>(dst, top, left, top_lefts8, top_dists,
1194*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1195*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1196*09537850SAkhilesh Sanikop dst += stride;
1197*09537850SAkhilesh Sanikop WritePaethLine16<0x09090909>(dst, top, left, top_lefts8, top_dists,
1198*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1199*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1200*09537850SAkhilesh Sanikop dst += stride;
1201*09537850SAkhilesh Sanikop WritePaethLine16<0x0A0A0A0A>(dst, top, left, top_lefts8, top_dists,
1202*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1203*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1204*09537850SAkhilesh Sanikop dst += stride;
1205*09537850SAkhilesh Sanikop WritePaethLine16<0x0B0B0B0B>(dst, top, left, top_lefts8, top_dists,
1206*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1207*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1208*09537850SAkhilesh Sanikop dst += stride;
1209*09537850SAkhilesh Sanikop WritePaethLine16<0x0C0C0C0C>(dst, top, left, top_lefts8, top_dists,
1210*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1211*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1212*09537850SAkhilesh Sanikop dst += stride;
1213*09537850SAkhilesh Sanikop WritePaethLine16<0x0D0D0D0D>(dst, top, left, top_lefts8, top_dists,
1214*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1215*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1216*09537850SAkhilesh Sanikop dst += stride;
1217*09537850SAkhilesh Sanikop WritePaethLine16<0x0E0E0E0E>(dst, top, left, top_lefts8, top_dists,
1218*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1219*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1220*09537850SAkhilesh Sanikop dst += stride;
1221*09537850SAkhilesh Sanikop WritePaethLine16<0x0F0F0F0F>(dst, top, left, top_lefts8, top_dists,
1222*09537850SAkhilesh Sanikop left_dists, left_dists_lo, left_dists_hi,
1223*09537850SAkhilesh Sanikop top_left_diff_lo, top_left_diff_hi);
1224*09537850SAkhilesh Sanikop }
1225*09537850SAkhilesh Sanikop
Paeth16x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1226*09537850SAkhilesh Sanikop void Paeth16x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1227*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1228*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1229*09537850SAkhilesh Sanikop const __m128i left = LoadUnaligned16(left_column);
1230*09537850SAkhilesh Sanikop const __m128i top = LoadUnaligned16(top_row);
1231*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1232*09537850SAkhilesh Sanikop WritePaeth16x16(static_cast<uint8_t*>(dest), stride, top_ptr[-1], top, left);
1233*09537850SAkhilesh Sanikop }
1234*09537850SAkhilesh Sanikop
Paeth16x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1235*09537850SAkhilesh Sanikop void Paeth16x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1236*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1237*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1238*09537850SAkhilesh Sanikop const __m128i left_0 = LoadUnaligned16(left_column);
1239*09537850SAkhilesh Sanikop const __m128i top = LoadUnaligned16(top_row);
1240*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1241*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1242*09537850SAkhilesh Sanikop auto* const dst = static_cast<uint8_t*>(dest);
1243*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top, left_0);
1244*09537850SAkhilesh Sanikop const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1245*09537850SAkhilesh Sanikop const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1246*09537850SAkhilesh Sanikop WritePaeth16x16(dst + (stride << 4), stride, top_left, top, left_1);
1247*09537850SAkhilesh Sanikop }
1248*09537850SAkhilesh Sanikop
Paeth16x64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1249*09537850SAkhilesh Sanikop void Paeth16x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1250*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1251*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1252*09537850SAkhilesh Sanikop const ptrdiff_t stride16 = stride << 4;
1253*09537850SAkhilesh Sanikop const __m128i left_0 = LoadUnaligned16(left_column);
1254*09537850SAkhilesh Sanikop const __m128i top = LoadUnaligned16(top_row);
1255*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1256*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1257*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1258*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top, left_0);
1259*09537850SAkhilesh Sanikop dst += stride16;
1260*09537850SAkhilesh Sanikop const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1261*09537850SAkhilesh Sanikop const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1262*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top, left_1);
1263*09537850SAkhilesh Sanikop dst += stride16;
1264*09537850SAkhilesh Sanikop const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
1265*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top, left_2);
1266*09537850SAkhilesh Sanikop dst += stride16;
1267*09537850SAkhilesh Sanikop const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
1268*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top, left_3);
1269*09537850SAkhilesh Sanikop }
1270*09537850SAkhilesh Sanikop
Paeth32x8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1271*09537850SAkhilesh Sanikop void Paeth32x8_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1272*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1273*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1274*09537850SAkhilesh Sanikop const __m128i left = LoadLo8(left_column);
1275*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1276*09537850SAkhilesh Sanikop const __m128i top_0 = LoadUnaligned16(top_row);
1277*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1278*09537850SAkhilesh Sanikop auto* const dst = static_cast<uint8_t*>(dest);
1279*09537850SAkhilesh Sanikop WritePaeth16x8(dst, stride, top_left, top_0, left);
1280*09537850SAkhilesh Sanikop const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1281*09537850SAkhilesh Sanikop WritePaeth16x8(dst + 16, stride, top_left, top_1, left);
1282*09537850SAkhilesh Sanikop }
1283*09537850SAkhilesh Sanikop
Paeth32x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1284*09537850SAkhilesh Sanikop void Paeth32x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1285*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1286*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1287*09537850SAkhilesh Sanikop const __m128i left = LoadUnaligned16(left_column);
1288*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1289*09537850SAkhilesh Sanikop const __m128i top_0 = LoadUnaligned16(top_row);
1290*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1291*09537850SAkhilesh Sanikop auto* const dst = static_cast<uint8_t*>(dest);
1292*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left);
1293*09537850SAkhilesh Sanikop const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1294*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
1295*09537850SAkhilesh Sanikop }
1296*09537850SAkhilesh Sanikop
Paeth32x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1297*09537850SAkhilesh Sanikop void Paeth32x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1298*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1299*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1300*09537850SAkhilesh Sanikop const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1301*09537850SAkhilesh Sanikop const __m128i left_0 = LoadUnaligned16(left_ptr);
1302*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1303*09537850SAkhilesh Sanikop const __m128i top_0 = LoadUnaligned16(top_ptr);
1304*09537850SAkhilesh Sanikop const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1305*09537850SAkhilesh Sanikop const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1306*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1307*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1308*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1309*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1310*09537850SAkhilesh Sanikop dst += (stride << 4);
1311*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1312*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1313*09537850SAkhilesh Sanikop }
1314*09537850SAkhilesh Sanikop
Paeth32x64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1315*09537850SAkhilesh Sanikop void Paeth32x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1316*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1317*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1318*09537850SAkhilesh Sanikop const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1319*09537850SAkhilesh Sanikop const __m128i left_0 = LoadUnaligned16(left_ptr);
1320*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1321*09537850SAkhilesh Sanikop const __m128i top_0 = LoadUnaligned16(top_ptr);
1322*09537850SAkhilesh Sanikop const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1323*09537850SAkhilesh Sanikop const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1324*09537850SAkhilesh Sanikop const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
1325*09537850SAkhilesh Sanikop const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
1326*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1327*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1328*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1329*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1330*09537850SAkhilesh Sanikop dst += (stride << 4);
1331*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1332*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1333*09537850SAkhilesh Sanikop dst += (stride << 4);
1334*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_2);
1335*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
1336*09537850SAkhilesh Sanikop dst += (stride << 4);
1337*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_3);
1338*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
1339*09537850SAkhilesh Sanikop }
1340*09537850SAkhilesh Sanikop
Paeth64x16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1341*09537850SAkhilesh Sanikop void Paeth64x16_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1342*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1343*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1344*09537850SAkhilesh Sanikop const __m128i left = LoadUnaligned16(left_column);
1345*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1346*09537850SAkhilesh Sanikop const __m128i top_0 = LoadUnaligned16(top_ptr);
1347*09537850SAkhilesh Sanikop const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1348*09537850SAkhilesh Sanikop const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
1349*09537850SAkhilesh Sanikop const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
1350*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1351*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1352*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left);
1353*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left);
1354*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 32, stride, top_left, top_2, left);
1355*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 48, stride, top_left, top_3, left);
1356*09537850SAkhilesh Sanikop }
1357*09537850SAkhilesh Sanikop
Paeth64x32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1358*09537850SAkhilesh Sanikop void Paeth64x32_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1359*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1360*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1361*09537850SAkhilesh Sanikop const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1362*09537850SAkhilesh Sanikop const __m128i left_0 = LoadUnaligned16(left_ptr);
1363*09537850SAkhilesh Sanikop const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1364*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1365*09537850SAkhilesh Sanikop const __m128i top_0 = LoadUnaligned16(top_ptr);
1366*09537850SAkhilesh Sanikop const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1367*09537850SAkhilesh Sanikop const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
1368*09537850SAkhilesh Sanikop const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
1369*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1370*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1371*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1372*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1373*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
1374*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
1375*09537850SAkhilesh Sanikop dst += (stride << 4);
1376*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1377*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1378*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
1379*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
1380*09537850SAkhilesh Sanikop }
1381*09537850SAkhilesh Sanikop
Paeth64x64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)1382*09537850SAkhilesh Sanikop void Paeth64x64_SSE4_1(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1383*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const top_row,
1384*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const left_column) {
1385*09537850SAkhilesh Sanikop const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
1386*09537850SAkhilesh Sanikop const __m128i left_0 = LoadUnaligned16(left_ptr);
1387*09537850SAkhilesh Sanikop const __m128i left_1 = LoadUnaligned16(left_ptr + 16);
1388*09537850SAkhilesh Sanikop const __m128i left_2 = LoadUnaligned16(left_ptr + 32);
1389*09537850SAkhilesh Sanikop const __m128i left_3 = LoadUnaligned16(left_ptr + 48);
1390*09537850SAkhilesh Sanikop const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
1391*09537850SAkhilesh Sanikop const __m128i top_0 = LoadUnaligned16(top_ptr);
1392*09537850SAkhilesh Sanikop const __m128i top_1 = LoadUnaligned16(top_ptr + 16);
1393*09537850SAkhilesh Sanikop const __m128i top_2 = LoadUnaligned16(top_ptr + 32);
1394*09537850SAkhilesh Sanikop const __m128i top_3 = LoadUnaligned16(top_ptr + 48);
1395*09537850SAkhilesh Sanikop const uint8_t top_left = top_ptr[-1];
1396*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1397*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_0);
1398*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_0);
1399*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 32, stride, top_left, top_2, left_0);
1400*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 48, stride, top_left, top_3, left_0);
1401*09537850SAkhilesh Sanikop dst += (stride << 4);
1402*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_1);
1403*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_1);
1404*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 32, stride, top_left, top_2, left_1);
1405*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 48, stride, top_left, top_3, left_1);
1406*09537850SAkhilesh Sanikop dst += (stride << 4);
1407*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_2);
1408*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_2);
1409*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 32, stride, top_left, top_2, left_2);
1410*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 48, stride, top_left, top_3, left_2);
1411*09537850SAkhilesh Sanikop dst += (stride << 4);
1412*09537850SAkhilesh Sanikop WritePaeth16x16(dst, stride, top_left, top_0, left_3);
1413*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 16, stride, top_left, top_1, left_3);
1414*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 32, stride, top_left, top_2, left_3);
1415*09537850SAkhilesh Sanikop WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
1416*09537850SAkhilesh Sanikop }
1417*09537850SAkhilesh Sanikop
Init8bpp()1418*09537850SAkhilesh Sanikop void Init8bpp() {
1419*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1420*09537850SAkhilesh Sanikop assert(dsp != nullptr);
1421*09537850SAkhilesh Sanikop static_cast<void>(dsp);
1422*09537850SAkhilesh Sanikop // These guards check if this version of the function was not superseded by
1423*09537850SAkhilesh Sanikop // a higher optimization level, such as AVX. The corresponding #define also
1424*09537850SAkhilesh Sanikop // prevents the C version from being added to the table.
1425*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
1426*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
1427*09537850SAkhilesh Sanikop DcDefs::_4x4::DcTop;
1428*09537850SAkhilesh Sanikop #endif
1429*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcTop)
1430*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
1431*09537850SAkhilesh Sanikop DcDefs::_4x8::DcTop;
1432*09537850SAkhilesh Sanikop #endif
1433*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcTop)
1434*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
1435*09537850SAkhilesh Sanikop DcDefs::_4x16::DcTop;
1436*09537850SAkhilesh Sanikop #endif
1437*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcTop)
1438*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
1439*09537850SAkhilesh Sanikop DcDefs::_8x4::DcTop;
1440*09537850SAkhilesh Sanikop #endif
1441*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcTop)
1442*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
1443*09537850SAkhilesh Sanikop DcDefs::_8x8::DcTop;
1444*09537850SAkhilesh Sanikop #endif
1445*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcTop)
1446*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
1447*09537850SAkhilesh Sanikop DcDefs::_8x16::DcTop;
1448*09537850SAkhilesh Sanikop #endif
1449*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcTop)
1450*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
1451*09537850SAkhilesh Sanikop DcDefs::_8x32::DcTop;
1452*09537850SAkhilesh Sanikop #endif
1453*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcTop)
1454*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
1455*09537850SAkhilesh Sanikop DcDefs::_16x4::DcTop;
1456*09537850SAkhilesh Sanikop #endif
1457*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcTop)
1458*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
1459*09537850SAkhilesh Sanikop DcDefs::_16x8::DcTop;
1460*09537850SAkhilesh Sanikop #endif
1461*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcTop)
1462*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
1463*09537850SAkhilesh Sanikop DcDefs::_16x16::DcTop;
1464*09537850SAkhilesh Sanikop #endif
1465*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcTop)
1466*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
1467*09537850SAkhilesh Sanikop DcDefs::_16x32::DcTop;
1468*09537850SAkhilesh Sanikop #endif
1469*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcTop)
1470*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
1471*09537850SAkhilesh Sanikop DcDefs::_16x64::DcTop;
1472*09537850SAkhilesh Sanikop #endif
1473*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcTop)
1474*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
1475*09537850SAkhilesh Sanikop DcDefs::_32x8::DcTop;
1476*09537850SAkhilesh Sanikop #endif
1477*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcTop)
1478*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
1479*09537850SAkhilesh Sanikop DcDefs::_32x16::DcTop;
1480*09537850SAkhilesh Sanikop #endif
1481*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcTop)
1482*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
1483*09537850SAkhilesh Sanikop DcDefs::_32x32::DcTop;
1484*09537850SAkhilesh Sanikop #endif
1485*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcTop)
1486*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
1487*09537850SAkhilesh Sanikop DcDefs::_32x64::DcTop;
1488*09537850SAkhilesh Sanikop #endif
1489*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcTop)
1490*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
1491*09537850SAkhilesh Sanikop DcDefs::_64x16::DcTop;
1492*09537850SAkhilesh Sanikop #endif
1493*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcTop)
1494*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
1495*09537850SAkhilesh Sanikop DcDefs::_64x32::DcTop;
1496*09537850SAkhilesh Sanikop #endif
1497*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcTop)
1498*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
1499*09537850SAkhilesh Sanikop DcDefs::_64x64::DcTop;
1500*09537850SAkhilesh Sanikop #endif
1501*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
1502*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
1503*09537850SAkhilesh Sanikop DcDefs::_4x4::DcLeft;
1504*09537850SAkhilesh Sanikop #endif
1505*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDcLeft)
1506*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
1507*09537850SAkhilesh Sanikop DcDefs::_4x8::DcLeft;
1508*09537850SAkhilesh Sanikop #endif
1509*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDcLeft)
1510*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
1511*09537850SAkhilesh Sanikop DcDefs::_4x16::DcLeft;
1512*09537850SAkhilesh Sanikop #endif
1513*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDcLeft)
1514*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
1515*09537850SAkhilesh Sanikop DcDefs::_8x4::DcLeft;
1516*09537850SAkhilesh Sanikop #endif
1517*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDcLeft)
1518*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
1519*09537850SAkhilesh Sanikop DcDefs::_8x8::DcLeft;
1520*09537850SAkhilesh Sanikop #endif
1521*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDcLeft)
1522*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
1523*09537850SAkhilesh Sanikop DcDefs::_8x16::DcLeft;
1524*09537850SAkhilesh Sanikop #endif
1525*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDcLeft)
1526*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
1527*09537850SAkhilesh Sanikop DcDefs::_8x32::DcLeft;
1528*09537850SAkhilesh Sanikop #endif
1529*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDcLeft)
1530*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
1531*09537850SAkhilesh Sanikop DcDefs::_16x4::DcLeft;
1532*09537850SAkhilesh Sanikop #endif
1533*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDcLeft)
1534*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
1535*09537850SAkhilesh Sanikop DcDefs::_16x8::DcLeft;
1536*09537850SAkhilesh Sanikop #endif
1537*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDcLeft)
1538*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
1539*09537850SAkhilesh Sanikop DcDefs::_16x16::DcLeft;
1540*09537850SAkhilesh Sanikop #endif
1541*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDcLeft)
1542*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
1543*09537850SAkhilesh Sanikop DcDefs::_16x32::DcLeft;
1544*09537850SAkhilesh Sanikop #endif
1545*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDcLeft)
1546*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
1547*09537850SAkhilesh Sanikop DcDefs::_16x64::DcLeft;
1548*09537850SAkhilesh Sanikop #endif
1549*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDcLeft)
1550*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
1551*09537850SAkhilesh Sanikop DcDefs::_32x8::DcLeft;
1552*09537850SAkhilesh Sanikop #endif
1553*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDcLeft)
1554*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
1555*09537850SAkhilesh Sanikop DcDefs::_32x16::DcLeft;
1556*09537850SAkhilesh Sanikop #endif
1557*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDcLeft)
1558*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
1559*09537850SAkhilesh Sanikop DcDefs::_32x32::DcLeft;
1560*09537850SAkhilesh Sanikop #endif
1561*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDcLeft)
1562*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
1563*09537850SAkhilesh Sanikop DcDefs::_32x64::DcLeft;
1564*09537850SAkhilesh Sanikop #endif
1565*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDcLeft)
1566*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
1567*09537850SAkhilesh Sanikop DcDefs::_64x16::DcLeft;
1568*09537850SAkhilesh Sanikop #endif
1569*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDcLeft)
1570*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
1571*09537850SAkhilesh Sanikop DcDefs::_64x32::DcLeft;
1572*09537850SAkhilesh Sanikop #endif
1573*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDcLeft)
1574*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
1575*09537850SAkhilesh Sanikop DcDefs::_64x64::DcLeft;
1576*09537850SAkhilesh Sanikop #endif
1577*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
1578*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
1579*09537850SAkhilesh Sanikop DcDefs::_4x4::Dc;
1580*09537850SAkhilesh Sanikop #endif
1581*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorDc)
1582*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
1583*09537850SAkhilesh Sanikop DcDefs::_4x8::Dc;
1584*09537850SAkhilesh Sanikop #endif
1585*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorDc)
1586*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
1587*09537850SAkhilesh Sanikop DcDefs::_4x16::Dc;
1588*09537850SAkhilesh Sanikop #endif
1589*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorDc)
1590*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
1591*09537850SAkhilesh Sanikop DcDefs::_8x4::Dc;
1592*09537850SAkhilesh Sanikop #endif
1593*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorDc)
1594*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
1595*09537850SAkhilesh Sanikop DcDefs::_8x8::Dc;
1596*09537850SAkhilesh Sanikop #endif
1597*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorDc)
1598*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
1599*09537850SAkhilesh Sanikop DcDefs::_8x16::Dc;
1600*09537850SAkhilesh Sanikop #endif
1601*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorDc)
1602*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
1603*09537850SAkhilesh Sanikop DcDefs::_8x32::Dc;
1604*09537850SAkhilesh Sanikop #endif
1605*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorDc)
1606*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
1607*09537850SAkhilesh Sanikop DcDefs::_16x4::Dc;
1608*09537850SAkhilesh Sanikop #endif
1609*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorDc)
1610*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
1611*09537850SAkhilesh Sanikop DcDefs::_16x8::Dc;
1612*09537850SAkhilesh Sanikop #endif
1613*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorDc)
1614*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
1615*09537850SAkhilesh Sanikop DcDefs::_16x16::Dc;
1616*09537850SAkhilesh Sanikop #endif
1617*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorDc)
1618*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
1619*09537850SAkhilesh Sanikop DcDefs::_16x32::Dc;
1620*09537850SAkhilesh Sanikop #endif
1621*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorDc)
1622*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
1623*09537850SAkhilesh Sanikop DcDefs::_16x64::Dc;
1624*09537850SAkhilesh Sanikop #endif
1625*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorDc)
1626*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
1627*09537850SAkhilesh Sanikop DcDefs::_32x8::Dc;
1628*09537850SAkhilesh Sanikop #endif
1629*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorDc)
1630*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
1631*09537850SAkhilesh Sanikop DcDefs::_32x16::Dc;
1632*09537850SAkhilesh Sanikop #endif
1633*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorDc)
1634*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
1635*09537850SAkhilesh Sanikop DcDefs::_32x32::Dc;
1636*09537850SAkhilesh Sanikop #endif
1637*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorDc)
1638*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
1639*09537850SAkhilesh Sanikop DcDefs::_32x64::Dc;
1640*09537850SAkhilesh Sanikop #endif
1641*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorDc)
1642*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
1643*09537850SAkhilesh Sanikop DcDefs::_64x16::Dc;
1644*09537850SAkhilesh Sanikop #endif
1645*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorDc)
1646*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
1647*09537850SAkhilesh Sanikop DcDefs::_64x32::Dc;
1648*09537850SAkhilesh Sanikop #endif
1649*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorDc)
1650*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
1651*09537850SAkhilesh Sanikop DcDefs::_64x64::Dc;
1652*09537850SAkhilesh Sanikop #endif
1653*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorPaeth)
1654*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
1655*09537850SAkhilesh Sanikop Paeth4x4_SSE4_1;
1656*09537850SAkhilesh Sanikop #endif
1657*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorPaeth)
1658*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
1659*09537850SAkhilesh Sanikop Paeth4x8_SSE4_1;
1660*09537850SAkhilesh Sanikop #endif
1661*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorPaeth)
1662*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
1663*09537850SAkhilesh Sanikop Paeth4x16_SSE4_1;
1664*09537850SAkhilesh Sanikop #endif
1665*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorPaeth)
1666*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
1667*09537850SAkhilesh Sanikop Paeth8x4_SSE4_1;
1668*09537850SAkhilesh Sanikop #endif
1669*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorPaeth)
1670*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
1671*09537850SAkhilesh Sanikop Paeth8x8_SSE4_1;
1672*09537850SAkhilesh Sanikop #endif
1673*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorPaeth)
1674*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
1675*09537850SAkhilesh Sanikop Paeth8x16_SSE4_1;
1676*09537850SAkhilesh Sanikop #endif
1677*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorPaeth)
1678*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
1679*09537850SAkhilesh Sanikop Paeth8x32_SSE4_1;
1680*09537850SAkhilesh Sanikop #endif
1681*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorPaeth)
1682*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
1683*09537850SAkhilesh Sanikop Paeth16x4_SSE4_1;
1684*09537850SAkhilesh Sanikop #endif
1685*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorPaeth)
1686*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
1687*09537850SAkhilesh Sanikop Paeth16x8_SSE4_1;
1688*09537850SAkhilesh Sanikop #endif
1689*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorPaeth)
1690*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
1691*09537850SAkhilesh Sanikop Paeth16x16_SSE4_1;
1692*09537850SAkhilesh Sanikop #endif
1693*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorPaeth)
1694*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
1695*09537850SAkhilesh Sanikop Paeth16x32_SSE4_1;
1696*09537850SAkhilesh Sanikop #endif
1697*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorPaeth)
1698*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
1699*09537850SAkhilesh Sanikop Paeth16x64_SSE4_1;
1700*09537850SAkhilesh Sanikop #endif
1701*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorPaeth)
1702*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
1703*09537850SAkhilesh Sanikop Paeth32x8_SSE4_1;
1704*09537850SAkhilesh Sanikop #endif
1705*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorPaeth)
1706*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
1707*09537850SAkhilesh Sanikop Paeth32x16_SSE4_1;
1708*09537850SAkhilesh Sanikop #endif
1709*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorPaeth)
1710*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
1711*09537850SAkhilesh Sanikop Paeth32x32_SSE4_1;
1712*09537850SAkhilesh Sanikop #endif
1713*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorPaeth)
1714*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
1715*09537850SAkhilesh Sanikop Paeth32x64_SSE4_1;
1716*09537850SAkhilesh Sanikop #endif
1717*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorPaeth)
1718*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
1719*09537850SAkhilesh Sanikop Paeth64x16_SSE4_1;
1720*09537850SAkhilesh Sanikop #endif
1721*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorPaeth)
1722*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
1723*09537850SAkhilesh Sanikop Paeth64x32_SSE4_1;
1724*09537850SAkhilesh Sanikop #endif
1725*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorPaeth)
1726*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
1727*09537850SAkhilesh Sanikop Paeth64x64_SSE4_1;
1728*09537850SAkhilesh Sanikop #endif
1729*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
1730*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
1731*09537850SAkhilesh Sanikop DirDefs::_4x4::Horizontal;
1732*09537850SAkhilesh Sanikop #endif
1733*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
1734*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
1735*09537850SAkhilesh Sanikop DirDefs::_4x8::Horizontal;
1736*09537850SAkhilesh Sanikop #endif
1737*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
1738*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
1739*09537850SAkhilesh Sanikop DirDefs::_4x16::Horizontal;
1740*09537850SAkhilesh Sanikop #endif
1741*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
1742*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
1743*09537850SAkhilesh Sanikop DirDefs::_8x4::Horizontal;
1744*09537850SAkhilesh Sanikop #endif
1745*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
1746*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
1747*09537850SAkhilesh Sanikop DirDefs::_8x8::Horizontal;
1748*09537850SAkhilesh Sanikop #endif
1749*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
1750*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
1751*09537850SAkhilesh Sanikop DirDefs::_8x16::Horizontal;
1752*09537850SAkhilesh Sanikop #endif
1753*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
1754*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
1755*09537850SAkhilesh Sanikop DirDefs::_8x32::Horizontal;
1756*09537850SAkhilesh Sanikop #endif
1757*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
1758*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
1759*09537850SAkhilesh Sanikop DirDefs::_16x4::Horizontal;
1760*09537850SAkhilesh Sanikop #endif
1761*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
1762*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
1763*09537850SAkhilesh Sanikop DirDefs::_16x8::Horizontal;
1764*09537850SAkhilesh Sanikop #endif
1765*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
1766*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
1767*09537850SAkhilesh Sanikop DirDefs::_16x16::Horizontal;
1768*09537850SAkhilesh Sanikop #endif
1769*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
1770*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
1771*09537850SAkhilesh Sanikop DirDefs::_16x32::Horizontal;
1772*09537850SAkhilesh Sanikop #endif
1773*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
1774*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
1775*09537850SAkhilesh Sanikop DirDefs::_16x64::Horizontal;
1776*09537850SAkhilesh Sanikop #endif
1777*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
1778*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
1779*09537850SAkhilesh Sanikop DirDefs::_32x8::Horizontal;
1780*09537850SAkhilesh Sanikop #endif
1781*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
1782*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
1783*09537850SAkhilesh Sanikop DirDefs::_32x16::Horizontal;
1784*09537850SAkhilesh Sanikop #endif
1785*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
1786*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
1787*09537850SAkhilesh Sanikop DirDefs::_32x32::Horizontal;
1788*09537850SAkhilesh Sanikop #endif
1789*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
1790*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
1791*09537850SAkhilesh Sanikop DirDefs::_32x64::Horizontal;
1792*09537850SAkhilesh Sanikop #endif
1793*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
1794*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
1795*09537850SAkhilesh Sanikop DirDefs::_64x16::Horizontal;
1796*09537850SAkhilesh Sanikop #endif
1797*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
1798*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
1799*09537850SAkhilesh Sanikop DirDefs::_64x32::Horizontal;
1800*09537850SAkhilesh Sanikop #endif
1801*09537850SAkhilesh Sanikop #if DSP_ENABLED_8BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
1802*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
1803*09537850SAkhilesh Sanikop DirDefs::_64x64::Horizontal;
1804*09537850SAkhilesh Sanikop #endif
1805*09537850SAkhilesh Sanikop } // NOLINT(readability/fn_size)
1806*09537850SAkhilesh Sanikop
1807*09537850SAkhilesh Sanikop } // namespace
1808*09537850SAkhilesh Sanikop } // namespace low_bitdepth
1809*09537850SAkhilesh Sanikop
1810*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
1811*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
1812*09537850SAkhilesh Sanikop namespace high_bitdepth {
1813*09537850SAkhilesh Sanikop namespace {
1814*09537850SAkhilesh Sanikop
1815*09537850SAkhilesh Sanikop template <int height>
DcStore4xH_SSE4_1(void * const dest,ptrdiff_t stride,const __m128i dc)1816*09537850SAkhilesh Sanikop inline void DcStore4xH_SSE4_1(void* const dest, ptrdiff_t stride,
1817*09537850SAkhilesh Sanikop const __m128i dc) {
1818*09537850SAkhilesh Sanikop const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0);
1819*09537850SAkhilesh Sanikop int y = height - 1;
1820*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1821*09537850SAkhilesh Sanikop do {
1822*09537850SAkhilesh Sanikop StoreLo8(dst, dc_dup);
1823*09537850SAkhilesh Sanikop dst += stride;
1824*09537850SAkhilesh Sanikop } while (--y != 0);
1825*09537850SAkhilesh Sanikop StoreLo8(dst, dc_dup);
1826*09537850SAkhilesh Sanikop }
1827*09537850SAkhilesh Sanikop
// WriteDuplicateN assumes dup has 4 32-bit "units," each of which comprises 2
// identical shorts that need N total copies written into dest. The unpacking
// works the same as in the 8bpp case, except that each 32-bit unit needs twice
// as many copies.
//
// WriteDuplicate4x4 writes four rows of four 16-bit pixels (8 bytes per row).
// Row i is filled with the value held in 32-bit unit i of |dup32|. |stride| is
// the distance in bytes between consecutive rows of |dest|.
inline void WriteDuplicate4x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  auto* dst = static_cast<uint8_t*>(dest);

  // Units 0 and 1, each widened to 64 bits (four identical shorts).
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  // Row 0: low 64 bits.
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_lo);
  dst += stride;
  // Row 1: high 64 bits.
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_lo));
  dst += stride;

  // Units 2 and 3, each widened to 64 bits.
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);
  // Row 2: low 64 bits.
  _mm_storel_epi64(reinterpret_cast<__m128i*>(dst), dup64_hi);
  dst += stride;
  // Row 3: high 64 bits.
  _mm_storeh_pi(reinterpret_cast<__m64*>(dst), _mm_castsi128_ps(dup64_hi));
}
1845*09537850SAkhilesh Sanikop
// Writes four rows of eight 16-bit pixels (16 bytes per row) to |dest|.
// Row i is filled with the 16-bit value held in 32-bit unit i of |dup32|
// (each unit is expected to hold two identical shorts). |stride| is the
// distance in bytes between consecutive rows.
inline void WriteDuplicate8x4(void* const dest, ptrdiff_t stride,
                              const __m128i dup32) {
  // Widen each 32-bit unit to 64 bits: units 0-1 land in |dup64_lo| and
  // units 2-3 in |dup64_hi|.
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  // Row 0: unit 0 broadcast to all 128 bits.
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  dst += stride;
  // Row 1: unit 1.
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  dst += stride;
  // Row 2: unit 2.
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  dst += stride;
  // Row 3: unit 3.
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
}
1864*09537850SAkhilesh Sanikop
// Writes four rows of sixteen 16-bit pixels (32 bytes per row) to |dest|.
// Row i is filled with the 16-bit value held in 32-bit unit i of |dup32|
// (each unit is expected to hold two identical shorts). |stride| is the
// distance in bytes between consecutive rows.
inline void WriteDuplicate16x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  // Widen each 32-bit unit to 64 bits: units 0-1 land in |dup64_lo| and
  // units 2-3 in |dup64_hi|.
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  // Row 0: unit 0, two 16-byte stores.
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
  dst += stride;
  // Row 1: unit 1.
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
  dst += stride;
  // Row 2: unit 2.
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
  dst += stride;
  // Row 3: unit 3.
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
}
1887*09537850SAkhilesh Sanikop
// Writes four rows of thirty-two 16-bit pixels (64 bytes per row) to |dest|.
// Row i is filled with the 16-bit value held in 32-bit unit i of |dup32|
// (each unit is expected to hold two identical shorts). |stride| is the
// distance in bytes between consecutive rows.
inline void WriteDuplicate32x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  // Widen each 32-bit unit to 64 bits: units 0-1 land in |dup64_lo| and
  // units 2-3 in |dup64_hi|.
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  // Row 0: unit 0, four 16-byte stores.
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_0);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_0);
  dst += stride;
  // Row 1: unit 1.
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_1);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_1);
  dst += stride;
  // Row 2: unit 2.
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_2);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_2);
  dst += stride;
  // Row 3: unit 3.
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 16), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 32), dup128_3);
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + 48), dup128_3);
}
1918*09537850SAkhilesh Sanikop
// Writes four rows of sixty-four 16-bit pixels (128 bytes per row) to |dest|.
// Row i is filled with the 16-bit value held in 32-bit unit i of |dup32|
// (each unit is expected to hold two identical shorts). |stride| is the
// distance in bytes between consecutive rows.
inline void WriteDuplicate64x4(void* const dest, ptrdiff_t stride,
                               const __m128i dup32) {
  constexpr int kRowBytes = 128;   // 64 pixels * 2 bytes per pixel.
  constexpr int kStoreBytes = 16;  // One __m128i store.

  // Widen each 32-bit unit to 64 bits: units 0-1 land in |dup64_lo| and
  // units 2-3 in |dup64_hi|.
  const __m128i dup64_lo = _mm_unpacklo_epi32(dup32, dup32);
  const __m128i dup64_hi = _mm_unpackhi_epi32(dup32, dup32);

  auto* dst = static_cast<uint8_t*>(dest);
  // Row 0: unit 0.
  const __m128i dup128_0 = _mm_unpacklo_epi64(dup64_lo, dup64_lo);
  for (int x = 0; x < kRowBytes; x += kStoreBytes) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_0);
  }
  dst += stride;
  // Row 1: unit 1.
  const __m128i dup128_1 = _mm_unpackhi_epi64(dup64_lo, dup64_lo);
  for (int x = 0; x < kRowBytes; x += kStoreBytes) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_1);
  }
  dst += stride;
  // Row 2: unit 2.
  const __m128i dup128_2 = _mm_unpacklo_epi64(dup64_hi, dup64_hi);
  for (int x = 0; x < kRowBytes; x += kStoreBytes) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_2);
  }
  dst += stride;
  // Row 3: unit 3.
  const __m128i dup128_3 = _mm_unpackhi_epi64(dup64_hi, dup64_hi);
  for (int x = 0; x < kRowBytes; x += kStoreBytes) {
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), dup128_3);
  }
}
1945*09537850SAkhilesh Sanikop
1946*09537850SAkhilesh Sanikop // ColStoreN<height> copies each of the |height| values in |column| across its
1947*09537850SAkhilesh Sanikop // corresponding row in dest.
1948*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore4_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1949*09537850SAkhilesh Sanikop inline void ColStore4_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1950*09537850SAkhilesh Sanikop ptrdiff_t stride,
1951*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
1952*09537850SAkhilesh Sanikop const __m128i col_data = LoadLo8(column);
1953*09537850SAkhilesh Sanikop const __m128i col_dup32 = _mm_unpacklo_epi16(col_data, col_data);
1954*09537850SAkhilesh Sanikop writefn(dest, stride, col_dup32);
1955*09537850SAkhilesh Sanikop }
1956*09537850SAkhilesh Sanikop
1957*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore8_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1958*09537850SAkhilesh Sanikop inline void ColStore8_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1959*09537850SAkhilesh Sanikop ptrdiff_t stride,
1960*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
1961*09537850SAkhilesh Sanikop const __m128i col_data = LoadUnaligned16(column);
1962*09537850SAkhilesh Sanikop const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
1963*09537850SAkhilesh Sanikop const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
1964*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1965*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lo);
1966*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
1967*09537850SAkhilesh Sanikop dst += stride4;
1968*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hi);
1969*09537850SAkhilesh Sanikop }
1970*09537850SAkhilesh Sanikop
1971*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore16_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1972*09537850SAkhilesh Sanikop inline void ColStore16_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1973*09537850SAkhilesh Sanikop ptrdiff_t stride,
1974*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
1975*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
1976*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1977*09537850SAkhilesh Sanikop for (int y = 0; y < 32; y += 16) {
1978*09537850SAkhilesh Sanikop const __m128i col_data =
1979*09537850SAkhilesh Sanikop LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
1980*09537850SAkhilesh Sanikop const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
1981*09537850SAkhilesh Sanikop const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
1982*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lo);
1983*09537850SAkhilesh Sanikop dst += stride4;
1984*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hi);
1985*09537850SAkhilesh Sanikop dst += stride4;
1986*09537850SAkhilesh Sanikop }
1987*09537850SAkhilesh Sanikop }
1988*09537850SAkhilesh Sanikop
1989*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore32_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)1990*09537850SAkhilesh Sanikop inline void ColStore32_SSE4_1(void* LIBGAV1_RESTRICT const dest,
1991*09537850SAkhilesh Sanikop ptrdiff_t stride,
1992*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
1993*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
1994*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
1995*09537850SAkhilesh Sanikop for (int y = 0; y < 64; y += 16) {
1996*09537850SAkhilesh Sanikop const __m128i col_data =
1997*09537850SAkhilesh Sanikop LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
1998*09537850SAkhilesh Sanikop const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
1999*09537850SAkhilesh Sanikop const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
2000*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lo);
2001*09537850SAkhilesh Sanikop dst += stride4;
2002*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hi);
2003*09537850SAkhilesh Sanikop dst += stride4;
2004*09537850SAkhilesh Sanikop }
2005*09537850SAkhilesh Sanikop }
2006*09537850SAkhilesh Sanikop
2007*09537850SAkhilesh Sanikop template <WriteDuplicateFunc writefn>
ColStore64_SSE4_1(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const column)2008*09537850SAkhilesh Sanikop inline void ColStore64_SSE4_1(void* LIBGAV1_RESTRICT const dest,
2009*09537850SAkhilesh Sanikop ptrdiff_t stride,
2010*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const column) {
2011*09537850SAkhilesh Sanikop const ptrdiff_t stride4 = stride << 2;
2012*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
2013*09537850SAkhilesh Sanikop for (int y = 0; y < 128; y += 16) {
2014*09537850SAkhilesh Sanikop const __m128i col_data =
2015*09537850SAkhilesh Sanikop LoadUnaligned16(static_cast<const uint8_t*>(column) + y);
2016*09537850SAkhilesh Sanikop const __m128i col_dup32_lo = _mm_unpacklo_epi16(col_data, col_data);
2017*09537850SAkhilesh Sanikop const __m128i col_dup32_hi = _mm_unpackhi_epi16(col_data, col_data);
2018*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_lo);
2019*09537850SAkhilesh Sanikop dst += stride4;
2020*09537850SAkhilesh Sanikop writefn(dst, stride, col_dup32_hi);
2021*09537850SAkhilesh Sanikop dst += stride4;
2022*09537850SAkhilesh Sanikop }
2023*09537850SAkhilesh Sanikop }
2024*09537850SAkhilesh Sanikop
// |ref| points to 8 bytes containing 4 packed int16 values.
// Returns a register whose bits [31:0] hold the sum of all four values. The
// remaining lanes are by-products of the reduction and are not the full sum.
inline __m128i DcSum4_SSE4_1(const void* ref) {
  // Load 8 bytes into the low half; the high half is zeroed.
  const __m128i vals = _mm_loadl_epi64(static_cast<const __m128i*>(ref));
  const __m128i ones = _mm_set1_epi16(1);

  // Multiplying by 1 and adding adjacent pairs widens the values to 32 bits:
  // half_sum[31:0] = a1+a2
  // half_sum[63:32] = a3+a4
  const __m128i half_sum = _mm_madd_epi16(vals, ones);
  // Place half_sum[63:32] in shift_sum[31:0].
  const __m128i shift_sum = _mm_srli_si128(half_sum, 4);
  return _mm_add_epi32(half_sum, shift_sum);
}
2037*09537850SAkhilesh Sanikop
2038*09537850SAkhilesh Sanikop struct DcDefs {
2039*09537850SAkhilesh Sanikop DcDefs() = delete;
2040*09537850SAkhilesh Sanikop
2041*09537850SAkhilesh Sanikop using _4x4 = DcPredFuncs_SSE4_1<2, 2, DcSum4_SSE4_1, DcSum4_SSE4_1,
2042*09537850SAkhilesh Sanikop DcStore4xH_SSE4_1<4>, 0, 0>;
2043*09537850SAkhilesh Sanikop };
2044*09537850SAkhilesh Sanikop
2045*09537850SAkhilesh Sanikop struct DirDefs {
2046*09537850SAkhilesh Sanikop DirDefs() = delete;
2047*09537850SAkhilesh Sanikop
2048*09537850SAkhilesh Sanikop using _4x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate4x4>>;
2049*09537850SAkhilesh Sanikop using _4x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate4x4>>;
2050*09537850SAkhilesh Sanikop using _4x16 =
2051*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate4x4>>;
2052*09537850SAkhilesh Sanikop using _8x4 = DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate8x4>>;
2053*09537850SAkhilesh Sanikop using _8x8 = DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate8x4>>;
2054*09537850SAkhilesh Sanikop using _8x16 =
2055*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate8x4>>;
2056*09537850SAkhilesh Sanikop using _8x32 =
2057*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate8x4>>;
2058*09537850SAkhilesh Sanikop using _16x4 =
2059*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore4_SSE4_1<WriteDuplicate16x4>>;
2060*09537850SAkhilesh Sanikop using _16x8 =
2061*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate16x4>>;
2062*09537850SAkhilesh Sanikop using _16x16 =
2063*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate16x4>>;
2064*09537850SAkhilesh Sanikop using _16x32 =
2065*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate16x4>>;
2066*09537850SAkhilesh Sanikop using _16x64 =
2067*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate16x4>>;
2068*09537850SAkhilesh Sanikop using _32x8 =
2069*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore8_SSE4_1<WriteDuplicate32x4>>;
2070*09537850SAkhilesh Sanikop using _32x16 =
2071*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate32x4>>;
2072*09537850SAkhilesh Sanikop using _32x32 =
2073*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate32x4>>;
2074*09537850SAkhilesh Sanikop using _32x64 =
2075*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate32x4>>;
2076*09537850SAkhilesh Sanikop using _64x16 =
2077*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore16_SSE4_1<WriteDuplicate64x4>>;
2078*09537850SAkhilesh Sanikop using _64x32 =
2079*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore32_SSE4_1<WriteDuplicate64x4>>;
2080*09537850SAkhilesh Sanikop using _64x64 =
2081*09537850SAkhilesh Sanikop DirectionalPredFuncs_SSE4_1<ColStore64_SSE4_1<WriteDuplicate64x4>>;
2082*09537850SAkhilesh Sanikop };
2083*09537850SAkhilesh Sanikop
Init10bpp()2084*09537850SAkhilesh Sanikop void Init10bpp() {
2085*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
2086*09537850SAkhilesh Sanikop assert(dsp != nullptr);
2087*09537850SAkhilesh Sanikop static_cast<void>(dsp);
2088*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
2089*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
2090*09537850SAkhilesh Sanikop DcDefs::_4x4::DcTop;
2091*09537850SAkhilesh Sanikop #endif
2092*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDcLeft)
2093*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
2094*09537850SAkhilesh Sanikop DcDefs::_4x4::DcLeft;
2095*09537850SAkhilesh Sanikop #endif
2096*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
2097*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
2098*09537850SAkhilesh Sanikop DcDefs::_4x4::Dc;
2099*09537850SAkhilesh Sanikop #endif
2100*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_IntraPredictorHorizontal)
2101*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x4][kIntraPredictorHorizontal] =
2102*09537850SAkhilesh Sanikop DirDefs::_4x4::Horizontal;
2103*09537850SAkhilesh Sanikop #endif
2104*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_IntraPredictorHorizontal)
2105*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
2106*09537850SAkhilesh Sanikop DirDefs::_4x8::Horizontal;
2107*09537850SAkhilesh Sanikop #endif
2108*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_IntraPredictorHorizontal)
2109*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
2110*09537850SAkhilesh Sanikop DirDefs::_4x16::Horizontal;
2111*09537850SAkhilesh Sanikop #endif
2112*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_IntraPredictorHorizontal)
2113*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x4][kIntraPredictorHorizontal] =
2114*09537850SAkhilesh Sanikop DirDefs::_8x4::Horizontal;
2115*09537850SAkhilesh Sanikop #endif
2116*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_IntraPredictorHorizontal)
2117*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
2118*09537850SAkhilesh Sanikop DirDefs::_8x8::Horizontal;
2119*09537850SAkhilesh Sanikop #endif
2120*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_IntraPredictorHorizontal)
2121*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x16][kIntraPredictorHorizontal] =
2122*09537850SAkhilesh Sanikop DirDefs::_8x16::Horizontal;
2123*09537850SAkhilesh Sanikop #endif
2124*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_IntraPredictorHorizontal)
2125*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
2126*09537850SAkhilesh Sanikop DirDefs::_8x32::Horizontal;
2127*09537850SAkhilesh Sanikop #endif
2128*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_IntraPredictorHorizontal)
2129*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x4][kIntraPredictorHorizontal] =
2130*09537850SAkhilesh Sanikop DirDefs::_16x4::Horizontal;
2131*09537850SAkhilesh Sanikop #endif
2132*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_IntraPredictorHorizontal)
2133*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
2134*09537850SAkhilesh Sanikop DirDefs::_16x8::Horizontal;
2135*09537850SAkhilesh Sanikop #endif
2136*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_IntraPredictorHorizontal)
2137*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x16][kIntraPredictorHorizontal] =
2138*09537850SAkhilesh Sanikop DirDefs::_16x16::Horizontal;
2139*09537850SAkhilesh Sanikop #endif
2140*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_IntraPredictorHorizontal)
2141*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x32][kIntraPredictorHorizontal] =
2142*09537850SAkhilesh Sanikop DirDefs::_16x32::Horizontal;
2143*09537850SAkhilesh Sanikop #endif
2144*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x64_IntraPredictorHorizontal)
2145*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize16x64][kIntraPredictorHorizontal] =
2146*09537850SAkhilesh Sanikop DirDefs::_16x64::Horizontal;
2147*09537850SAkhilesh Sanikop #endif
2148*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_IntraPredictorHorizontal)
2149*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x8][kIntraPredictorHorizontal] =
2150*09537850SAkhilesh Sanikop DirDefs::_32x8::Horizontal;
2151*09537850SAkhilesh Sanikop #endif
2152*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_IntraPredictorHorizontal)
2153*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x16][kIntraPredictorHorizontal] =
2154*09537850SAkhilesh Sanikop DirDefs::_32x16::Horizontal;
2155*09537850SAkhilesh Sanikop #endif
2156*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_IntraPredictorHorizontal)
2157*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x32][kIntraPredictorHorizontal] =
2158*09537850SAkhilesh Sanikop DirDefs::_32x32::Horizontal;
2159*09537850SAkhilesh Sanikop #endif
2160*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x64_IntraPredictorHorizontal)
2161*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
2162*09537850SAkhilesh Sanikop DirDefs::_32x64::Horizontal;
2163*09537850SAkhilesh Sanikop #endif
2164*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x16_IntraPredictorHorizontal)
2165*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x16][kIntraPredictorHorizontal] =
2166*09537850SAkhilesh Sanikop DirDefs::_64x16::Horizontal;
2167*09537850SAkhilesh Sanikop #endif
2168*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x32_IntraPredictorHorizontal)
2169*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x32][kIntraPredictorHorizontal] =
2170*09537850SAkhilesh Sanikop DirDefs::_64x32::Horizontal;
2171*09537850SAkhilesh Sanikop #endif
2172*09537850SAkhilesh Sanikop #if DSP_ENABLED_10BPP_SSE4_1(TransformSize64x64_IntraPredictorHorizontal)
2173*09537850SAkhilesh Sanikop dsp->intra_predictors[kTransformSize64x64][kIntraPredictorHorizontal] =
2174*09537850SAkhilesh Sanikop DirDefs::_64x64::Horizontal;
2175*09537850SAkhilesh Sanikop #endif
2176*09537850SAkhilesh Sanikop }
2177*09537850SAkhilesh Sanikop
2178*09537850SAkhilesh Sanikop } // namespace
2179*09537850SAkhilesh Sanikop } // namespace high_bitdepth
2180*09537850SAkhilesh Sanikop #endif // LIBGAV1_MAX_BITDEPTH >= 10
2181*09537850SAkhilesh Sanikop
IntraPredInit_SSE4_1()2182*09537850SAkhilesh Sanikop void IntraPredInit_SSE4_1() {
2183*09537850SAkhilesh Sanikop low_bitdepth::Init8bpp();
2184*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
2185*09537850SAkhilesh Sanikop high_bitdepth::Init10bpp();
2186*09537850SAkhilesh Sanikop #endif
2187*09537850SAkhilesh Sanikop }
2188*09537850SAkhilesh Sanikop
2189*09537850SAkhilesh Sanikop } // namespace dsp
2190*09537850SAkhilesh Sanikop } // namespace libgav1
2191*09537850SAkhilesh Sanikop
2192*09537850SAkhilesh Sanikop #else // !LIBGAV1_TARGETING_SSE4_1
2193*09537850SAkhilesh Sanikop namespace libgav1 {
2194*09537850SAkhilesh Sanikop namespace dsp {
2195*09537850SAkhilesh Sanikop
// Stub used when LIBGAV1_TARGETING_SSE4_1 is not set. It keeps the symbol
// defined in both build configurations (presumably so callers need no guard
// of their own -- confirm against the call site).
void IntraPredInit_SSE4_1() {
  // Intentionally empty: no SSE4.1 code is compiled in this configuration.
}
2197*09537850SAkhilesh Sanikop
2198*09537850SAkhilesh Sanikop } // namespace dsp
2199*09537850SAkhilesh Sanikop } // namespace libgav1
2200*09537850SAkhilesh Sanikop #endif // LIBGAV1_TARGETING_SSE4_1
2201