xref: /aosp_15_r20/external/libgav1/src/dsp/arm/intrapred_neon.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop //      http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop 
15*09537850SAkhilesh Sanikop #include "src/dsp/intrapred.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop 
18*09537850SAkhilesh Sanikop #if LIBGAV1_ENABLE_NEON
19*09537850SAkhilesh Sanikop 
20*09537850SAkhilesh Sanikop #include <arm_neon.h>
21*09537850SAkhilesh Sanikop 
22*09537850SAkhilesh Sanikop #include <cassert>
23*09537850SAkhilesh Sanikop #include <cstddef>
24*09537850SAkhilesh Sanikop #include <cstdint>
25*09537850SAkhilesh Sanikop 
26*09537850SAkhilesh Sanikop #include "src/dsp/arm/common_neon.h"
27*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
28*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
29*09537850SAkhilesh Sanikop #include "src/utils/common.h"
30*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
31*09537850SAkhilesh Sanikop 
32*09537850SAkhilesh Sanikop namespace libgav1 {
33*09537850SAkhilesh Sanikop namespace dsp {
34*09537850SAkhilesh Sanikop namespace {
35*09537850SAkhilesh Sanikop 
36*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
37*09537850SAkhilesh Sanikop // DcPredFuncs_NEON
38*09537850SAkhilesh Sanikop 
39*09537850SAkhilesh Sanikop using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
40*09537850SAkhilesh Sanikop                                  const bool use_ref_1, const void* ref_1,
41*09537850SAkhilesh Sanikop                                  const int ref_1_size_log2);
42*09537850SAkhilesh Sanikop using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
43*09537850SAkhilesh Sanikop 
44*09537850SAkhilesh Sanikop // DC intra-predictors for square blocks.
45*09537850SAkhilesh Sanikop template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
46*09537850SAkhilesh Sanikop           DcStoreFunc storefn>
47*09537850SAkhilesh Sanikop struct DcPredFuncs_NEON {
48*09537850SAkhilesh Sanikop   DcPredFuncs_NEON() = delete;
49*09537850SAkhilesh Sanikop 
50*09537850SAkhilesh Sanikop   static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
51*09537850SAkhilesh Sanikop                     const void* left_column);
52*09537850SAkhilesh Sanikop   static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
53*09537850SAkhilesh Sanikop                      const void* left_column);
54*09537850SAkhilesh Sanikop   static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
55*09537850SAkhilesh Sanikop                  const void* left_column);
56*09537850SAkhilesh Sanikop };
57*09537850SAkhilesh Sanikop 
58*09537850SAkhilesh Sanikop template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
59*09537850SAkhilesh Sanikop           DcStoreFunc storefn>
60*09537850SAkhilesh Sanikop void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
DcTop(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void *)61*09537850SAkhilesh Sanikop     DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
62*09537850SAkhilesh Sanikop           const void* LIBGAV1_RESTRICT const top_row,
63*09537850SAkhilesh Sanikop           const void* /*left_column*/) {
64*09537850SAkhilesh Sanikop   const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
65*09537850SAkhilesh Sanikop   const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
66*09537850SAkhilesh Sanikop   storefn(dest, stride, dc);
67*09537850SAkhilesh Sanikop }
68*09537850SAkhilesh Sanikop 
69*09537850SAkhilesh Sanikop template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
70*09537850SAkhilesh Sanikop           DcStoreFunc storefn>
71*09537850SAkhilesh Sanikop void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
DcLeft(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)72*09537850SAkhilesh Sanikop     DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
73*09537850SAkhilesh Sanikop            const void* /*top_row*/,
74*09537850SAkhilesh Sanikop            const void* LIBGAV1_RESTRICT const left_column) {
75*09537850SAkhilesh Sanikop   const uint32x2_t sum =
76*09537850SAkhilesh Sanikop       sumfn(left_column, block_height_log2, false, nullptr, 0);
77*09537850SAkhilesh Sanikop   const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
78*09537850SAkhilesh Sanikop   storefn(dest, stride, dc);
79*09537850SAkhilesh Sanikop }
80*09537850SAkhilesh Sanikop 
81*09537850SAkhilesh Sanikop template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
82*09537850SAkhilesh Sanikop           DcStoreFunc storefn>
Dc(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)83*09537850SAkhilesh Sanikop void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
84*09537850SAkhilesh Sanikop     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
85*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const top_row,
86*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const left_column) {
87*09537850SAkhilesh Sanikop   const uint32x2_t sum =
88*09537850SAkhilesh Sanikop       sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
89*09537850SAkhilesh Sanikop   if (block_width_log2 == block_height_log2) {
90*09537850SAkhilesh Sanikop     const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
91*09537850SAkhilesh Sanikop     storefn(dest, stride, dc);
92*09537850SAkhilesh Sanikop   } else {
93*09537850SAkhilesh Sanikop     // TODO(johannkoenig): Compare this to mul/shift in vectors.
94*09537850SAkhilesh Sanikop     const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
95*09537850SAkhilesh Sanikop     uint32_t dc = vget_lane_u32(sum, 0);
96*09537850SAkhilesh Sanikop     dc += divisor >> 1;
97*09537850SAkhilesh Sanikop     dc /= divisor;
98*09537850SAkhilesh Sanikop     storefn(dest, stride, vdup_n_u32(dc));
99*09537850SAkhilesh Sanikop   }
100*09537850SAkhilesh Sanikop }
101*09537850SAkhilesh Sanikop 
102*09537850SAkhilesh Sanikop // Sum all the elements in the vector into the low 32 bits.
Sum(const uint16x4_t val)103*09537850SAkhilesh Sanikop inline uint32x2_t Sum(const uint16x4_t val) {
104*09537850SAkhilesh Sanikop   const uint32x2_t sum = vpaddl_u16(val);
105*09537850SAkhilesh Sanikop   return vpadd_u32(sum, sum);
106*09537850SAkhilesh Sanikop }
107*09537850SAkhilesh Sanikop 
108*09537850SAkhilesh Sanikop // Sum all the elements in the vector into the low 32 bits.
Sum(const uint16x8_t val)109*09537850SAkhilesh Sanikop inline uint32x2_t Sum(const uint16x8_t val) {
110*09537850SAkhilesh Sanikop   const uint32x4_t sum_0 = vpaddlq_u16(val);
111*09537850SAkhilesh Sanikop   const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
112*09537850SAkhilesh Sanikop   return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
113*09537850SAkhilesh Sanikop                   vget_high_u32(vreinterpretq_u32_u64(sum_1)));
114*09537850SAkhilesh Sanikop }
115*09537850SAkhilesh Sanikop 
116*09537850SAkhilesh Sanikop }  // namespace
117*09537850SAkhilesh Sanikop 
118*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
119*09537850SAkhilesh Sanikop namespace low_bitdepth {
120*09537850SAkhilesh Sanikop namespace {
121*09537850SAkhilesh Sanikop 
122*09537850SAkhilesh Sanikop // Add and expand the elements in the |val_[01]| to uint16_t but do not sum the
123*09537850SAkhilesh Sanikop // entire vector.
Add(const uint8x16_t val_0,const uint8x16_t val_1)124*09537850SAkhilesh Sanikop inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
125*09537850SAkhilesh Sanikop   const uint16x8_t sum_0 = vpaddlq_u8(val_0);
126*09537850SAkhilesh Sanikop   const uint16x8_t sum_1 = vpaddlq_u8(val_1);
127*09537850SAkhilesh Sanikop   return vaddq_u16(sum_0, sum_1);
128*09537850SAkhilesh Sanikop }
129*09537850SAkhilesh Sanikop 
130*09537850SAkhilesh Sanikop // Add and expand the elements in the |val_[0123]| to uint16_t but do not sum
131*09537850SAkhilesh Sanikop // the entire vector.
Add(const uint8x16_t val_0,const uint8x16_t val_1,const uint8x16_t val_2,const uint8x16_t val_3)132*09537850SAkhilesh Sanikop inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
133*09537850SAkhilesh Sanikop                       const uint8x16_t val_2, const uint8x16_t val_3) {
134*09537850SAkhilesh Sanikop   const uint16x8_t sum_0 = Add(val_0, val_1);
135*09537850SAkhilesh Sanikop   const uint16x8_t sum_1 = Add(val_2, val_3);
136*09537850SAkhilesh Sanikop   return vaddq_u16(sum_0, sum_1);
137*09537850SAkhilesh Sanikop }
138*09537850SAkhilesh Sanikop 
139*09537850SAkhilesh Sanikop // Load and combine 32 uint8_t values.
LoadAndAdd32(const uint8_t * buf)140*09537850SAkhilesh Sanikop inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
141*09537850SAkhilesh Sanikop   const uint8x16_t val_0 = vld1q_u8(buf);
142*09537850SAkhilesh Sanikop   const uint8x16_t val_1 = vld1q_u8(buf + 16);
143*09537850SAkhilesh Sanikop   return Add(val_0, val_1);
144*09537850SAkhilesh Sanikop }
145*09537850SAkhilesh Sanikop 
146*09537850SAkhilesh Sanikop // Load and combine 64 uint8_t values.
LoadAndAdd64(const uint8_t * buf)147*09537850SAkhilesh Sanikop inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
148*09537850SAkhilesh Sanikop   const uint8x16_t val_0 = vld1q_u8(buf);
149*09537850SAkhilesh Sanikop   const uint8x16_t val_1 = vld1q_u8(buf + 16);
150*09537850SAkhilesh Sanikop   const uint8x16_t val_2 = vld1q_u8(buf + 32);
151*09537850SAkhilesh Sanikop   const uint8x16_t val_3 = vld1q_u8(buf + 48);
152*09537850SAkhilesh Sanikop   return Add(val_0, val_1, val_2, val_3);
153*09537850SAkhilesh Sanikop }
154*09537850SAkhilesh Sanikop 
155*09537850SAkhilesh Sanikop // |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint8_t values.
156*09537850SAkhilesh Sanikop // If |use_ref_1| is false then only sum |ref_0|.
157*09537850SAkhilesh Sanikop // For |ref[01]_size_log2| == 4 this relies on |ref_[01]| being aligned to
158*09537850SAkhilesh Sanikop // uint32_t.
DcSum_NEON(const void * LIBGAV1_RESTRICT ref_0,const int ref_0_size_log2,const bool use_ref_1,const void * LIBGAV1_RESTRICT ref_1,const int ref_1_size_log2)159*09537850SAkhilesh Sanikop inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
160*09537850SAkhilesh Sanikop                              const int ref_0_size_log2, const bool use_ref_1,
161*09537850SAkhilesh Sanikop                              const void* LIBGAV1_RESTRICT ref_1,
162*09537850SAkhilesh Sanikop                              const int ref_1_size_log2) {
163*09537850SAkhilesh Sanikop   const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
164*09537850SAkhilesh Sanikop   const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
165*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 2) {
166*09537850SAkhilesh Sanikop     uint8x8_t val = Load4(ref_0_u8);
167*09537850SAkhilesh Sanikop     if (use_ref_1) {
168*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
169*09537850SAkhilesh Sanikop         case 2: {  // 4x4
170*09537850SAkhilesh Sanikop           val = Load4<1>(ref_1_u8, val);
171*09537850SAkhilesh Sanikop           return Sum(vpaddl_u8(val));
172*09537850SAkhilesh Sanikop         }
173*09537850SAkhilesh Sanikop         case 3: {  // 4x8
174*09537850SAkhilesh Sanikop           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
175*09537850SAkhilesh Sanikop           const uint16x4_t sum_0 = vpaddl_u8(val);
176*09537850SAkhilesh Sanikop           const uint16x4_t sum_1 = vpaddl_u8(val_1);
177*09537850SAkhilesh Sanikop           return Sum(vadd_u16(sum_0, sum_1));
178*09537850SAkhilesh Sanikop         }
179*09537850SAkhilesh Sanikop         case 4: {  // 4x16
180*09537850SAkhilesh Sanikop           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
181*09537850SAkhilesh Sanikop           return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
182*09537850SAkhilesh Sanikop         }
183*09537850SAkhilesh Sanikop       }
184*09537850SAkhilesh Sanikop     }
185*09537850SAkhilesh Sanikop     // 4x1
186*09537850SAkhilesh Sanikop     const uint16x4_t sum = vpaddl_u8(val);
187*09537850SAkhilesh Sanikop     return vpaddl_u16(sum);
188*09537850SAkhilesh Sanikop   }
189*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 3) {
190*09537850SAkhilesh Sanikop     const uint8x8_t val_0 = vld1_u8(ref_0_u8);
191*09537850SAkhilesh Sanikop     if (use_ref_1) {
192*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
193*09537850SAkhilesh Sanikop         case 2: {  // 8x4
194*09537850SAkhilesh Sanikop           const uint8x8_t val_1 = Load4(ref_1_u8);
195*09537850SAkhilesh Sanikop           const uint16x4_t sum_0 = vpaddl_u8(val_0);
196*09537850SAkhilesh Sanikop           const uint16x4_t sum_1 = vpaddl_u8(val_1);
197*09537850SAkhilesh Sanikop           return Sum(vadd_u16(sum_0, sum_1));
198*09537850SAkhilesh Sanikop         }
199*09537850SAkhilesh Sanikop         case 3: {  // 8x8
200*09537850SAkhilesh Sanikop           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
201*09537850SAkhilesh Sanikop           const uint16x4_t sum_0 = vpaddl_u8(val_0);
202*09537850SAkhilesh Sanikop           const uint16x4_t sum_1 = vpaddl_u8(val_1);
203*09537850SAkhilesh Sanikop           return Sum(vadd_u16(sum_0, sum_1));
204*09537850SAkhilesh Sanikop         }
205*09537850SAkhilesh Sanikop         case 4: {  // 8x16
206*09537850SAkhilesh Sanikop           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
207*09537850SAkhilesh Sanikop           return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
208*09537850SAkhilesh Sanikop         }
209*09537850SAkhilesh Sanikop         case 5: {  // 8x32
210*09537850SAkhilesh Sanikop           return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
211*09537850SAkhilesh Sanikop         }
212*09537850SAkhilesh Sanikop       }
213*09537850SAkhilesh Sanikop     }
214*09537850SAkhilesh Sanikop     // 8x1
215*09537850SAkhilesh Sanikop     return Sum(vpaddl_u8(val_0));
216*09537850SAkhilesh Sanikop   }
217*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 4) {
218*09537850SAkhilesh Sanikop     const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
219*09537850SAkhilesh Sanikop     if (use_ref_1) {
220*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
221*09537850SAkhilesh Sanikop         case 2: {  // 16x4
222*09537850SAkhilesh Sanikop           const uint8x8_t val_1 = Load4(ref_1_u8);
223*09537850SAkhilesh Sanikop           return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
224*09537850SAkhilesh Sanikop         }
225*09537850SAkhilesh Sanikop         case 3: {  // 16x8
226*09537850SAkhilesh Sanikop           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
227*09537850SAkhilesh Sanikop           return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
228*09537850SAkhilesh Sanikop         }
229*09537850SAkhilesh Sanikop         case 4: {  // 16x16
230*09537850SAkhilesh Sanikop           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
231*09537850SAkhilesh Sanikop           return Sum(Add(val_0, val_1));
232*09537850SAkhilesh Sanikop         }
233*09537850SAkhilesh Sanikop         case 5: {  // 16x32
234*09537850SAkhilesh Sanikop           const uint16x8_t sum_0 = vpaddlq_u8(val_0);
235*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
236*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
237*09537850SAkhilesh Sanikop         }
238*09537850SAkhilesh Sanikop         case 6: {  // 16x64
239*09537850SAkhilesh Sanikop           const uint16x8_t sum_0 = vpaddlq_u8(val_0);
240*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
241*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
242*09537850SAkhilesh Sanikop         }
243*09537850SAkhilesh Sanikop       }
244*09537850SAkhilesh Sanikop     }
245*09537850SAkhilesh Sanikop     // 16x1
246*09537850SAkhilesh Sanikop     return Sum(vpaddlq_u8(val_0));
247*09537850SAkhilesh Sanikop   }
248*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 5) {
249*09537850SAkhilesh Sanikop     const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
250*09537850SAkhilesh Sanikop     if (use_ref_1) {
251*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
252*09537850SAkhilesh Sanikop         case 3: {  // 32x8
253*09537850SAkhilesh Sanikop           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
254*09537850SAkhilesh Sanikop           return Sum(vaddw_u8(sum_0, val_1));
255*09537850SAkhilesh Sanikop         }
256*09537850SAkhilesh Sanikop         case 4: {  // 32x16
257*09537850SAkhilesh Sanikop           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
258*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = vpaddlq_u8(val_1);
259*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
260*09537850SAkhilesh Sanikop         }
261*09537850SAkhilesh Sanikop         case 5: {  // 32x32
262*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
263*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
264*09537850SAkhilesh Sanikop         }
265*09537850SAkhilesh Sanikop         case 6: {  // 32x64
266*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
267*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
268*09537850SAkhilesh Sanikop         }
269*09537850SAkhilesh Sanikop       }
270*09537850SAkhilesh Sanikop     }
271*09537850SAkhilesh Sanikop     // 32x1
272*09537850SAkhilesh Sanikop     return Sum(sum_0);
273*09537850SAkhilesh Sanikop   }
274*09537850SAkhilesh Sanikop 
275*09537850SAkhilesh Sanikop   assert(ref_0_size_log2 == 6);
276*09537850SAkhilesh Sanikop   const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
277*09537850SAkhilesh Sanikop   if (use_ref_1) {
278*09537850SAkhilesh Sanikop     switch (ref_1_size_log2) {
279*09537850SAkhilesh Sanikop       case 4: {  // 64x16
280*09537850SAkhilesh Sanikop         const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
281*09537850SAkhilesh Sanikop         const uint16x8_t sum_1 = vpaddlq_u8(val_1);
282*09537850SAkhilesh Sanikop         return Sum(vaddq_u16(sum_0, sum_1));
283*09537850SAkhilesh Sanikop       }
284*09537850SAkhilesh Sanikop       case 5: {  // 64x32
285*09537850SAkhilesh Sanikop         const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
286*09537850SAkhilesh Sanikop         return Sum(vaddq_u16(sum_0, sum_1));
287*09537850SAkhilesh Sanikop       }
288*09537850SAkhilesh Sanikop       case 6: {  // 64x64
289*09537850SAkhilesh Sanikop         const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
290*09537850SAkhilesh Sanikop         return Sum(vaddq_u16(sum_0, sum_1));
291*09537850SAkhilesh Sanikop       }
292*09537850SAkhilesh Sanikop     }
293*09537850SAkhilesh Sanikop   }
294*09537850SAkhilesh Sanikop   // 64x1
295*09537850SAkhilesh Sanikop   return Sum(sum_0);
296*09537850SAkhilesh Sanikop }
297*09537850SAkhilesh Sanikop 
298*09537850SAkhilesh Sanikop template <int width, int height>
DcStore_NEON(void * const dest,ptrdiff_t stride,const uint32x2_t dc)299*09537850SAkhilesh Sanikop inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
300*09537850SAkhilesh Sanikop                          const uint32x2_t dc) {
301*09537850SAkhilesh Sanikop   const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
302*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
303*09537850SAkhilesh Sanikop   if (width == 4) {
304*09537850SAkhilesh Sanikop     int i = height - 1;
305*09537850SAkhilesh Sanikop     do {
306*09537850SAkhilesh Sanikop       StoreLo4(dst, vget_low_u8(dc_dup));
307*09537850SAkhilesh Sanikop       dst += stride;
308*09537850SAkhilesh Sanikop     } while (--i != 0);
309*09537850SAkhilesh Sanikop     StoreLo4(dst, vget_low_u8(dc_dup));
310*09537850SAkhilesh Sanikop   } else if (width == 8) {
311*09537850SAkhilesh Sanikop     int i = height - 1;
312*09537850SAkhilesh Sanikop     do {
313*09537850SAkhilesh Sanikop       vst1_u8(dst, vget_low_u8(dc_dup));
314*09537850SAkhilesh Sanikop       dst += stride;
315*09537850SAkhilesh Sanikop     } while (--i != 0);
316*09537850SAkhilesh Sanikop     vst1_u8(dst, vget_low_u8(dc_dup));
317*09537850SAkhilesh Sanikop   } else if (width == 16) {
318*09537850SAkhilesh Sanikop     int i = height - 1;
319*09537850SAkhilesh Sanikop     do {
320*09537850SAkhilesh Sanikop       vst1q_u8(dst, dc_dup);
321*09537850SAkhilesh Sanikop       dst += stride;
322*09537850SAkhilesh Sanikop     } while (--i != 0);
323*09537850SAkhilesh Sanikop     vst1q_u8(dst, dc_dup);
324*09537850SAkhilesh Sanikop   } else if (width == 32) {
325*09537850SAkhilesh Sanikop     int i = height - 1;
326*09537850SAkhilesh Sanikop     do {
327*09537850SAkhilesh Sanikop       vst1q_u8(dst, dc_dup);
328*09537850SAkhilesh Sanikop       vst1q_u8(dst + 16, dc_dup);
329*09537850SAkhilesh Sanikop       dst += stride;
330*09537850SAkhilesh Sanikop     } while (--i != 0);
331*09537850SAkhilesh Sanikop     vst1q_u8(dst, dc_dup);
332*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, dc_dup);
333*09537850SAkhilesh Sanikop   } else {
334*09537850SAkhilesh Sanikop     assert(width == 64);
335*09537850SAkhilesh Sanikop     int i = height - 1;
336*09537850SAkhilesh Sanikop     do {
337*09537850SAkhilesh Sanikop       vst1q_u8(dst, dc_dup);
338*09537850SAkhilesh Sanikop       vst1q_u8(dst + 16, dc_dup);
339*09537850SAkhilesh Sanikop       vst1q_u8(dst + 32, dc_dup);
340*09537850SAkhilesh Sanikop       vst1q_u8(dst + 48, dc_dup);
341*09537850SAkhilesh Sanikop       dst += stride;
342*09537850SAkhilesh Sanikop     } while (--i != 0);
343*09537850SAkhilesh Sanikop     vst1q_u8(dst, dc_dup);
344*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, dc_dup);
345*09537850SAkhilesh Sanikop     vst1q_u8(dst + 32, dc_dup);
346*09537850SAkhilesh Sanikop     vst1q_u8(dst + 48, dc_dup);
347*09537850SAkhilesh Sanikop   }
348*09537850SAkhilesh Sanikop }
349*09537850SAkhilesh Sanikop 
350*09537850SAkhilesh Sanikop template <int width, int height>
Paeth4Or8xN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)351*09537850SAkhilesh Sanikop inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
352*09537850SAkhilesh Sanikop                              ptrdiff_t stride,
353*09537850SAkhilesh Sanikop                              const void* LIBGAV1_RESTRICT const top_row,
354*09537850SAkhilesh Sanikop                              const void* LIBGAV1_RESTRICT const left_column) {
355*09537850SAkhilesh Sanikop   auto* dest_u8 = static_cast<uint8_t*>(dest);
356*09537850SAkhilesh Sanikop   const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
357*09537850SAkhilesh Sanikop   const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
358*09537850SAkhilesh Sanikop 
359*09537850SAkhilesh Sanikop   const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
360*09537850SAkhilesh Sanikop   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
361*09537850SAkhilesh Sanikop   uint8x8_t top;
362*09537850SAkhilesh Sanikop   if (width == 4) {
363*09537850SAkhilesh Sanikop     top = Load4(top_row_u8);
364*09537850SAkhilesh Sanikop   } else {  // width == 8
365*09537850SAkhilesh Sanikop     top = vld1_u8(top_row_u8);
366*09537850SAkhilesh Sanikop   }
367*09537850SAkhilesh Sanikop 
368*09537850SAkhilesh Sanikop   for (int y = 0; y < height; ++y) {
369*09537850SAkhilesh Sanikop     const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
370*09537850SAkhilesh Sanikop 
371*09537850SAkhilesh Sanikop     const uint8x8_t left_dist = vabd_u8(top, top_left);
372*09537850SAkhilesh Sanikop     const uint8x8_t top_dist = vabd_u8(left, top_left);
373*09537850SAkhilesh Sanikop     const uint16x8_t top_left_dist =
374*09537850SAkhilesh Sanikop         vabdq_u16(vaddl_u8(top, left), top_left_x2);
375*09537850SAkhilesh Sanikop 
376*09537850SAkhilesh Sanikop     const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
377*09537850SAkhilesh Sanikop     const uint8x8_t left_le_top_left =
378*09537850SAkhilesh Sanikop         vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
379*09537850SAkhilesh Sanikop     const uint8x8_t top_le_top_left =
380*09537850SAkhilesh Sanikop         vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
381*09537850SAkhilesh Sanikop 
382*09537850SAkhilesh Sanikop     // if (left_dist <= top_dist && left_dist <= top_left_dist)
383*09537850SAkhilesh Sanikop     const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
384*09537850SAkhilesh Sanikop     //   dest[x] = left_column[y];
385*09537850SAkhilesh Sanikop     // Fill all the unused spaces with 'top'. They will be overwritten when
386*09537850SAkhilesh Sanikop     // the positions for top_left are known.
387*09537850SAkhilesh Sanikop     uint8x8_t result = vbsl_u8(left_mask, left, top);
388*09537850SAkhilesh Sanikop     // else if (top_dist <= top_left_dist)
389*09537850SAkhilesh Sanikop     //   dest[x] = top_row[x];
390*09537850SAkhilesh Sanikop     // Add these values to the mask. They were already set.
391*09537850SAkhilesh Sanikop     const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
392*09537850SAkhilesh Sanikop     // else
393*09537850SAkhilesh Sanikop     //   dest[x] = top_left;
394*09537850SAkhilesh Sanikop     result = vbsl_u8(left_or_top_mask, result, top_left);
395*09537850SAkhilesh Sanikop 
396*09537850SAkhilesh Sanikop     if (width == 4) {
397*09537850SAkhilesh Sanikop       StoreLo4(dest_u8, result);
398*09537850SAkhilesh Sanikop     } else {  // width == 8
399*09537850SAkhilesh Sanikop       vst1_u8(dest_u8, result);
400*09537850SAkhilesh Sanikop     }
401*09537850SAkhilesh Sanikop     dest_u8 += stride;
402*09537850SAkhilesh Sanikop   }
403*09537850SAkhilesh Sanikop }
404*09537850SAkhilesh Sanikop 
405*09537850SAkhilesh Sanikop // Calculate X distance <= TopLeft distance and pack the resulting mask into
406*09537850SAkhilesh Sanikop // uint8x8_t.
XLeTopLeft(const uint8x16_t x_dist,const uint16x8_t top_left_dist_low,const uint16x8_t top_left_dist_high)407*09537850SAkhilesh Sanikop inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
408*09537850SAkhilesh Sanikop                              const uint16x8_t top_left_dist_low,
409*09537850SAkhilesh Sanikop                              const uint16x8_t top_left_dist_high) {
410*09537850SAkhilesh Sanikop   const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
411*09537850SAkhilesh Sanikop                                                vqmovn_u16(top_left_dist_high));
412*09537850SAkhilesh Sanikop   return vcleq_u8(x_dist, top_left_dist);
413*09537850SAkhilesh Sanikop }
414*09537850SAkhilesh Sanikop 
415*09537850SAkhilesh Sanikop // Select the closest values and collect them.
SelectPaeth(const uint8x16_t top,const uint8x16_t left,const uint8x16_t top_left,const uint8x16_t left_le_top,const uint8x16_t left_le_top_left,const uint8x16_t top_le_top_left)416*09537850SAkhilesh Sanikop inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
417*09537850SAkhilesh Sanikop                               const uint8x16_t top_left,
418*09537850SAkhilesh Sanikop                               const uint8x16_t left_le_top,
419*09537850SAkhilesh Sanikop                               const uint8x16_t left_le_top_left,
420*09537850SAkhilesh Sanikop                               const uint8x16_t top_le_top_left) {
421*09537850SAkhilesh Sanikop   // if (left_dist <= top_dist && left_dist <= top_left_dist)
422*09537850SAkhilesh Sanikop   const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
423*09537850SAkhilesh Sanikop   //   dest[x] = left_column[y];
424*09537850SAkhilesh Sanikop   // Fill all the unused spaces with 'top'. They will be overwritten when
425*09537850SAkhilesh Sanikop   // the positions for top_left are known.
426*09537850SAkhilesh Sanikop   uint8x16_t result = vbslq_u8(left_mask, left, top);
427*09537850SAkhilesh Sanikop   // else if (top_dist <= top_left_dist)
428*09537850SAkhilesh Sanikop   //   dest[x] = top_row[x];
429*09537850SAkhilesh Sanikop   // Add these values to the mask. They were already set.
430*09537850SAkhilesh Sanikop   const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
431*09537850SAkhilesh Sanikop   // else
432*09537850SAkhilesh Sanikop   //   dest[x] = top_left;
433*09537850SAkhilesh Sanikop   return vbslq_u8(left_or_top_mask, result, top_left);
434*09537850SAkhilesh Sanikop }
435*09537850SAkhilesh Sanikop 
// The macros below declare numbered locals inside Paeth16PlusxN_NEON. They
// expect |top|, |left|, |top_left_x2|, |top_dist| and left_<num>_dist to be in
// scope at the point of use, and each one needs a trailing semicolon.

// Declares top_left_<num>_dist_low and top_left_<num>_dist_high: the 16-bit
// values |top[num] + left - top_left_x2| for the low and high 8 lanes of
// top[num]. Both halves use vget_low_u8(left) because every lane of |left|
// holds the same value.
#define TOP_LEFT_DIST(num)                                                   \
  const uint16x8_t top_left_##num##_dist_low =                               \
      vabdq_u16(vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)),          \
                top_left_x2);                                                \
  const uint16x8_t top_left_##num##_dist_high =                              \
      vabdq_u16(vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)),         \
                top_left_x2)

// Declares left_le_top_left_<num>: XLeTopLeft() applied to left_<num>_dist.
#define LEFT_LE_TOP_LEFT(num)                                 \
  const uint8x16_t left_le_top_left_##num =                   \
      XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \
                 top_left_##num##_dist_high)

// Declares top_le_top_left_<num>: XLeTopLeft() applied to |top_dist|.
#define TOP_LE_TOP_LEFT(num)                                \
  const uint8x16_t top_le_top_left_##num =                  \
      XLeTopLeft(top_dist, top_left_##num##_dist_low,       \
                 top_left_##num##_dist_high)
453*09537850SAkhilesh Sanikop 
454*09537850SAkhilesh Sanikop template <int width, int height>
Paeth16PlusxN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)455*09537850SAkhilesh Sanikop inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest,
456*09537850SAkhilesh Sanikop                                ptrdiff_t stride,
457*09537850SAkhilesh Sanikop                                const void* LIBGAV1_RESTRICT const top_row,
458*09537850SAkhilesh Sanikop                                const void* LIBGAV1_RESTRICT const left_column) {
459*09537850SAkhilesh Sanikop   auto* dest_u8 = static_cast<uint8_t*>(dest);
460*09537850SAkhilesh Sanikop   const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
461*09537850SAkhilesh Sanikop   const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
462*09537850SAkhilesh Sanikop 
463*09537850SAkhilesh Sanikop   const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
464*09537850SAkhilesh Sanikop   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
465*09537850SAkhilesh Sanikop   uint8x16_t top[4];
466*09537850SAkhilesh Sanikop   top[0] = vld1q_u8(top_row_u8);
467*09537850SAkhilesh Sanikop   if (width > 16) {
468*09537850SAkhilesh Sanikop     top[1] = vld1q_u8(top_row_u8 + 16);
469*09537850SAkhilesh Sanikop     if (width == 64) {
470*09537850SAkhilesh Sanikop       top[2] = vld1q_u8(top_row_u8 + 32);
471*09537850SAkhilesh Sanikop       top[3] = vld1q_u8(top_row_u8 + 48);
472*09537850SAkhilesh Sanikop     }
473*09537850SAkhilesh Sanikop   }
474*09537850SAkhilesh Sanikop 
475*09537850SAkhilesh Sanikop   for (int y = 0; y < height; ++y) {
476*09537850SAkhilesh Sanikop     const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
477*09537850SAkhilesh Sanikop 
478*09537850SAkhilesh Sanikop     const uint8x16_t top_dist = vabdq_u8(left, top_left);
479*09537850SAkhilesh Sanikop 
480*09537850SAkhilesh Sanikop     const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
481*09537850SAkhilesh Sanikop     TOP_LEFT_DIST(0);
482*09537850SAkhilesh Sanikop     const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
483*09537850SAkhilesh Sanikop     LEFT_LE_TOP_LEFT(0);
484*09537850SAkhilesh Sanikop     TOP_LE_TOP_LEFT(0);
485*09537850SAkhilesh Sanikop 
486*09537850SAkhilesh Sanikop     const uint8x16_t result_0 =
487*09537850SAkhilesh Sanikop         SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
488*09537850SAkhilesh Sanikop                     top_le_top_left_0);
489*09537850SAkhilesh Sanikop     vst1q_u8(dest_u8, result_0);
490*09537850SAkhilesh Sanikop 
491*09537850SAkhilesh Sanikop     if (width > 16) {
492*09537850SAkhilesh Sanikop       const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
493*09537850SAkhilesh Sanikop       TOP_LEFT_DIST(1);
494*09537850SAkhilesh Sanikop       const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
495*09537850SAkhilesh Sanikop       LEFT_LE_TOP_LEFT(1);
496*09537850SAkhilesh Sanikop       TOP_LE_TOP_LEFT(1);
497*09537850SAkhilesh Sanikop 
498*09537850SAkhilesh Sanikop       const uint8x16_t result_1 =
499*09537850SAkhilesh Sanikop           SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1,
500*09537850SAkhilesh Sanikop                       top_le_top_left_1);
501*09537850SAkhilesh Sanikop       vst1q_u8(dest_u8 + 16, result_1);
502*09537850SAkhilesh Sanikop 
503*09537850SAkhilesh Sanikop       if (width == 64) {
504*09537850SAkhilesh Sanikop         const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
505*09537850SAkhilesh Sanikop         TOP_LEFT_DIST(2);
506*09537850SAkhilesh Sanikop         const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
507*09537850SAkhilesh Sanikop         LEFT_LE_TOP_LEFT(2);
508*09537850SAkhilesh Sanikop         TOP_LE_TOP_LEFT(2);
509*09537850SAkhilesh Sanikop 
510*09537850SAkhilesh Sanikop         const uint8x16_t result_2 =
511*09537850SAkhilesh Sanikop             SelectPaeth(top[2], left, top_left, left_2_le_top,
512*09537850SAkhilesh Sanikop                         left_le_top_left_2, top_le_top_left_2);
513*09537850SAkhilesh Sanikop         vst1q_u8(dest_u8 + 32, result_2);
514*09537850SAkhilesh Sanikop 
515*09537850SAkhilesh Sanikop         const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
516*09537850SAkhilesh Sanikop         TOP_LEFT_DIST(3);
517*09537850SAkhilesh Sanikop         const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
518*09537850SAkhilesh Sanikop         LEFT_LE_TOP_LEFT(3);
519*09537850SAkhilesh Sanikop         TOP_LE_TOP_LEFT(3);
520*09537850SAkhilesh Sanikop 
521*09537850SAkhilesh Sanikop         const uint8x16_t result_3 =
522*09537850SAkhilesh Sanikop             SelectPaeth(top[3], left, top_left, left_3_le_top,
523*09537850SAkhilesh Sanikop                         left_le_top_left_3, top_le_top_left_3);
524*09537850SAkhilesh Sanikop         vst1q_u8(dest_u8 + 48, result_3);
525*09537850SAkhilesh Sanikop       }
526*09537850SAkhilesh Sanikop     }
527*09537850SAkhilesh Sanikop 
528*09537850SAkhilesh Sanikop     dest_u8 += stride;
529*09537850SAkhilesh Sanikop   }
530*09537850SAkhilesh Sanikop }
531*09537850SAkhilesh Sanikop 
532*09537850SAkhilesh Sanikop struct DcDefs {
533*09537850SAkhilesh Sanikop   DcDefs() = delete;
534*09537850SAkhilesh Sanikop 
535*09537850SAkhilesh Sanikop   using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
536*09537850SAkhilesh Sanikop   using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
537*09537850SAkhilesh Sanikop   using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
538*09537850SAkhilesh Sanikop   using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
539*09537850SAkhilesh Sanikop   using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
540*09537850SAkhilesh Sanikop   using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
541*09537850SAkhilesh Sanikop   using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
542*09537850SAkhilesh Sanikop   using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
543*09537850SAkhilesh Sanikop   using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
544*09537850SAkhilesh Sanikop   using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
545*09537850SAkhilesh Sanikop   using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
546*09537850SAkhilesh Sanikop   using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
547*09537850SAkhilesh Sanikop   using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
548*09537850SAkhilesh Sanikop   using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
549*09537850SAkhilesh Sanikop   using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
550*09537850SAkhilesh Sanikop   using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
551*09537850SAkhilesh Sanikop   using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
552*09537850SAkhilesh Sanikop   using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
553*09537850SAkhilesh Sanikop   using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
554*09537850SAkhilesh Sanikop };
555*09537850SAkhilesh Sanikop 
Init8bpp()556*09537850SAkhilesh Sanikop void Init8bpp() {
557*09537850SAkhilesh Sanikop   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
558*09537850SAkhilesh Sanikop   assert(dsp != nullptr);
559*09537850SAkhilesh Sanikop   // 4x4
560*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
561*09537850SAkhilesh Sanikop       DcDefs::_4x4::DcTop;
562*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
563*09537850SAkhilesh Sanikop       DcDefs::_4x4::DcLeft;
564*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
565*09537850SAkhilesh Sanikop       DcDefs::_4x4::Dc;
566*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
567*09537850SAkhilesh Sanikop       Paeth4Or8xN_NEON<4, 4>;
568*09537850SAkhilesh Sanikop 
569*09537850SAkhilesh Sanikop   // 4x8
570*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
571*09537850SAkhilesh Sanikop       DcDefs::_4x8::DcTop;
572*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
573*09537850SAkhilesh Sanikop       DcDefs::_4x8::DcLeft;
574*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
575*09537850SAkhilesh Sanikop       DcDefs::_4x8::Dc;
576*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
577*09537850SAkhilesh Sanikop       Paeth4Or8xN_NEON<4, 8>;
578*09537850SAkhilesh Sanikop 
579*09537850SAkhilesh Sanikop   // 4x16
580*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
581*09537850SAkhilesh Sanikop       DcDefs::_4x16::DcTop;
582*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
583*09537850SAkhilesh Sanikop       DcDefs::_4x16::DcLeft;
584*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
585*09537850SAkhilesh Sanikop       DcDefs::_4x16::Dc;
586*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
587*09537850SAkhilesh Sanikop       Paeth4Or8xN_NEON<4, 16>;
588*09537850SAkhilesh Sanikop 
589*09537850SAkhilesh Sanikop   // 8x4
590*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
591*09537850SAkhilesh Sanikop       DcDefs::_8x4::DcTop;
592*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
593*09537850SAkhilesh Sanikop       DcDefs::_8x4::DcLeft;
594*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
595*09537850SAkhilesh Sanikop       DcDefs::_8x4::Dc;
596*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
597*09537850SAkhilesh Sanikop       Paeth4Or8xN_NEON<8, 4>;
598*09537850SAkhilesh Sanikop 
599*09537850SAkhilesh Sanikop   // 8x8
600*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
601*09537850SAkhilesh Sanikop       DcDefs::_8x8::DcTop;
602*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
603*09537850SAkhilesh Sanikop       DcDefs::_8x8::DcLeft;
604*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
605*09537850SAkhilesh Sanikop       DcDefs::_8x8::Dc;
606*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
607*09537850SAkhilesh Sanikop       Paeth4Or8xN_NEON<8, 8>;
608*09537850SAkhilesh Sanikop 
609*09537850SAkhilesh Sanikop   // 8x16
610*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
611*09537850SAkhilesh Sanikop       DcDefs::_8x16::DcTop;
612*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
613*09537850SAkhilesh Sanikop       DcDefs::_8x16::DcLeft;
614*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
615*09537850SAkhilesh Sanikop       DcDefs::_8x16::Dc;
616*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
617*09537850SAkhilesh Sanikop       Paeth4Or8xN_NEON<8, 16>;
618*09537850SAkhilesh Sanikop 
619*09537850SAkhilesh Sanikop   // 8x32
620*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
621*09537850SAkhilesh Sanikop       DcDefs::_8x32::DcTop;
622*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
623*09537850SAkhilesh Sanikop       DcDefs::_8x32::DcLeft;
624*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
625*09537850SAkhilesh Sanikop       DcDefs::_8x32::Dc;
626*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
627*09537850SAkhilesh Sanikop       Paeth4Or8xN_NEON<8, 32>;
628*09537850SAkhilesh Sanikop 
629*09537850SAkhilesh Sanikop   // 16x4
630*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
631*09537850SAkhilesh Sanikop       DcDefs::_16x4::DcTop;
632*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
633*09537850SAkhilesh Sanikop       DcDefs::_16x4::DcLeft;
634*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
635*09537850SAkhilesh Sanikop       DcDefs::_16x4::Dc;
636*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
637*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<16, 4>;
638*09537850SAkhilesh Sanikop 
639*09537850SAkhilesh Sanikop   // 16x8
640*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
641*09537850SAkhilesh Sanikop       DcDefs::_16x8::DcTop;
642*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
643*09537850SAkhilesh Sanikop       DcDefs::_16x8::DcLeft;
644*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
645*09537850SAkhilesh Sanikop       DcDefs::_16x8::Dc;
646*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
647*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<16, 8>;
648*09537850SAkhilesh Sanikop 
649*09537850SAkhilesh Sanikop   // 16x16
650*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
651*09537850SAkhilesh Sanikop       DcDefs::_16x16::DcTop;
652*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
653*09537850SAkhilesh Sanikop       DcDefs::_16x16::DcLeft;
654*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
655*09537850SAkhilesh Sanikop       DcDefs::_16x16::Dc;
656*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
657*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<16, 16>;
658*09537850SAkhilesh Sanikop 
659*09537850SAkhilesh Sanikop   // 16x32
660*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
661*09537850SAkhilesh Sanikop       DcDefs::_16x32::DcTop;
662*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
663*09537850SAkhilesh Sanikop       DcDefs::_16x32::DcLeft;
664*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
665*09537850SAkhilesh Sanikop       DcDefs::_16x32::Dc;
666*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
667*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<16, 32>;
668*09537850SAkhilesh Sanikop 
669*09537850SAkhilesh Sanikop   // 16x64
670*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
671*09537850SAkhilesh Sanikop       DcDefs::_16x64::DcTop;
672*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
673*09537850SAkhilesh Sanikop       DcDefs::_16x64::DcLeft;
674*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
675*09537850SAkhilesh Sanikop       DcDefs::_16x64::Dc;
676*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
677*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<16, 64>;
678*09537850SAkhilesh Sanikop 
679*09537850SAkhilesh Sanikop   // 32x8
680*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
681*09537850SAkhilesh Sanikop       DcDefs::_32x8::DcTop;
682*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
683*09537850SAkhilesh Sanikop       DcDefs::_32x8::DcLeft;
684*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
685*09537850SAkhilesh Sanikop       DcDefs::_32x8::Dc;
686*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
687*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<32, 8>;
688*09537850SAkhilesh Sanikop 
689*09537850SAkhilesh Sanikop   // 32x16
690*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
691*09537850SAkhilesh Sanikop       DcDefs::_32x16::DcTop;
692*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
693*09537850SAkhilesh Sanikop       DcDefs::_32x16::DcLeft;
694*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
695*09537850SAkhilesh Sanikop       DcDefs::_32x16::Dc;
696*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
697*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<32, 16>;
698*09537850SAkhilesh Sanikop 
699*09537850SAkhilesh Sanikop   // 32x32
700*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
701*09537850SAkhilesh Sanikop       DcDefs::_32x32::DcTop;
702*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
703*09537850SAkhilesh Sanikop       DcDefs::_32x32::DcLeft;
704*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
705*09537850SAkhilesh Sanikop       DcDefs::_32x32::Dc;
706*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
707*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<32, 32>;
708*09537850SAkhilesh Sanikop 
709*09537850SAkhilesh Sanikop   // 32x64
710*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
711*09537850SAkhilesh Sanikop       DcDefs::_32x64::DcTop;
712*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
713*09537850SAkhilesh Sanikop       DcDefs::_32x64::DcLeft;
714*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
715*09537850SAkhilesh Sanikop       DcDefs::_32x64::Dc;
716*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
717*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<32, 64>;
718*09537850SAkhilesh Sanikop 
719*09537850SAkhilesh Sanikop   // 64x16
720*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
721*09537850SAkhilesh Sanikop       DcDefs::_64x16::DcTop;
722*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
723*09537850SAkhilesh Sanikop       DcDefs::_64x16::DcLeft;
724*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
725*09537850SAkhilesh Sanikop       DcDefs::_64x16::Dc;
726*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
727*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<64, 16>;
728*09537850SAkhilesh Sanikop 
729*09537850SAkhilesh Sanikop   // 64x32
730*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
731*09537850SAkhilesh Sanikop       DcDefs::_64x32::DcTop;
732*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
733*09537850SAkhilesh Sanikop       DcDefs::_64x32::DcLeft;
734*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
735*09537850SAkhilesh Sanikop       DcDefs::_64x32::Dc;
736*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
737*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<64, 32>;
738*09537850SAkhilesh Sanikop 
739*09537850SAkhilesh Sanikop   // 64x64
740*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
741*09537850SAkhilesh Sanikop       DcDefs::_64x64::DcTop;
742*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
743*09537850SAkhilesh Sanikop       DcDefs::_64x64::DcLeft;
744*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
745*09537850SAkhilesh Sanikop       DcDefs::_64x64::Dc;
746*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
747*09537850SAkhilesh Sanikop       Paeth16PlusxN_NEON<64, 64>;
748*09537850SAkhilesh Sanikop }
749*09537850SAkhilesh Sanikop 
750*09537850SAkhilesh Sanikop }  // namespace
751*09537850SAkhilesh Sanikop }  // namespace low_bitdepth
752*09537850SAkhilesh Sanikop 
753*09537850SAkhilesh Sanikop //------------------------------------------------------------------------------
754*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
755*09537850SAkhilesh Sanikop namespace high_bitdepth {
756*09537850SAkhilesh Sanikop namespace {
757*09537850SAkhilesh Sanikop 
758*09537850SAkhilesh Sanikop // Add the elements in the given vectors together but do not sum the entire
759*09537850SAkhilesh Sanikop // vector.
Add(const uint16x8_t val_0,const uint16x8_t val_1,const uint16x8_t val_2,const uint16x8_t val_3)760*09537850SAkhilesh Sanikop inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
761*09537850SAkhilesh Sanikop                       const uint16x8_t val_2, const uint16x8_t val_3) {
762*09537850SAkhilesh Sanikop   const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
763*09537850SAkhilesh Sanikop   const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
764*09537850SAkhilesh Sanikop   return vaddq_u16(sum_0, sum_1);
765*09537850SAkhilesh Sanikop }
766*09537850SAkhilesh Sanikop 
767*09537850SAkhilesh Sanikop // Load and combine 16 uint16_t values.
LoadAndAdd16(const uint16_t * buf)768*09537850SAkhilesh Sanikop inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
769*09537850SAkhilesh Sanikop   const uint16x8_t val_0 = vld1q_u16(buf);
770*09537850SAkhilesh Sanikop   const uint16x8_t val_1 = vld1q_u16(buf + 8);
771*09537850SAkhilesh Sanikop   return vaddq_u16(val_0, val_1);
772*09537850SAkhilesh Sanikop }
773*09537850SAkhilesh Sanikop 
774*09537850SAkhilesh Sanikop // Load and combine 32 uint16_t values.
LoadAndAdd32(const uint16_t * buf)775*09537850SAkhilesh Sanikop inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
776*09537850SAkhilesh Sanikop   const uint16x8_t val_0 = vld1q_u16(buf);
777*09537850SAkhilesh Sanikop   const uint16x8_t val_1 = vld1q_u16(buf + 8);
778*09537850SAkhilesh Sanikop   const uint16x8_t val_2 = vld1q_u16(buf + 16);
779*09537850SAkhilesh Sanikop   const uint16x8_t val_3 = vld1q_u16(buf + 24);
780*09537850SAkhilesh Sanikop   return Add(val_0, val_1, val_2, val_3);
781*09537850SAkhilesh Sanikop }
782*09537850SAkhilesh Sanikop 
783*09537850SAkhilesh Sanikop // Load and combine 64 uint16_t values.
LoadAndAdd64(const uint16_t * buf)784*09537850SAkhilesh Sanikop inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
785*09537850SAkhilesh Sanikop   const uint16x8_t val_0 = vld1q_u16(buf);
786*09537850SAkhilesh Sanikop   const uint16x8_t val_1 = vld1q_u16(buf + 8);
787*09537850SAkhilesh Sanikop   const uint16x8_t val_2 = vld1q_u16(buf + 16);
788*09537850SAkhilesh Sanikop   const uint16x8_t val_3 = vld1q_u16(buf + 24);
789*09537850SAkhilesh Sanikop   const uint16x8_t val_4 = vld1q_u16(buf + 32);
790*09537850SAkhilesh Sanikop   const uint16x8_t val_5 = vld1q_u16(buf + 40);
791*09537850SAkhilesh Sanikop   const uint16x8_t val_6 = vld1q_u16(buf + 48);
792*09537850SAkhilesh Sanikop   const uint16x8_t val_7 = vld1q_u16(buf + 56);
793*09537850SAkhilesh Sanikop   const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
794*09537850SAkhilesh Sanikop   const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
795*09537850SAkhilesh Sanikop   return vaddq_u16(sum_0, sum_1);
796*09537850SAkhilesh Sanikop }
797*09537850SAkhilesh Sanikop 
798*09537850SAkhilesh Sanikop // |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values.
799*09537850SAkhilesh Sanikop // If |use_ref_1| is false then only sum |ref_0|.
DcSum_NEON(const void * LIBGAV1_RESTRICT ref_0,const int ref_0_size_log2,const bool use_ref_1,const void * LIBGAV1_RESTRICT ref_1,const int ref_1_size_log2)800*09537850SAkhilesh Sanikop inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
801*09537850SAkhilesh Sanikop                              const int ref_0_size_log2, const bool use_ref_1,
802*09537850SAkhilesh Sanikop                              const void* LIBGAV1_RESTRICT ref_1,
803*09537850SAkhilesh Sanikop                              const int ref_1_size_log2) {
804*09537850SAkhilesh Sanikop   const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
805*09537850SAkhilesh Sanikop   const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
806*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 2) {
807*09537850SAkhilesh Sanikop     const uint16x4_t val_0 = vld1_u16(ref_0_u16);
808*09537850SAkhilesh Sanikop     if (use_ref_1) {
809*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
810*09537850SAkhilesh Sanikop         case 2: {  // 4x4
811*09537850SAkhilesh Sanikop           const uint16x4_t val_1 = vld1_u16(ref_1_u16);
812*09537850SAkhilesh Sanikop           return Sum(vadd_u16(val_0, val_1));
813*09537850SAkhilesh Sanikop         }
814*09537850SAkhilesh Sanikop         case 3: {  // 4x8
815*09537850SAkhilesh Sanikop           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
816*09537850SAkhilesh Sanikop           const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
817*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, val_1));
818*09537850SAkhilesh Sanikop         }
819*09537850SAkhilesh Sanikop         case 4: {  // 4x16
820*09537850SAkhilesh Sanikop           const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
821*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
822*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
823*09537850SAkhilesh Sanikop         }
824*09537850SAkhilesh Sanikop       }
825*09537850SAkhilesh Sanikop     }
826*09537850SAkhilesh Sanikop     // 4x1
827*09537850SAkhilesh Sanikop     return Sum(val_0);
828*09537850SAkhilesh Sanikop   }
829*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 3) {
830*09537850SAkhilesh Sanikop     const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
831*09537850SAkhilesh Sanikop     if (use_ref_1) {
832*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
833*09537850SAkhilesh Sanikop         case 2: {  // 8x4
834*09537850SAkhilesh Sanikop           const uint16x4_t val_1 = vld1_u16(ref_1_u16);
835*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
836*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(val_0, sum_1));
837*09537850SAkhilesh Sanikop         }
838*09537850SAkhilesh Sanikop         case 3: {  // 8x8
839*09537850SAkhilesh Sanikop           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
840*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(val_0, val_1));
841*09537850SAkhilesh Sanikop         }
842*09537850SAkhilesh Sanikop         case 4: {  // 8x16
843*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
844*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(val_0, sum_1));
845*09537850SAkhilesh Sanikop         }
846*09537850SAkhilesh Sanikop         case 5: {  // 8x32
847*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
848*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(val_0, sum_1));
849*09537850SAkhilesh Sanikop         }
850*09537850SAkhilesh Sanikop       }
851*09537850SAkhilesh Sanikop     }
852*09537850SAkhilesh Sanikop     // 8x1
853*09537850SAkhilesh Sanikop     return Sum(val_0);
854*09537850SAkhilesh Sanikop   }
855*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 4) {
856*09537850SAkhilesh Sanikop     const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
857*09537850SAkhilesh Sanikop     if (use_ref_1) {
858*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
859*09537850SAkhilesh Sanikop         case 2: {  // 16x4
860*09537850SAkhilesh Sanikop           const uint16x4_t val_1 = vld1_u16(ref_1_u16);
861*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
862*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
863*09537850SAkhilesh Sanikop         }
864*09537850SAkhilesh Sanikop         case 3: {  // 16x8
865*09537850SAkhilesh Sanikop           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
866*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, val_1));
867*09537850SAkhilesh Sanikop         }
868*09537850SAkhilesh Sanikop         case 4: {  // 16x16
869*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
870*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
871*09537850SAkhilesh Sanikop         }
872*09537850SAkhilesh Sanikop         case 5: {  // 16x32
873*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
874*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
875*09537850SAkhilesh Sanikop         }
876*09537850SAkhilesh Sanikop         case 6: {  // 16x64
877*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
878*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
879*09537850SAkhilesh Sanikop         }
880*09537850SAkhilesh Sanikop       }
881*09537850SAkhilesh Sanikop     }
882*09537850SAkhilesh Sanikop     // 16x1
883*09537850SAkhilesh Sanikop     return Sum(sum_0);
884*09537850SAkhilesh Sanikop   }
885*09537850SAkhilesh Sanikop   if (ref_0_size_log2 == 5) {
886*09537850SAkhilesh Sanikop     const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
887*09537850SAkhilesh Sanikop     if (use_ref_1) {
888*09537850SAkhilesh Sanikop       switch (ref_1_size_log2) {
889*09537850SAkhilesh Sanikop         case 3: {  // 32x8
890*09537850SAkhilesh Sanikop           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
891*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, val_1));
892*09537850SAkhilesh Sanikop         }
893*09537850SAkhilesh Sanikop         case 4: {  // 32x16
894*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
895*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
896*09537850SAkhilesh Sanikop         }
897*09537850SAkhilesh Sanikop         case 5: {  // 32x32
898*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
899*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
900*09537850SAkhilesh Sanikop         }
901*09537850SAkhilesh Sanikop         case 6: {  // 32x64
902*09537850SAkhilesh Sanikop           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
903*09537850SAkhilesh Sanikop           return Sum(vaddq_u16(sum_0, sum_1));
904*09537850SAkhilesh Sanikop         }
905*09537850SAkhilesh Sanikop       }
906*09537850SAkhilesh Sanikop     }
907*09537850SAkhilesh Sanikop     // 32x1
908*09537850SAkhilesh Sanikop     return Sum(sum_0);
909*09537850SAkhilesh Sanikop   }
910*09537850SAkhilesh Sanikop 
911*09537850SAkhilesh Sanikop   assert(ref_0_size_log2 == 6);
912*09537850SAkhilesh Sanikop   const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
913*09537850SAkhilesh Sanikop   if (use_ref_1) {
914*09537850SAkhilesh Sanikop     switch (ref_1_size_log2) {
915*09537850SAkhilesh Sanikop       case 4: {  // 64x16
916*09537850SAkhilesh Sanikop         const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
917*09537850SAkhilesh Sanikop         return Sum(vaddq_u16(sum_0, sum_1));
918*09537850SAkhilesh Sanikop       }
919*09537850SAkhilesh Sanikop       case 5: {  // 64x32
920*09537850SAkhilesh Sanikop         const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
921*09537850SAkhilesh Sanikop         return Sum(vaddq_u16(sum_0, sum_1));
922*09537850SAkhilesh Sanikop       }
923*09537850SAkhilesh Sanikop       case 6: {  // 64x64
924*09537850SAkhilesh Sanikop         const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
925*09537850SAkhilesh Sanikop         return Sum(vaddq_u16(sum_0, sum_1));
926*09537850SAkhilesh Sanikop       }
927*09537850SAkhilesh Sanikop     }
928*09537850SAkhilesh Sanikop   }
929*09537850SAkhilesh Sanikop   // 64x1
930*09537850SAkhilesh Sanikop   return Sum(sum_0);
931*09537850SAkhilesh Sanikop }
932*09537850SAkhilesh Sanikop 
933*09537850SAkhilesh Sanikop template <int width, int height>
DcStore_NEON(void * const dest,ptrdiff_t stride,const uint32x2_t dc)934*09537850SAkhilesh Sanikop inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
935*09537850SAkhilesh Sanikop                          const uint32x2_t dc) {
936*09537850SAkhilesh Sanikop   auto* dest_u16 = static_cast<uint16_t*>(dest);
937*09537850SAkhilesh Sanikop   ptrdiff_t stride_u16 = stride >> 1;
938*09537850SAkhilesh Sanikop   const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
939*09537850SAkhilesh Sanikop   if (width == 4) {
940*09537850SAkhilesh Sanikop     int i = height - 1;
941*09537850SAkhilesh Sanikop     do {
942*09537850SAkhilesh Sanikop       vst1_u16(dest_u16, vget_low_u16(dc_dup));
943*09537850SAkhilesh Sanikop       dest_u16 += stride_u16;
944*09537850SAkhilesh Sanikop     } while (--i != 0);
945*09537850SAkhilesh Sanikop     vst1_u16(dest_u16, vget_low_u16(dc_dup));
946*09537850SAkhilesh Sanikop   } else if (width == 8) {
947*09537850SAkhilesh Sanikop     int i = height - 1;
948*09537850SAkhilesh Sanikop     do {
949*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16, dc_dup);
950*09537850SAkhilesh Sanikop       dest_u16 += stride_u16;
951*09537850SAkhilesh Sanikop     } while (--i != 0);
952*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16, dc_dup);
953*09537850SAkhilesh Sanikop   } else if (width == 16) {
954*09537850SAkhilesh Sanikop     int i = height - 1;
955*09537850SAkhilesh Sanikop     do {
956*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16, dc_dup);
957*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 8, dc_dup);
958*09537850SAkhilesh Sanikop       dest_u16 += stride_u16;
959*09537850SAkhilesh Sanikop     } while (--i != 0);
960*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16, dc_dup);
961*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 8, dc_dup);
962*09537850SAkhilesh Sanikop   } else if (width == 32) {
963*09537850SAkhilesh Sanikop     int i = height - 1;
964*09537850SAkhilesh Sanikop     do {
965*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16, dc_dup);
966*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 8, dc_dup);
967*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 16, dc_dup);
968*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 24, dc_dup);
969*09537850SAkhilesh Sanikop       dest_u16 += stride_u16;
970*09537850SAkhilesh Sanikop     } while (--i != 0);
971*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16, dc_dup);
972*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 8, dc_dup);
973*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 16, dc_dup);
974*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 24, dc_dup);
975*09537850SAkhilesh Sanikop   } else {
976*09537850SAkhilesh Sanikop     assert(width == 64);
977*09537850SAkhilesh Sanikop     int i = height - 1;
978*09537850SAkhilesh Sanikop     do {
979*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16, dc_dup);
980*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 8, dc_dup);
981*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 16, dc_dup);
982*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 24, dc_dup);
983*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 32, dc_dup);
984*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 40, dc_dup);
985*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 48, dc_dup);
986*09537850SAkhilesh Sanikop       vst1q_u16(dest_u16 + 56, dc_dup);
987*09537850SAkhilesh Sanikop       dest_u16 += stride_u16;
988*09537850SAkhilesh Sanikop     } while (--i != 0);
989*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16, dc_dup);
990*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 8, dc_dup);
991*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 16, dc_dup);
992*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 24, dc_dup);
993*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 32, dc_dup);
994*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 40, dc_dup);
995*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 48, dc_dup);
996*09537850SAkhilesh Sanikop     vst1q_u16(dest_u16 + 56, dc_dup);
997*09537850SAkhilesh Sanikop   }
998*09537850SAkhilesh Sanikop }
999*09537850SAkhilesh Sanikop 
1000*09537850SAkhilesh Sanikop struct DcDefs {
1001*09537850SAkhilesh Sanikop   DcDefs() = delete;
1002*09537850SAkhilesh Sanikop 
1003*09537850SAkhilesh Sanikop   using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
1004*09537850SAkhilesh Sanikop   using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
1005*09537850SAkhilesh Sanikop   using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
1006*09537850SAkhilesh Sanikop   using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
1007*09537850SAkhilesh Sanikop   using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
1008*09537850SAkhilesh Sanikop   using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
1009*09537850SAkhilesh Sanikop   using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
1010*09537850SAkhilesh Sanikop   using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
1011*09537850SAkhilesh Sanikop   using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
1012*09537850SAkhilesh Sanikop   using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
1013*09537850SAkhilesh Sanikop   using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
1014*09537850SAkhilesh Sanikop   using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
1015*09537850SAkhilesh Sanikop   using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
1016*09537850SAkhilesh Sanikop   using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
1017*09537850SAkhilesh Sanikop   using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
1018*09537850SAkhilesh Sanikop   using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
1019*09537850SAkhilesh Sanikop   using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
1020*09537850SAkhilesh Sanikop   using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
1021*09537850SAkhilesh Sanikop   using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
1022*09537850SAkhilesh Sanikop };
1023*09537850SAkhilesh Sanikop 
1024*09537850SAkhilesh Sanikop // IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
1025*09537850SAkhilesh Sanikop 
1026*09537850SAkhilesh Sanikop template <int block_height>
Horizontal4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1027*09537850SAkhilesh Sanikop void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1028*09537850SAkhilesh Sanikop                         const void* /*top_row*/,
1029*09537850SAkhilesh Sanikop                         const void* LIBGAV1_RESTRICT const left_column) {
1030*09537850SAkhilesh Sanikop   const auto* const left = static_cast<const uint16_t*>(left_column);
1031*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1032*09537850SAkhilesh Sanikop   int y = 0;
1033*09537850SAkhilesh Sanikop   do {
1034*09537850SAkhilesh Sanikop     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1035*09537850SAkhilesh Sanikop     const uint16x4_t row = vld1_dup_u16(left + y);
1036*09537850SAkhilesh Sanikop     vst1_u16(dst16, row);
1037*09537850SAkhilesh Sanikop     dst += stride;
1038*09537850SAkhilesh Sanikop   } while (++y < block_height);
1039*09537850SAkhilesh Sanikop }
1040*09537850SAkhilesh Sanikop 
1041*09537850SAkhilesh Sanikop template <int block_height>
Horizontal8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1042*09537850SAkhilesh Sanikop void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1043*09537850SAkhilesh Sanikop                         const void* /*top_row*/,
1044*09537850SAkhilesh Sanikop                         const void* LIBGAV1_RESTRICT const left_column) {
1045*09537850SAkhilesh Sanikop   const auto* const left = static_cast<const uint16_t*>(left_column);
1046*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1047*09537850SAkhilesh Sanikop   int y = 0;
1048*09537850SAkhilesh Sanikop   do {
1049*09537850SAkhilesh Sanikop     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1050*09537850SAkhilesh Sanikop     const uint16x8_t row = vld1q_dup_u16(left + y);
1051*09537850SAkhilesh Sanikop     vst1q_u16(dst16, row);
1052*09537850SAkhilesh Sanikop     dst += stride;
1053*09537850SAkhilesh Sanikop   } while (++y < block_height);
1054*09537850SAkhilesh Sanikop }
1055*09537850SAkhilesh Sanikop 
1056*09537850SAkhilesh Sanikop template <int block_height>
Horizontal16xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1057*09537850SAkhilesh Sanikop void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1058*09537850SAkhilesh Sanikop                          const void* /*top_row*/,
1059*09537850SAkhilesh Sanikop                          const void* LIBGAV1_RESTRICT const left_column) {
1060*09537850SAkhilesh Sanikop   const auto* const left = static_cast<const uint16_t*>(left_column);
1061*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1062*09537850SAkhilesh Sanikop   int y = 0;
1063*09537850SAkhilesh Sanikop   do {
1064*09537850SAkhilesh Sanikop     const uint16x8_t row0 = vld1q_dup_u16(left + y);
1065*09537850SAkhilesh Sanikop     const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
1066*09537850SAkhilesh Sanikop     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1067*09537850SAkhilesh Sanikop     vst1q_u16(dst16, row0);
1068*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 8, row0);
1069*09537850SAkhilesh Sanikop     dst += stride;
1070*09537850SAkhilesh Sanikop     dst16 = reinterpret_cast<uint16_t*>(dst);
1071*09537850SAkhilesh Sanikop     vst1q_u16(dst16, row1);
1072*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 8, row1);
1073*09537850SAkhilesh Sanikop     dst += stride;
1074*09537850SAkhilesh Sanikop     y += 2;
1075*09537850SAkhilesh Sanikop   } while (y < block_height);
1076*09537850SAkhilesh Sanikop }
1077*09537850SAkhilesh Sanikop 
1078*09537850SAkhilesh Sanikop template <int block_height>
Horizontal32xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1079*09537850SAkhilesh Sanikop void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1080*09537850SAkhilesh Sanikop                          const void* /*top_row*/,
1081*09537850SAkhilesh Sanikop                          const void* LIBGAV1_RESTRICT const left_column) {
1082*09537850SAkhilesh Sanikop   const auto* const left = static_cast<const uint16_t*>(left_column);
1083*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1084*09537850SAkhilesh Sanikop   int y = 0;
1085*09537850SAkhilesh Sanikop   do {
1086*09537850SAkhilesh Sanikop     const uint16x8_t row0 = vld1q_dup_u16(left + y);
1087*09537850SAkhilesh Sanikop     const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
1088*09537850SAkhilesh Sanikop     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1089*09537850SAkhilesh Sanikop     vst1q_u16(dst16, row0);
1090*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 8, row0);
1091*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 16, row0);
1092*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 24, row0);
1093*09537850SAkhilesh Sanikop     dst += stride;
1094*09537850SAkhilesh Sanikop     dst16 = reinterpret_cast<uint16_t*>(dst);
1095*09537850SAkhilesh Sanikop     vst1q_u16(dst16, row1);
1096*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 8, row1);
1097*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 16, row1);
1098*09537850SAkhilesh Sanikop     vst1q_u16(dst16 + 24, row1);
1099*09537850SAkhilesh Sanikop     dst += stride;
1100*09537850SAkhilesh Sanikop     y += 2;
1101*09537850SAkhilesh Sanikop   } while (y < block_height);
1102*09537850SAkhilesh Sanikop }
1103*09537850SAkhilesh Sanikop 
1104*09537850SAkhilesh Sanikop // IntraPredFuncs_NEON::Vertical -- copy top row to all rows
1105*09537850SAkhilesh Sanikop 
1106*09537850SAkhilesh Sanikop template <int block_height>
Vertical4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1107*09537850SAkhilesh Sanikop void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1108*09537850SAkhilesh Sanikop                       const void* LIBGAV1_RESTRICT const top_row,
1109*09537850SAkhilesh Sanikop                       const void* const /*left_column*/) {
1110*09537850SAkhilesh Sanikop   const auto* const top = static_cast<const uint8_t*>(top_row);
1111*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1112*09537850SAkhilesh Sanikop   const uint8x8_t row = vld1_u8(top);
1113*09537850SAkhilesh Sanikop   int y = block_height;
1114*09537850SAkhilesh Sanikop   do {
1115*09537850SAkhilesh Sanikop     vst1_u8(dst, row);
1116*09537850SAkhilesh Sanikop     dst += stride;
1117*09537850SAkhilesh Sanikop   } while (--y != 0);
1118*09537850SAkhilesh Sanikop }
1119*09537850SAkhilesh Sanikop 
1120*09537850SAkhilesh Sanikop template <int block_height>
Vertical8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1121*09537850SAkhilesh Sanikop void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1122*09537850SAkhilesh Sanikop                       const void* LIBGAV1_RESTRICT const top_row,
1123*09537850SAkhilesh Sanikop                       const void* const /*left_column*/) {
1124*09537850SAkhilesh Sanikop   const auto* const top = static_cast<const uint8_t*>(top_row);
1125*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1126*09537850SAkhilesh Sanikop   const uint8x16_t row = vld1q_u8(top);
1127*09537850SAkhilesh Sanikop   int y = block_height;
1128*09537850SAkhilesh Sanikop   do {
1129*09537850SAkhilesh Sanikop     vst1q_u8(dst, row);
1130*09537850SAkhilesh Sanikop     dst += stride;
1131*09537850SAkhilesh Sanikop   } while (--y != 0);
1132*09537850SAkhilesh Sanikop }
1133*09537850SAkhilesh Sanikop 
1134*09537850SAkhilesh Sanikop template <int block_height>
Vertical16xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1135*09537850SAkhilesh Sanikop void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1136*09537850SAkhilesh Sanikop                        const void* LIBGAV1_RESTRICT const top_row,
1137*09537850SAkhilesh Sanikop                        const void* const /*left_column*/) {
1138*09537850SAkhilesh Sanikop   const auto* const top = static_cast<const uint8_t*>(top_row);
1139*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1140*09537850SAkhilesh Sanikop   const uint8x16_t row0 = vld1q_u8(top);
1141*09537850SAkhilesh Sanikop   const uint8x16_t row1 = vld1q_u8(top + 16);
1142*09537850SAkhilesh Sanikop   int y = block_height;
1143*09537850SAkhilesh Sanikop   do {
1144*09537850SAkhilesh Sanikop     vst1q_u8(dst, row0);
1145*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, row1);
1146*09537850SAkhilesh Sanikop     dst += stride;
1147*09537850SAkhilesh Sanikop     vst1q_u8(dst, row0);
1148*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, row1);
1149*09537850SAkhilesh Sanikop     dst += stride;
1150*09537850SAkhilesh Sanikop     y -= 2;
1151*09537850SAkhilesh Sanikop   } while (y != 0);
1152*09537850SAkhilesh Sanikop }
1153*09537850SAkhilesh Sanikop 
1154*09537850SAkhilesh Sanikop template <int block_height>
Vertical32xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1155*09537850SAkhilesh Sanikop void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1156*09537850SAkhilesh Sanikop                        const void* LIBGAV1_RESTRICT const top_row,
1157*09537850SAkhilesh Sanikop                        const void* const /*left_column*/) {
1158*09537850SAkhilesh Sanikop   const auto* const top = static_cast<const uint8_t*>(top_row);
1159*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1160*09537850SAkhilesh Sanikop   const uint8x16_t row0 = vld1q_u8(top);
1161*09537850SAkhilesh Sanikop   const uint8x16_t row1 = vld1q_u8(top + 16);
1162*09537850SAkhilesh Sanikop   const uint8x16_t row2 = vld1q_u8(top + 32);
1163*09537850SAkhilesh Sanikop   const uint8x16_t row3 = vld1q_u8(top + 48);
1164*09537850SAkhilesh Sanikop   int y = block_height;
1165*09537850SAkhilesh Sanikop   do {
1166*09537850SAkhilesh Sanikop     vst1q_u8(dst, row0);
1167*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, row1);
1168*09537850SAkhilesh Sanikop     vst1q_u8(dst + 32, row2);
1169*09537850SAkhilesh Sanikop     vst1q_u8(dst + 48, row3);
1170*09537850SAkhilesh Sanikop     dst += stride;
1171*09537850SAkhilesh Sanikop     vst1q_u8(dst, row0);
1172*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, row1);
1173*09537850SAkhilesh Sanikop     vst1q_u8(dst + 32, row2);
1174*09537850SAkhilesh Sanikop     vst1q_u8(dst + 48, row3);
1175*09537850SAkhilesh Sanikop     dst += stride;
1176*09537850SAkhilesh Sanikop     y -= 2;
1177*09537850SAkhilesh Sanikop   } while (y != 0);
1178*09537850SAkhilesh Sanikop }
1179*09537850SAkhilesh Sanikop 
1180*09537850SAkhilesh Sanikop template <int block_height>
Vertical64xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1181*09537850SAkhilesh Sanikop void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1182*09537850SAkhilesh Sanikop                        const void* LIBGAV1_RESTRICT const top_row,
1183*09537850SAkhilesh Sanikop                        const void* const /*left_column*/) {
1184*09537850SAkhilesh Sanikop   const auto* const top = static_cast<const uint8_t*>(top_row);
1185*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1186*09537850SAkhilesh Sanikop   const uint8x16_t row0 = vld1q_u8(top);
1187*09537850SAkhilesh Sanikop   const uint8x16_t row1 = vld1q_u8(top + 16);
1188*09537850SAkhilesh Sanikop   const uint8x16_t row2 = vld1q_u8(top + 32);
1189*09537850SAkhilesh Sanikop   const uint8x16_t row3 = vld1q_u8(top + 48);
1190*09537850SAkhilesh Sanikop   const uint8x16_t row4 = vld1q_u8(top + 64);
1191*09537850SAkhilesh Sanikop   const uint8x16_t row5 = vld1q_u8(top + 80);
1192*09537850SAkhilesh Sanikop   const uint8x16_t row6 = vld1q_u8(top + 96);
1193*09537850SAkhilesh Sanikop   const uint8x16_t row7 = vld1q_u8(top + 112);
1194*09537850SAkhilesh Sanikop   int y = block_height;
1195*09537850SAkhilesh Sanikop   do {
1196*09537850SAkhilesh Sanikop     vst1q_u8(dst, row0);
1197*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, row1);
1198*09537850SAkhilesh Sanikop     vst1q_u8(dst + 32, row2);
1199*09537850SAkhilesh Sanikop     vst1q_u8(dst + 48, row3);
1200*09537850SAkhilesh Sanikop     vst1q_u8(dst + 64, row4);
1201*09537850SAkhilesh Sanikop     vst1q_u8(dst + 80, row5);
1202*09537850SAkhilesh Sanikop     vst1q_u8(dst + 96, row6);
1203*09537850SAkhilesh Sanikop     vst1q_u8(dst + 112, row7);
1204*09537850SAkhilesh Sanikop     dst += stride;
1205*09537850SAkhilesh Sanikop     vst1q_u8(dst, row0);
1206*09537850SAkhilesh Sanikop     vst1q_u8(dst + 16, row1);
1207*09537850SAkhilesh Sanikop     vst1q_u8(dst + 32, row2);
1208*09537850SAkhilesh Sanikop     vst1q_u8(dst + 48, row3);
1209*09537850SAkhilesh Sanikop     vst1q_u8(dst + 64, row4);
1210*09537850SAkhilesh Sanikop     vst1q_u8(dst + 80, row5);
1211*09537850SAkhilesh Sanikop     vst1q_u8(dst + 96, row6);
1212*09537850SAkhilesh Sanikop     vst1q_u8(dst + 112, row7);
1213*09537850SAkhilesh Sanikop     dst += stride;
1214*09537850SAkhilesh Sanikop     y -= 2;
1215*09537850SAkhilesh Sanikop   } while (y != 0);
1216*09537850SAkhilesh Sanikop }
1217*09537850SAkhilesh Sanikop 
1218*09537850SAkhilesh Sanikop template <int height>
Paeth4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_ptr,const void * LIBGAV1_RESTRICT const left_ptr)1219*09537850SAkhilesh Sanikop inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1220*09537850SAkhilesh Sanikop                           const void* LIBGAV1_RESTRICT const top_ptr,
1221*09537850SAkhilesh Sanikop                           const void* LIBGAV1_RESTRICT const left_ptr) {
1222*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1223*09537850SAkhilesh Sanikop   const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1224*09537850SAkhilesh Sanikop   const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1225*09537850SAkhilesh Sanikop 
1226*09537850SAkhilesh Sanikop   const uint16x4_t top_left = vdup_n_u16(top_row[-1]);
1227*09537850SAkhilesh Sanikop   const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1);
1228*09537850SAkhilesh Sanikop   const uint16x4_t top = vld1_u16(top_row);
1229*09537850SAkhilesh Sanikop 
1230*09537850SAkhilesh Sanikop   for (int y = 0; y < height; ++y) {
1231*09537850SAkhilesh Sanikop     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1232*09537850SAkhilesh Sanikop     const uint16x4_t left = vdup_n_u16(left_col[y]);
1233*09537850SAkhilesh Sanikop 
1234*09537850SAkhilesh Sanikop     const uint16x4_t left_dist = vabd_u16(top, top_left);
1235*09537850SAkhilesh Sanikop     const uint16x4_t top_dist = vabd_u16(left, top_left);
1236*09537850SAkhilesh Sanikop     const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2);
1237*09537850SAkhilesh Sanikop 
1238*09537850SAkhilesh Sanikop     const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist);
1239*09537850SAkhilesh Sanikop     const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist);
1240*09537850SAkhilesh Sanikop     const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist);
1241*09537850SAkhilesh Sanikop 
1242*09537850SAkhilesh Sanikop     // if (left_dist <= top_dist && left_dist <= top_left_dist)
1243*09537850SAkhilesh Sanikop     const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left);
1244*09537850SAkhilesh Sanikop     //   dest[x] = left_column[y];
1245*09537850SAkhilesh Sanikop     // Fill all the unused spaces with 'top'. They will be overwritten when
1246*09537850SAkhilesh Sanikop     // the positions for top_left are known.
1247*09537850SAkhilesh Sanikop     uint16x4_t result = vbsl_u16(left_mask, left, top);
1248*09537850SAkhilesh Sanikop     // else if (top_dist <= top_left_dist)
1249*09537850SAkhilesh Sanikop     //   dest[x] = top_row[x];
1250*09537850SAkhilesh Sanikop     // Add these values to the mask. They were already set.
1251*09537850SAkhilesh Sanikop     const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left);
1252*09537850SAkhilesh Sanikop     // else
1253*09537850SAkhilesh Sanikop     //   dest[x] = top_left;
1254*09537850SAkhilesh Sanikop     result = vbsl_u16(left_or_top_mask, result, top_left);
1255*09537850SAkhilesh Sanikop 
1256*09537850SAkhilesh Sanikop     vst1_u16(dst16, result);
1257*09537850SAkhilesh Sanikop     dst += stride;
1258*09537850SAkhilesh Sanikop   }
1259*09537850SAkhilesh Sanikop }
1260*09537850SAkhilesh Sanikop 
1261*09537850SAkhilesh Sanikop template <int height>
Paeth8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_ptr,const void * LIBGAV1_RESTRICT const left_ptr)1262*09537850SAkhilesh Sanikop inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1263*09537850SAkhilesh Sanikop                           const void* LIBGAV1_RESTRICT const top_ptr,
1264*09537850SAkhilesh Sanikop                           const void* LIBGAV1_RESTRICT const left_ptr) {
1265*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1266*09537850SAkhilesh Sanikop   const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1267*09537850SAkhilesh Sanikop   const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1268*09537850SAkhilesh Sanikop 
1269*09537850SAkhilesh Sanikop   const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
1270*09537850SAkhilesh Sanikop   const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
1271*09537850SAkhilesh Sanikop   const uint16x8_t top = vld1q_u16(top_row);
1272*09537850SAkhilesh Sanikop 
1273*09537850SAkhilesh Sanikop   for (int y = 0; y < height; ++y) {
1274*09537850SAkhilesh Sanikop     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1275*09537850SAkhilesh Sanikop     const uint16x8_t left = vdupq_n_u16(left_col[y]);
1276*09537850SAkhilesh Sanikop 
1277*09537850SAkhilesh Sanikop     const uint16x8_t left_dist = vabdq_u16(top, top_left);
1278*09537850SAkhilesh Sanikop     const uint16x8_t top_dist = vabdq_u16(left, top_left);
1279*09537850SAkhilesh Sanikop     const uint16x8_t top_left_dist =
1280*09537850SAkhilesh Sanikop         vabdq_u16(vaddq_u16(top, left), top_left_x2);
1281*09537850SAkhilesh Sanikop 
1282*09537850SAkhilesh Sanikop     const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
1283*09537850SAkhilesh Sanikop     const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
1284*09537850SAkhilesh Sanikop     const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
1285*09537850SAkhilesh Sanikop 
1286*09537850SAkhilesh Sanikop     // if (left_dist <= top_dist && left_dist <= top_left_dist)
1287*09537850SAkhilesh Sanikop     const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
1288*09537850SAkhilesh Sanikop     //   dest[x] = left_column[y];
1289*09537850SAkhilesh Sanikop     // Fill all the unused spaces with 'top'. They will be overwritten when
1290*09537850SAkhilesh Sanikop     // the positions for top_left are known.
1291*09537850SAkhilesh Sanikop     uint16x8_t result = vbslq_u16(left_mask, left, top);
1292*09537850SAkhilesh Sanikop     // else if (top_dist <= top_left_dist)
1293*09537850SAkhilesh Sanikop     //   dest[x] = top_row[x];
1294*09537850SAkhilesh Sanikop     // Add these values to the mask. They were already set.
1295*09537850SAkhilesh Sanikop     const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
1296*09537850SAkhilesh Sanikop     // else
1297*09537850SAkhilesh Sanikop     //   dest[x] = top_left;
1298*09537850SAkhilesh Sanikop     result = vbslq_u16(left_or_top_mask, result, top_left);
1299*09537850SAkhilesh Sanikop 
1300*09537850SAkhilesh Sanikop     vst1q_u16(dst16, result);
1301*09537850SAkhilesh Sanikop     dst += stride;
1302*09537850SAkhilesh Sanikop   }
1303*09537850SAkhilesh Sanikop }
1304*09537850SAkhilesh Sanikop 
1305*09537850SAkhilesh Sanikop // For 16xH and above.
1306*09537850SAkhilesh Sanikop template <int width, int height>
PaethWxH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_ptr,const void * LIBGAV1_RESTRICT const left_ptr)1307*09537850SAkhilesh Sanikop inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1308*09537850SAkhilesh Sanikop                           const void* LIBGAV1_RESTRICT const top_ptr,
1309*09537850SAkhilesh Sanikop                           const void* LIBGAV1_RESTRICT const left_ptr) {
1310*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
1311*09537850SAkhilesh Sanikop   const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1312*09537850SAkhilesh Sanikop   const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1313*09537850SAkhilesh Sanikop 
1314*09537850SAkhilesh Sanikop   const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
1315*09537850SAkhilesh Sanikop   const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
1316*09537850SAkhilesh Sanikop 
1317*09537850SAkhilesh Sanikop   uint16x8_t top[width >> 3];
1318*09537850SAkhilesh Sanikop   for (int i = 0; i < width >> 3; ++i) {
1319*09537850SAkhilesh Sanikop     top[i] = vld1q_u16(top_row + (i << 3));
1320*09537850SAkhilesh Sanikop   }
1321*09537850SAkhilesh Sanikop 
1322*09537850SAkhilesh Sanikop   for (int y = 0; y < height; ++y) {
1323*09537850SAkhilesh Sanikop     auto* dst_x = reinterpret_cast<uint16_t*>(dst);
1324*09537850SAkhilesh Sanikop     const uint16x8_t left = vdupq_n_u16(left_col[y]);
1325*09537850SAkhilesh Sanikop     const uint16x8_t top_dist = vabdq_u16(left, top_left);
1326*09537850SAkhilesh Sanikop 
1327*09537850SAkhilesh Sanikop     for (int i = 0; i < (width >> 3); ++i) {
1328*09537850SAkhilesh Sanikop       const uint16x8_t left_dist = vabdq_u16(top[i], top_left);
1329*09537850SAkhilesh Sanikop       const uint16x8_t top_left_dist =
1330*09537850SAkhilesh Sanikop           vabdq_u16(vaddq_u16(top[i], left), top_left_x2);
1331*09537850SAkhilesh Sanikop 
1332*09537850SAkhilesh Sanikop       const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
1333*09537850SAkhilesh Sanikop       const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
1334*09537850SAkhilesh Sanikop       const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
1335*09537850SAkhilesh Sanikop 
1336*09537850SAkhilesh Sanikop       // if (left_dist <= top_dist && left_dist <= top_left_dist)
1337*09537850SAkhilesh Sanikop       const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
1338*09537850SAkhilesh Sanikop       //   dest[x] = left_column[y];
1339*09537850SAkhilesh Sanikop       // Fill all the unused spaces with 'top'. They will be overwritten when
1340*09537850SAkhilesh Sanikop       // the positions for top_left are known.
1341*09537850SAkhilesh Sanikop       uint16x8_t result = vbslq_u16(left_mask, left, top[i]);
1342*09537850SAkhilesh Sanikop       // else if (top_dist <= top_left_dist)
1343*09537850SAkhilesh Sanikop       //   dest[x] = top_row[x];
1344*09537850SAkhilesh Sanikop       // Add these values to the mask. They were already set.
1345*09537850SAkhilesh Sanikop       const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
1346*09537850SAkhilesh Sanikop       // else
1347*09537850SAkhilesh Sanikop       //   dest[x] = top_left;
1348*09537850SAkhilesh Sanikop       result = vbslq_u16(left_or_top_mask, result, top_left);
1349*09537850SAkhilesh Sanikop 
1350*09537850SAkhilesh Sanikop       vst1q_u16(dst_x, result);
1351*09537850SAkhilesh Sanikop       dst_x += 8;
1352*09537850SAkhilesh Sanikop     }
1353*09537850SAkhilesh Sanikop     dst += stride;
1354*09537850SAkhilesh Sanikop   }
1355*09537850SAkhilesh Sanikop }
1356*09537850SAkhilesh Sanikop 
Init10bpp()1357*09537850SAkhilesh Sanikop void Init10bpp() {
1358*09537850SAkhilesh Sanikop   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
1359*09537850SAkhilesh Sanikop   assert(dsp != nullptr);
1360*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
1361*09537850SAkhilesh Sanikop       DcDefs::_4x4::DcTop;
1362*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
1363*09537850SAkhilesh Sanikop       DcDefs::_4x4::DcLeft;
1364*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
1365*09537850SAkhilesh Sanikop       DcDefs::_4x4::Dc;
1366*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
1367*09537850SAkhilesh Sanikop       Vertical4xH_NEON<4>;
1368*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
1369*09537850SAkhilesh Sanikop       Paeth4xH_NEON<4>;
1370*09537850SAkhilesh Sanikop 
1371*09537850SAkhilesh Sanikop   // 4x8
1372*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
1373*09537850SAkhilesh Sanikop       DcDefs::_4x8::DcTop;
1374*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
1375*09537850SAkhilesh Sanikop       DcDefs::_4x8::DcLeft;
1376*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
1377*09537850SAkhilesh Sanikop       DcDefs::_4x8::Dc;
1378*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
1379*09537850SAkhilesh Sanikop       Horizontal4xH_NEON<8>;
1380*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
1381*09537850SAkhilesh Sanikop       Vertical4xH_NEON<8>;
1382*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
1383*09537850SAkhilesh Sanikop       Paeth4xH_NEON<8>;
1384*09537850SAkhilesh Sanikop 
1385*09537850SAkhilesh Sanikop   // 4x16
1386*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
1387*09537850SAkhilesh Sanikop       DcDefs::_4x16::DcTop;
1388*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
1389*09537850SAkhilesh Sanikop       DcDefs::_4x16::DcLeft;
1390*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
1391*09537850SAkhilesh Sanikop       DcDefs::_4x16::Dc;
1392*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
1393*09537850SAkhilesh Sanikop       Horizontal4xH_NEON<16>;
1394*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
1395*09537850SAkhilesh Sanikop       Vertical4xH_NEON<16>;
1396*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
1397*09537850SAkhilesh Sanikop       Paeth4xH_NEON<16>;
1398*09537850SAkhilesh Sanikop 
1399*09537850SAkhilesh Sanikop   // 8x4
1400*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
1401*09537850SAkhilesh Sanikop       DcDefs::_8x4::DcTop;
1402*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
1403*09537850SAkhilesh Sanikop       DcDefs::_8x4::DcLeft;
1404*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
1405*09537850SAkhilesh Sanikop       DcDefs::_8x4::Dc;
1406*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
1407*09537850SAkhilesh Sanikop       Vertical8xH_NEON<4>;
1408*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
1409*09537850SAkhilesh Sanikop       Paeth8xH_NEON<4>;
1410*09537850SAkhilesh Sanikop 
1411*09537850SAkhilesh Sanikop   // 8x8
1412*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
1413*09537850SAkhilesh Sanikop       DcDefs::_8x8::DcTop;
1414*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
1415*09537850SAkhilesh Sanikop       DcDefs::_8x8::DcLeft;
1416*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
1417*09537850SAkhilesh Sanikop       DcDefs::_8x8::Dc;
1418*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
1419*09537850SAkhilesh Sanikop       Horizontal8xH_NEON<8>;
1420*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
1421*09537850SAkhilesh Sanikop       Vertical8xH_NEON<8>;
1422*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
1423*09537850SAkhilesh Sanikop       Paeth8xH_NEON<8>;
1424*09537850SAkhilesh Sanikop 
1425*09537850SAkhilesh Sanikop   // 8x16
1426*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
1427*09537850SAkhilesh Sanikop       DcDefs::_8x16::DcTop;
1428*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
1429*09537850SAkhilesh Sanikop       DcDefs::_8x16::DcLeft;
1430*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
1431*09537850SAkhilesh Sanikop       DcDefs::_8x16::Dc;
1432*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
1433*09537850SAkhilesh Sanikop       Vertical8xH_NEON<16>;
1434*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
1435*09537850SAkhilesh Sanikop       Paeth8xH_NEON<16>;
1436*09537850SAkhilesh Sanikop 
1437*09537850SAkhilesh Sanikop   // 8x32
1438*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
1439*09537850SAkhilesh Sanikop       DcDefs::_8x32::DcTop;
1440*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
1441*09537850SAkhilesh Sanikop       DcDefs::_8x32::DcLeft;
1442*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
1443*09537850SAkhilesh Sanikop       DcDefs::_8x32::Dc;
1444*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
1445*09537850SAkhilesh Sanikop       Horizontal8xH_NEON<32>;
1446*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
1447*09537850SAkhilesh Sanikop       Vertical8xH_NEON<32>;
1448*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
1449*09537850SAkhilesh Sanikop       Paeth8xH_NEON<32>;
1450*09537850SAkhilesh Sanikop 
1451*09537850SAkhilesh Sanikop   // 16x4
1452*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
1453*09537850SAkhilesh Sanikop       DcDefs::_16x4::DcTop;
1454*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
1455*09537850SAkhilesh Sanikop       DcDefs::_16x4::DcLeft;
1456*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
1457*09537850SAkhilesh Sanikop       DcDefs::_16x4::Dc;
1458*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
1459*09537850SAkhilesh Sanikop       Vertical16xH_NEON<4>;
1460*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
1461*09537850SAkhilesh Sanikop       PaethWxH_NEON<16, 4>;
1462*09537850SAkhilesh Sanikop 
1463*09537850SAkhilesh Sanikop   // 16x8
1464*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
1465*09537850SAkhilesh Sanikop       DcDefs::_16x8::DcTop;
1466*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
1467*09537850SAkhilesh Sanikop       DcDefs::_16x8::DcLeft;
1468*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
1469*09537850SAkhilesh Sanikop       DcDefs::_16x8::Dc;
1470*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
1471*09537850SAkhilesh Sanikop       Horizontal16xH_NEON<8>;
1472*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
1473*09537850SAkhilesh Sanikop       Vertical16xH_NEON<8>;
1474*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
1475*09537850SAkhilesh Sanikop       PaethWxH_NEON<16, 8>;
1476*09537850SAkhilesh Sanikop 
1477*09537850SAkhilesh Sanikop   // 16x16
1478*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
1479*09537850SAkhilesh Sanikop       DcDefs::_16x16::DcTop;
1480*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
1481*09537850SAkhilesh Sanikop       DcDefs::_16x16::DcLeft;
1482*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
1483*09537850SAkhilesh Sanikop       DcDefs::_16x16::Dc;
1484*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
1485*09537850SAkhilesh Sanikop       Vertical16xH_NEON<16>;
1486*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
1487*09537850SAkhilesh Sanikop       PaethWxH_NEON<16, 16>;
1488*09537850SAkhilesh Sanikop 
1489*09537850SAkhilesh Sanikop   // 16x32
1490*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
1491*09537850SAkhilesh Sanikop       DcDefs::_16x32::DcTop;
1492*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
1493*09537850SAkhilesh Sanikop       DcDefs::_16x32::DcLeft;
1494*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
1495*09537850SAkhilesh Sanikop       DcDefs::_16x32::Dc;
1496*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
1497*09537850SAkhilesh Sanikop       Vertical16xH_NEON<32>;
1498*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
1499*09537850SAkhilesh Sanikop       PaethWxH_NEON<16, 32>;
1500*09537850SAkhilesh Sanikop 
1501*09537850SAkhilesh Sanikop   // 16x64
1502*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
1503*09537850SAkhilesh Sanikop       DcDefs::_16x64::DcTop;
1504*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
1505*09537850SAkhilesh Sanikop       DcDefs::_16x64::DcLeft;
1506*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
1507*09537850SAkhilesh Sanikop       DcDefs::_16x64::Dc;
1508*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
1509*09537850SAkhilesh Sanikop       Vertical16xH_NEON<64>;
1510*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
1511*09537850SAkhilesh Sanikop       PaethWxH_NEON<16, 64>;
1512*09537850SAkhilesh Sanikop 
1513*09537850SAkhilesh Sanikop   // 32x8
1514*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
1515*09537850SAkhilesh Sanikop       DcDefs::_32x8::DcTop;
1516*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
1517*09537850SAkhilesh Sanikop       DcDefs::_32x8::DcLeft;
1518*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
1519*09537850SAkhilesh Sanikop       DcDefs::_32x8::Dc;
1520*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
1521*09537850SAkhilesh Sanikop       Vertical32xH_NEON<8>;
1522*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
1523*09537850SAkhilesh Sanikop       PaethWxH_NEON<32, 8>;
1524*09537850SAkhilesh Sanikop 
1525*09537850SAkhilesh Sanikop   // 32x16
1526*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
1527*09537850SAkhilesh Sanikop       DcDefs::_32x16::DcTop;
1528*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
1529*09537850SAkhilesh Sanikop       DcDefs::_32x16::DcLeft;
1530*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
1531*09537850SAkhilesh Sanikop       DcDefs::_32x16::Dc;
1532*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
1533*09537850SAkhilesh Sanikop       Vertical32xH_NEON<16>;
1534*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
1535*09537850SAkhilesh Sanikop       PaethWxH_NEON<32, 16>;
1536*09537850SAkhilesh Sanikop 
1537*09537850SAkhilesh Sanikop   // 32x32
1538*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
1539*09537850SAkhilesh Sanikop       DcDefs::_32x32::DcTop;
1540*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
1541*09537850SAkhilesh Sanikop       DcDefs::_32x32::DcLeft;
1542*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
1543*09537850SAkhilesh Sanikop       DcDefs::_32x32::Dc;
1544*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
1545*09537850SAkhilesh Sanikop       Vertical32xH_NEON<32>;
1546*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
1547*09537850SAkhilesh Sanikop       PaethWxH_NEON<32, 32>;
1548*09537850SAkhilesh Sanikop 
1549*09537850SAkhilesh Sanikop   // 32x64
1550*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
1551*09537850SAkhilesh Sanikop       DcDefs::_32x64::DcTop;
1552*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
1553*09537850SAkhilesh Sanikop       DcDefs::_32x64::DcLeft;
1554*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
1555*09537850SAkhilesh Sanikop       DcDefs::_32x64::Dc;
1556*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
1557*09537850SAkhilesh Sanikop       Horizontal32xH_NEON<64>;
1558*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
1559*09537850SAkhilesh Sanikop       Vertical32xH_NEON<64>;
1560*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
1561*09537850SAkhilesh Sanikop       PaethWxH_NEON<32, 64>;
1562*09537850SAkhilesh Sanikop 
1563*09537850SAkhilesh Sanikop   // 64x16
1564*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
1565*09537850SAkhilesh Sanikop       DcDefs::_64x16::DcTop;
1566*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
1567*09537850SAkhilesh Sanikop       DcDefs::_64x16::DcLeft;
1568*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
1569*09537850SAkhilesh Sanikop       DcDefs::_64x16::Dc;
1570*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
1571*09537850SAkhilesh Sanikop       Vertical64xH_NEON<16>;
1572*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
1573*09537850SAkhilesh Sanikop       PaethWxH_NEON<64, 16>;
1574*09537850SAkhilesh Sanikop 
1575*09537850SAkhilesh Sanikop   // 64x32
1576*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
1577*09537850SAkhilesh Sanikop       DcDefs::_64x32::DcTop;
1578*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
1579*09537850SAkhilesh Sanikop       DcDefs::_64x32::DcLeft;
1580*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
1581*09537850SAkhilesh Sanikop       DcDefs::_64x32::Dc;
1582*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
1583*09537850SAkhilesh Sanikop       Vertical64xH_NEON<32>;
1584*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
1585*09537850SAkhilesh Sanikop       PaethWxH_NEON<64, 32>;
1586*09537850SAkhilesh Sanikop 
1587*09537850SAkhilesh Sanikop   // 64x64
1588*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
1589*09537850SAkhilesh Sanikop       DcDefs::_64x64::DcTop;
1590*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
1591*09537850SAkhilesh Sanikop       DcDefs::_64x64::DcLeft;
1592*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
1593*09537850SAkhilesh Sanikop       DcDefs::_64x64::Dc;
1594*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
1595*09537850SAkhilesh Sanikop       Vertical64xH_NEON<64>;
1596*09537850SAkhilesh Sanikop   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
1597*09537850SAkhilesh Sanikop       PaethWxH_NEON<64, 64>;
1598*09537850SAkhilesh Sanikop }
1599*09537850SAkhilesh Sanikop 
1600*09537850SAkhilesh Sanikop }  // namespace
1601*09537850SAkhilesh Sanikop }  // namespace high_bitdepth
1602*09537850SAkhilesh Sanikop #endif  // LIBGAV1_MAX_BITDEPTH >= 10
1603*09537850SAkhilesh Sanikop 
IntraPredInit_NEON()1604*09537850SAkhilesh Sanikop void IntraPredInit_NEON() {
1605*09537850SAkhilesh Sanikop   low_bitdepth::Init8bpp();
1606*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
1607*09537850SAkhilesh Sanikop   high_bitdepth::Init10bpp();
1608*09537850SAkhilesh Sanikop #endif
1609*09537850SAkhilesh Sanikop }
1610*09537850SAkhilesh Sanikop 
1611*09537850SAkhilesh Sanikop }  // namespace dsp
1612*09537850SAkhilesh Sanikop }  // namespace libgav1
1613*09537850SAkhilesh Sanikop 
1614*09537850SAkhilesh Sanikop #else   // !LIBGAV1_ENABLE_NEON
1615*09537850SAkhilesh Sanikop namespace libgav1 {
1616*09537850SAkhilesh Sanikop namespace dsp {
1617*09537850SAkhilesh Sanikop 
// Stub for builds without LIBGAV1_ENABLE_NEON: keeps the symbol defined but
// registers nothing.
void IntraPredInit_NEON() {}
1619*09537850SAkhilesh Sanikop 
1620*09537850SAkhilesh Sanikop }  // namespace dsp
1621*09537850SAkhilesh Sanikop }  // namespace libgav1
1622*09537850SAkhilesh Sanikop #endif  // LIBGAV1_ENABLE_NEON
1623