xref: /aosp_15_r20/external/libgav1/src/dsp/arm/intrapred_neon.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/intrapred.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON
19 
20 #include <arm_neon.h>
21 
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25 
26 #include "src/dsp/arm/common_neon.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31 
32 namespace libgav1 {
33 namespace dsp {
34 namespace {
35 
36 //------------------------------------------------------------------------------
37 // DcPredFuncs_NEON
38 
39 using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
40                                  const bool use_ref_1, const void* ref_1,
41                                  const int ref_1_size_log2);
42 using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
43 
44 // DC intra-predictors for square blocks.
45 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
46           DcStoreFunc storefn>
47 struct DcPredFuncs_NEON {
48   DcPredFuncs_NEON() = delete;
49 
50   static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
51                     const void* left_column);
52   static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
53                      const void* left_column);
54   static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
55                  const void* left_column);
56 };
57 
58 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
59           DcStoreFunc storefn>
60 void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
DcTop(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void *)61     DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
62           const void* LIBGAV1_RESTRICT const top_row,
63           const void* /*left_column*/) {
64   const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
65   const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
66   storefn(dest, stride, dc);
67 }
68 
69 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
70           DcStoreFunc storefn>
71 void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
DcLeft(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)72     DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
73            const void* /*top_row*/,
74            const void* LIBGAV1_RESTRICT const left_column) {
75   const uint32x2_t sum =
76       sumfn(left_column, block_height_log2, false, nullptr, 0);
77   const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
78   storefn(dest, stride, dc);
79 }
80 
81 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
82           DcStoreFunc storefn>
Dc(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)83 void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
84     void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
85     const void* LIBGAV1_RESTRICT const top_row,
86     const void* LIBGAV1_RESTRICT const left_column) {
87   const uint32x2_t sum =
88       sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
89   if (block_width_log2 == block_height_log2) {
90     const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
91     storefn(dest, stride, dc);
92   } else {
93     // TODO(johannkoenig): Compare this to mul/shift in vectors.
94     const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
95     uint32_t dc = vget_lane_u32(sum, 0);
96     dc += divisor >> 1;
97     dc /= divisor;
98     storefn(dest, stride, vdup_n_u32(dc));
99   }
100 }
101 
102 // Sum all the elements in the vector into the low 32 bits.
Sum(const uint16x4_t val)103 inline uint32x2_t Sum(const uint16x4_t val) {
104   const uint32x2_t sum = vpaddl_u16(val);
105   return vpadd_u32(sum, sum);
106 }
107 
108 // Sum all the elements in the vector into the low 32 bits.
Sum(const uint16x8_t val)109 inline uint32x2_t Sum(const uint16x8_t val) {
110   const uint32x4_t sum_0 = vpaddlq_u16(val);
111   const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
112   return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
113                   vget_high_u32(vreinterpretq_u32_u64(sum_1)));
114 }
115 
116 }  // namespace
117 
118 //------------------------------------------------------------------------------
119 namespace low_bitdepth {
120 namespace {
121 
122 // Add and expand the elements in the |val_[01]| to uint16_t but do not sum the
123 // entire vector.
Add(const uint8x16_t val_0,const uint8x16_t val_1)124 inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
125   const uint16x8_t sum_0 = vpaddlq_u8(val_0);
126   const uint16x8_t sum_1 = vpaddlq_u8(val_1);
127   return vaddq_u16(sum_0, sum_1);
128 }
129 
130 // Add and expand the elements in the |val_[0123]| to uint16_t but do not sum
131 // the entire vector.
Add(const uint8x16_t val_0,const uint8x16_t val_1,const uint8x16_t val_2,const uint8x16_t val_3)132 inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
133                       const uint8x16_t val_2, const uint8x16_t val_3) {
134   const uint16x8_t sum_0 = Add(val_0, val_1);
135   const uint16x8_t sum_1 = Add(val_2, val_3);
136   return vaddq_u16(sum_0, sum_1);
137 }
138 
139 // Load and combine 32 uint8_t values.
LoadAndAdd32(const uint8_t * buf)140 inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
141   const uint8x16_t val_0 = vld1q_u8(buf);
142   const uint8x16_t val_1 = vld1q_u8(buf + 16);
143   return Add(val_0, val_1);
144 }
145 
146 // Load and combine 64 uint8_t values.
LoadAndAdd64(const uint8_t * buf)147 inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
148   const uint8x16_t val_0 = vld1q_u8(buf);
149   const uint8x16_t val_1 = vld1q_u8(buf + 16);
150   const uint8x16_t val_2 = vld1q_u8(buf + 32);
151   const uint8x16_t val_3 = vld1q_u8(buf + 48);
152   return Add(val_0, val_1, val_2, val_3);
153 }
154 
155 // |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint8_t values.
156 // If |use_ref_1| is false then only sum |ref_0|.
157 // For |ref[01]_size_log2| == 4 this relies on |ref_[01]| being aligned to
158 // uint32_t.
DcSum_NEON(const void * LIBGAV1_RESTRICT ref_0,const int ref_0_size_log2,const bool use_ref_1,const void * LIBGAV1_RESTRICT ref_1,const int ref_1_size_log2)159 inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
160                              const int ref_0_size_log2, const bool use_ref_1,
161                              const void* LIBGAV1_RESTRICT ref_1,
162                              const int ref_1_size_log2) {
163   const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
164   const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
165   if (ref_0_size_log2 == 2) {
166     uint8x8_t val = Load4(ref_0_u8);
167     if (use_ref_1) {
168       switch (ref_1_size_log2) {
169         case 2: {  // 4x4
170           val = Load4<1>(ref_1_u8, val);
171           return Sum(vpaddl_u8(val));
172         }
173         case 3: {  // 4x8
174           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
175           const uint16x4_t sum_0 = vpaddl_u8(val);
176           const uint16x4_t sum_1 = vpaddl_u8(val_1);
177           return Sum(vadd_u16(sum_0, sum_1));
178         }
179         case 4: {  // 4x16
180           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
181           return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
182         }
183       }
184     }
185     // 4x1
186     const uint16x4_t sum = vpaddl_u8(val);
187     return vpaddl_u16(sum);
188   }
189   if (ref_0_size_log2 == 3) {
190     const uint8x8_t val_0 = vld1_u8(ref_0_u8);
191     if (use_ref_1) {
192       switch (ref_1_size_log2) {
193         case 2: {  // 8x4
194           const uint8x8_t val_1 = Load4(ref_1_u8);
195           const uint16x4_t sum_0 = vpaddl_u8(val_0);
196           const uint16x4_t sum_1 = vpaddl_u8(val_1);
197           return Sum(vadd_u16(sum_0, sum_1));
198         }
199         case 3: {  // 8x8
200           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
201           const uint16x4_t sum_0 = vpaddl_u8(val_0);
202           const uint16x4_t sum_1 = vpaddl_u8(val_1);
203           return Sum(vadd_u16(sum_0, sum_1));
204         }
205         case 4: {  // 8x16
206           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
207           return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
208         }
209         case 5: {  // 8x32
210           return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
211         }
212       }
213     }
214     // 8x1
215     return Sum(vpaddl_u8(val_0));
216   }
217   if (ref_0_size_log2 == 4) {
218     const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
219     if (use_ref_1) {
220       switch (ref_1_size_log2) {
221         case 2: {  // 16x4
222           const uint8x8_t val_1 = Load4(ref_1_u8);
223           return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
224         }
225         case 3: {  // 16x8
226           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
227           return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
228         }
229         case 4: {  // 16x16
230           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
231           return Sum(Add(val_0, val_1));
232         }
233         case 5: {  // 16x32
234           const uint16x8_t sum_0 = vpaddlq_u8(val_0);
235           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
236           return Sum(vaddq_u16(sum_0, sum_1));
237         }
238         case 6: {  // 16x64
239           const uint16x8_t sum_0 = vpaddlq_u8(val_0);
240           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
241           return Sum(vaddq_u16(sum_0, sum_1));
242         }
243       }
244     }
245     // 16x1
246     return Sum(vpaddlq_u8(val_0));
247   }
248   if (ref_0_size_log2 == 5) {
249     const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
250     if (use_ref_1) {
251       switch (ref_1_size_log2) {
252         case 3: {  // 32x8
253           const uint8x8_t val_1 = vld1_u8(ref_1_u8);
254           return Sum(vaddw_u8(sum_0, val_1));
255         }
256         case 4: {  // 32x16
257           const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
258           const uint16x8_t sum_1 = vpaddlq_u8(val_1);
259           return Sum(vaddq_u16(sum_0, sum_1));
260         }
261         case 5: {  // 32x32
262           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
263           return Sum(vaddq_u16(sum_0, sum_1));
264         }
265         case 6: {  // 32x64
266           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
267           return Sum(vaddq_u16(sum_0, sum_1));
268         }
269       }
270     }
271     // 32x1
272     return Sum(sum_0);
273   }
274 
275   assert(ref_0_size_log2 == 6);
276   const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
277   if (use_ref_1) {
278     switch (ref_1_size_log2) {
279       case 4: {  // 64x16
280         const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
281         const uint16x8_t sum_1 = vpaddlq_u8(val_1);
282         return Sum(vaddq_u16(sum_0, sum_1));
283       }
284       case 5: {  // 64x32
285         const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
286         return Sum(vaddq_u16(sum_0, sum_1));
287       }
288       case 6: {  // 64x64
289         const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
290         return Sum(vaddq_u16(sum_0, sum_1));
291       }
292     }
293   }
294   // 64x1
295   return Sum(sum_0);
296 }
297 
298 template <int width, int height>
DcStore_NEON(void * const dest,ptrdiff_t stride,const uint32x2_t dc)299 inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
300                          const uint32x2_t dc) {
301   const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
302   auto* dst = static_cast<uint8_t*>(dest);
303   if (width == 4) {
304     int i = height - 1;
305     do {
306       StoreLo4(dst, vget_low_u8(dc_dup));
307       dst += stride;
308     } while (--i != 0);
309     StoreLo4(dst, vget_low_u8(dc_dup));
310   } else if (width == 8) {
311     int i = height - 1;
312     do {
313       vst1_u8(dst, vget_low_u8(dc_dup));
314       dst += stride;
315     } while (--i != 0);
316     vst1_u8(dst, vget_low_u8(dc_dup));
317   } else if (width == 16) {
318     int i = height - 1;
319     do {
320       vst1q_u8(dst, dc_dup);
321       dst += stride;
322     } while (--i != 0);
323     vst1q_u8(dst, dc_dup);
324   } else if (width == 32) {
325     int i = height - 1;
326     do {
327       vst1q_u8(dst, dc_dup);
328       vst1q_u8(dst + 16, dc_dup);
329       dst += stride;
330     } while (--i != 0);
331     vst1q_u8(dst, dc_dup);
332     vst1q_u8(dst + 16, dc_dup);
333   } else {
334     assert(width == 64);
335     int i = height - 1;
336     do {
337       vst1q_u8(dst, dc_dup);
338       vst1q_u8(dst + 16, dc_dup);
339       vst1q_u8(dst + 32, dc_dup);
340       vst1q_u8(dst + 48, dc_dup);
341       dst += stride;
342     } while (--i != 0);
343     vst1q_u8(dst, dc_dup);
344     vst1q_u8(dst + 16, dc_dup);
345     vst1q_u8(dst + 32, dc_dup);
346     vst1q_u8(dst + 48, dc_dup);
347   }
348 }
349 
350 template <int width, int height>
Paeth4Or8xN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)351 inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
352                              ptrdiff_t stride,
353                              const void* LIBGAV1_RESTRICT const top_row,
354                              const void* LIBGAV1_RESTRICT const left_column) {
355   auto* dest_u8 = static_cast<uint8_t*>(dest);
356   const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
357   const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
358 
359   const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
360   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
361   uint8x8_t top;
362   if (width == 4) {
363     top = Load4(top_row_u8);
364   } else {  // width == 8
365     top = vld1_u8(top_row_u8);
366   }
367 
368   for (int y = 0; y < height; ++y) {
369     const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
370 
371     const uint8x8_t left_dist = vabd_u8(top, top_left);
372     const uint8x8_t top_dist = vabd_u8(left, top_left);
373     const uint16x8_t top_left_dist =
374         vabdq_u16(vaddl_u8(top, left), top_left_x2);
375 
376     const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
377     const uint8x8_t left_le_top_left =
378         vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
379     const uint8x8_t top_le_top_left =
380         vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
381 
382     // if (left_dist <= top_dist && left_dist <= top_left_dist)
383     const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
384     //   dest[x] = left_column[y];
385     // Fill all the unused spaces with 'top'. They will be overwritten when
386     // the positions for top_left are known.
387     uint8x8_t result = vbsl_u8(left_mask, left, top);
388     // else if (top_dist <= top_left_dist)
389     //   dest[x] = top_row[x];
390     // Add these values to the mask. They were already set.
391     const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
392     // else
393     //   dest[x] = top_left;
394     result = vbsl_u8(left_or_top_mask, result, top_left);
395 
396     if (width == 4) {
397       StoreLo4(dest_u8, result);
398     } else {  // width == 8
399       vst1_u8(dest_u8, result);
400     }
401     dest_u8 += stride;
402   }
403 }
404 
405 // Calculate X distance <= TopLeft distance and pack the resulting mask into
406 // uint8x8_t.
XLeTopLeft(const uint8x16_t x_dist,const uint16x8_t top_left_dist_low,const uint16x8_t top_left_dist_high)407 inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
408                              const uint16x8_t top_left_dist_low,
409                              const uint16x8_t top_left_dist_high) {
410   const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
411                                                vqmovn_u16(top_left_dist_high));
412   return vcleq_u8(x_dist, top_left_dist);
413 }
414 
415 // Select the closest values and collect them.
SelectPaeth(const uint8x16_t top,const uint8x16_t left,const uint8x16_t top_left,const uint8x16_t left_le_top,const uint8x16_t left_le_top_left,const uint8x16_t top_le_top_left)416 inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
417                               const uint8x16_t top_left,
418                               const uint8x16_t left_le_top,
419                               const uint8x16_t left_le_top_left,
420                               const uint8x16_t top_le_top_left) {
421   // if (left_dist <= top_dist && left_dist <= top_left_dist)
422   const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
423   //   dest[x] = left_column[y];
424   // Fill all the unused spaces with 'top'. They will be overwritten when
425   // the positions for top_left are known.
426   uint8x16_t result = vbslq_u8(left_mask, left, top);
427   // else if (top_dist <= top_left_dist)
428   //   dest[x] = top_row[x];
429   // Add these values to the mask. They were already set.
430   const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
431   // else
432   //   dest[x] = top_left;
433   return vbslq_u8(left_or_top_mask, result, top_left);
434 }
435 
// Declares top_left_<idx>_dist_low and top_left_<idx>_dist_high: the 16-bit
// absolute differences between (top[idx] + left) and top_left_x2 for the low
// and high halves of top[idx]. Expects |top|, |left| and |top_left_x2| to be
// visible where the macro is expanded.
#define TOP_LEFT_DIST(idx)                                              \
  const uint16x8_t top_left_##idx##_dist_low = vabdq_u16(               \
      vaddl_u8(vget_low_u8(top[idx]), vget_low_u8(left)), top_left_x2); \
  const uint16x8_t top_left_##idx##_dist_high = vabdq_u16(              \
      vaddl_u8(vget_high_u8(top[idx]), vget_low_u8(left)), top_left_x2)

// Declares left_le_top_left_<idx> by calling XLeTopLeft with x = left.
// Expects left_<idx>_dist and the TOP_LEFT_DIST(idx) results to be in scope.
#define LEFT_LE_TOP_LEFT(idx)                                  \
  const uint8x16_t left_le_top_left_##idx =                    \
      XLeTopLeft(left_##idx##_dist, top_left_##idx##_dist_low, \
                 top_left_##idx##_dist_high)

// Declares top_le_top_left_<idx> by calling XLeTopLeft with x = top.
// Expects |top_dist| and the TOP_LEFT_DIST(idx) results to be in scope.
#define TOP_LE_TOP_LEFT(idx)                           \
  const uint8x16_t top_le_top_left_##idx = XLeTopLeft( \
      top_dist, top_left_##idx##_dist_low, top_left_##idx##_dist_high)
453 
454 template <int width, int height>
Paeth16PlusxN_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * LIBGAV1_RESTRICT const left_column)455 inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest,
456                                ptrdiff_t stride,
457                                const void* LIBGAV1_RESTRICT const top_row,
458                                const void* LIBGAV1_RESTRICT const left_column) {
459   auto* dest_u8 = static_cast<uint8_t*>(dest);
460   const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
461   const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
462 
463   const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
464   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
465   uint8x16_t top[4];
466   top[0] = vld1q_u8(top_row_u8);
467   if (width > 16) {
468     top[1] = vld1q_u8(top_row_u8 + 16);
469     if (width == 64) {
470       top[2] = vld1q_u8(top_row_u8 + 32);
471       top[3] = vld1q_u8(top_row_u8 + 48);
472     }
473   }
474 
475   for (int y = 0; y < height; ++y) {
476     const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
477 
478     const uint8x16_t top_dist = vabdq_u8(left, top_left);
479 
480     const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
481     TOP_LEFT_DIST(0);
482     const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
483     LEFT_LE_TOP_LEFT(0);
484     TOP_LE_TOP_LEFT(0);
485 
486     const uint8x16_t result_0 =
487         SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
488                     top_le_top_left_0);
489     vst1q_u8(dest_u8, result_0);
490 
491     if (width > 16) {
492       const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
493       TOP_LEFT_DIST(1);
494       const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
495       LEFT_LE_TOP_LEFT(1);
496       TOP_LE_TOP_LEFT(1);
497 
498       const uint8x16_t result_1 =
499           SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1,
500                       top_le_top_left_1);
501       vst1q_u8(dest_u8 + 16, result_1);
502 
503       if (width == 64) {
504         const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
505         TOP_LEFT_DIST(2);
506         const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
507         LEFT_LE_TOP_LEFT(2);
508         TOP_LE_TOP_LEFT(2);
509 
510         const uint8x16_t result_2 =
511             SelectPaeth(top[2], left, top_left, left_2_le_top,
512                         left_le_top_left_2, top_le_top_left_2);
513         vst1q_u8(dest_u8 + 32, result_2);
514 
515         const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
516         TOP_LEFT_DIST(3);
517         const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
518         LEFT_LE_TOP_LEFT(3);
519         TOP_LE_TOP_LEFT(3);
520 
521         const uint8x16_t result_3 =
522             SelectPaeth(top[3], left, top_left, left_3_le_top,
523                         left_le_top_left_3, top_le_top_left_3);
524         vst1q_u8(dest_u8 + 48, result_3);
525       }
526     }
527 
528     dest_u8 += stride;
529   }
530 }
531 
532 struct DcDefs {
533   DcDefs() = delete;
534 
535   using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
536   using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
537   using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
538   using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
539   using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
540   using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
541   using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
542   using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
543   using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
544   using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
545   using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
546   using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
547   using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
548   using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
549   using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
550   using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
551   using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
552   using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
553   using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
554 };
555 
Init8bpp()556 void Init8bpp() {
557   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
558   assert(dsp != nullptr);
559   // 4x4
560   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
561       DcDefs::_4x4::DcTop;
562   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
563       DcDefs::_4x4::DcLeft;
564   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
565       DcDefs::_4x4::Dc;
566   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
567       Paeth4Or8xN_NEON<4, 4>;
568 
569   // 4x8
570   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
571       DcDefs::_4x8::DcTop;
572   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
573       DcDefs::_4x8::DcLeft;
574   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
575       DcDefs::_4x8::Dc;
576   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
577       Paeth4Or8xN_NEON<4, 8>;
578 
579   // 4x16
580   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
581       DcDefs::_4x16::DcTop;
582   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
583       DcDefs::_4x16::DcLeft;
584   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
585       DcDefs::_4x16::Dc;
586   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
587       Paeth4Or8xN_NEON<4, 16>;
588 
589   // 8x4
590   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
591       DcDefs::_8x4::DcTop;
592   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
593       DcDefs::_8x4::DcLeft;
594   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
595       DcDefs::_8x4::Dc;
596   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
597       Paeth4Or8xN_NEON<8, 4>;
598 
599   // 8x8
600   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
601       DcDefs::_8x8::DcTop;
602   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
603       DcDefs::_8x8::DcLeft;
604   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
605       DcDefs::_8x8::Dc;
606   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
607       Paeth4Or8xN_NEON<8, 8>;
608 
609   // 8x16
610   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
611       DcDefs::_8x16::DcTop;
612   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
613       DcDefs::_8x16::DcLeft;
614   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
615       DcDefs::_8x16::Dc;
616   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
617       Paeth4Or8xN_NEON<8, 16>;
618 
619   // 8x32
620   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
621       DcDefs::_8x32::DcTop;
622   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
623       DcDefs::_8x32::DcLeft;
624   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
625       DcDefs::_8x32::Dc;
626   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
627       Paeth4Or8xN_NEON<8, 32>;
628 
629   // 16x4
630   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
631       DcDefs::_16x4::DcTop;
632   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
633       DcDefs::_16x4::DcLeft;
634   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
635       DcDefs::_16x4::Dc;
636   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
637       Paeth16PlusxN_NEON<16, 4>;
638 
639   // 16x8
640   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
641       DcDefs::_16x8::DcTop;
642   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
643       DcDefs::_16x8::DcLeft;
644   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
645       DcDefs::_16x8::Dc;
646   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
647       Paeth16PlusxN_NEON<16, 8>;
648 
649   // 16x16
650   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
651       DcDefs::_16x16::DcTop;
652   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
653       DcDefs::_16x16::DcLeft;
654   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
655       DcDefs::_16x16::Dc;
656   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
657       Paeth16PlusxN_NEON<16, 16>;
658 
659   // 16x32
660   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
661       DcDefs::_16x32::DcTop;
662   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
663       DcDefs::_16x32::DcLeft;
664   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
665       DcDefs::_16x32::Dc;
666   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
667       Paeth16PlusxN_NEON<16, 32>;
668 
669   // 16x64
670   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
671       DcDefs::_16x64::DcTop;
672   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
673       DcDefs::_16x64::DcLeft;
674   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
675       DcDefs::_16x64::Dc;
676   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
677       Paeth16PlusxN_NEON<16, 64>;
678 
679   // 32x8
680   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
681       DcDefs::_32x8::DcTop;
682   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
683       DcDefs::_32x8::DcLeft;
684   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
685       DcDefs::_32x8::Dc;
686   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
687       Paeth16PlusxN_NEON<32, 8>;
688 
689   // 32x16
690   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
691       DcDefs::_32x16::DcTop;
692   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
693       DcDefs::_32x16::DcLeft;
694   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
695       DcDefs::_32x16::Dc;
696   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
697       Paeth16PlusxN_NEON<32, 16>;
698 
699   // 32x32
700   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
701       DcDefs::_32x32::DcTop;
702   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
703       DcDefs::_32x32::DcLeft;
704   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
705       DcDefs::_32x32::Dc;
706   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
707       Paeth16PlusxN_NEON<32, 32>;
708 
709   // 32x64
710   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
711       DcDefs::_32x64::DcTop;
712   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
713       DcDefs::_32x64::DcLeft;
714   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
715       DcDefs::_32x64::Dc;
716   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
717       Paeth16PlusxN_NEON<32, 64>;
718 
719   // 64x16
720   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
721       DcDefs::_64x16::DcTop;
722   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
723       DcDefs::_64x16::DcLeft;
724   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
725       DcDefs::_64x16::Dc;
726   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
727       Paeth16PlusxN_NEON<64, 16>;
728 
729   // 64x32
730   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
731       DcDefs::_64x32::DcTop;
732   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
733       DcDefs::_64x32::DcLeft;
734   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
735       DcDefs::_64x32::Dc;
736   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
737       Paeth16PlusxN_NEON<64, 32>;
738 
739   // 64x64
740   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
741       DcDefs::_64x64::DcTop;
742   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
743       DcDefs::_64x64::DcLeft;
744   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
745       DcDefs::_64x64::Dc;
746   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
747       Paeth16PlusxN_NEON<64, 64>;
748 }
749 
750 }  // namespace
751 }  // namespace low_bitdepth
752 
753 //------------------------------------------------------------------------------
754 #if LIBGAV1_MAX_BITDEPTH >= 10
755 namespace high_bitdepth {
756 namespace {
757 
758 // Add the elements in the given vectors together but do not sum the entire
759 // vector.
Add(const uint16x8_t val_0,const uint16x8_t val_1,const uint16x8_t val_2,const uint16x8_t val_3)760 inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
761                       const uint16x8_t val_2, const uint16x8_t val_3) {
762   const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
763   const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
764   return vaddq_u16(sum_0, sum_1);
765 }
766 
767 // Load and combine 16 uint16_t values.
LoadAndAdd16(const uint16_t * buf)768 inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
769   const uint16x8_t val_0 = vld1q_u16(buf);
770   const uint16x8_t val_1 = vld1q_u16(buf + 8);
771   return vaddq_u16(val_0, val_1);
772 }
773 
774 // Load and combine 32 uint16_t values.
LoadAndAdd32(const uint16_t * buf)775 inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
776   const uint16x8_t val_0 = vld1q_u16(buf);
777   const uint16x8_t val_1 = vld1q_u16(buf + 8);
778   const uint16x8_t val_2 = vld1q_u16(buf + 16);
779   const uint16x8_t val_3 = vld1q_u16(buf + 24);
780   return Add(val_0, val_1, val_2, val_3);
781 }
782 
783 // Load and combine 64 uint16_t values.
LoadAndAdd64(const uint16_t * buf)784 inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
785   const uint16x8_t val_0 = vld1q_u16(buf);
786   const uint16x8_t val_1 = vld1q_u16(buf + 8);
787   const uint16x8_t val_2 = vld1q_u16(buf + 16);
788   const uint16x8_t val_3 = vld1q_u16(buf + 24);
789   const uint16x8_t val_4 = vld1q_u16(buf + 32);
790   const uint16x8_t val_5 = vld1q_u16(buf + 40);
791   const uint16x8_t val_6 = vld1q_u16(buf + 48);
792   const uint16x8_t val_7 = vld1q_u16(buf + 56);
793   const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
794   const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
795   return vaddq_u16(sum_0, sum_1);
796 }
797 
798 // |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values.
799 // If |use_ref_1| is false then only sum |ref_0|.
DcSum_NEON(const void * LIBGAV1_RESTRICT ref_0,const int ref_0_size_log2,const bool use_ref_1,const void * LIBGAV1_RESTRICT ref_1,const int ref_1_size_log2)800 inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
801                              const int ref_0_size_log2, const bool use_ref_1,
802                              const void* LIBGAV1_RESTRICT ref_1,
803                              const int ref_1_size_log2) {
804   const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
805   const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
806   if (ref_0_size_log2 == 2) {
807     const uint16x4_t val_0 = vld1_u16(ref_0_u16);
808     if (use_ref_1) {
809       switch (ref_1_size_log2) {
810         case 2: {  // 4x4
811           const uint16x4_t val_1 = vld1_u16(ref_1_u16);
812           return Sum(vadd_u16(val_0, val_1));
813         }
814         case 3: {  // 4x8
815           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
816           const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
817           return Sum(vaddq_u16(sum_0, val_1));
818         }
819         case 4: {  // 4x16
820           const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
821           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
822           return Sum(vaddq_u16(sum_0, sum_1));
823         }
824       }
825     }
826     // 4x1
827     return Sum(val_0);
828   }
829   if (ref_0_size_log2 == 3) {
830     const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
831     if (use_ref_1) {
832       switch (ref_1_size_log2) {
833         case 2: {  // 8x4
834           const uint16x4_t val_1 = vld1_u16(ref_1_u16);
835           const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
836           return Sum(vaddq_u16(val_0, sum_1));
837         }
838         case 3: {  // 8x8
839           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
840           return Sum(vaddq_u16(val_0, val_1));
841         }
842         case 4: {  // 8x16
843           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
844           return Sum(vaddq_u16(val_0, sum_1));
845         }
846         case 5: {  // 8x32
847           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
848           return Sum(vaddq_u16(val_0, sum_1));
849         }
850       }
851     }
852     // 8x1
853     return Sum(val_0);
854   }
855   if (ref_0_size_log2 == 4) {
856     const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
857     if (use_ref_1) {
858       switch (ref_1_size_log2) {
859         case 2: {  // 16x4
860           const uint16x4_t val_1 = vld1_u16(ref_1_u16);
861           const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
862           return Sum(vaddq_u16(sum_0, sum_1));
863         }
864         case 3: {  // 16x8
865           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
866           return Sum(vaddq_u16(sum_0, val_1));
867         }
868         case 4: {  // 16x16
869           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
870           return Sum(vaddq_u16(sum_0, sum_1));
871         }
872         case 5: {  // 16x32
873           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
874           return Sum(vaddq_u16(sum_0, sum_1));
875         }
876         case 6: {  // 16x64
877           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
878           return Sum(vaddq_u16(sum_0, sum_1));
879         }
880       }
881     }
882     // 16x1
883     return Sum(sum_0);
884   }
885   if (ref_0_size_log2 == 5) {
886     const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
887     if (use_ref_1) {
888       switch (ref_1_size_log2) {
889         case 3: {  // 32x8
890           const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
891           return Sum(vaddq_u16(sum_0, val_1));
892         }
893         case 4: {  // 32x16
894           const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
895           return Sum(vaddq_u16(sum_0, sum_1));
896         }
897         case 5: {  // 32x32
898           const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
899           return Sum(vaddq_u16(sum_0, sum_1));
900         }
901         case 6: {  // 32x64
902           const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
903           return Sum(vaddq_u16(sum_0, sum_1));
904         }
905       }
906     }
907     // 32x1
908     return Sum(sum_0);
909   }
910 
911   assert(ref_0_size_log2 == 6);
912   const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
913   if (use_ref_1) {
914     switch (ref_1_size_log2) {
915       case 4: {  // 64x16
916         const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
917         return Sum(vaddq_u16(sum_0, sum_1));
918       }
919       case 5: {  // 64x32
920         const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
921         return Sum(vaddq_u16(sum_0, sum_1));
922       }
923       case 6: {  // 64x64
924         const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
925         return Sum(vaddq_u16(sum_0, sum_1));
926       }
927     }
928   }
929   // 64x1
930   return Sum(sum_0);
931 }
932 
933 template <int width, int height>
DcStore_NEON(void * const dest,ptrdiff_t stride,const uint32x2_t dc)934 inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
935                          const uint32x2_t dc) {
936   auto* dest_u16 = static_cast<uint16_t*>(dest);
937   ptrdiff_t stride_u16 = stride >> 1;
938   const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
939   if (width == 4) {
940     int i = height - 1;
941     do {
942       vst1_u16(dest_u16, vget_low_u16(dc_dup));
943       dest_u16 += stride_u16;
944     } while (--i != 0);
945     vst1_u16(dest_u16, vget_low_u16(dc_dup));
946   } else if (width == 8) {
947     int i = height - 1;
948     do {
949       vst1q_u16(dest_u16, dc_dup);
950       dest_u16 += stride_u16;
951     } while (--i != 0);
952     vst1q_u16(dest_u16, dc_dup);
953   } else if (width == 16) {
954     int i = height - 1;
955     do {
956       vst1q_u16(dest_u16, dc_dup);
957       vst1q_u16(dest_u16 + 8, dc_dup);
958       dest_u16 += stride_u16;
959     } while (--i != 0);
960     vst1q_u16(dest_u16, dc_dup);
961     vst1q_u16(dest_u16 + 8, dc_dup);
962   } else if (width == 32) {
963     int i = height - 1;
964     do {
965       vst1q_u16(dest_u16, dc_dup);
966       vst1q_u16(dest_u16 + 8, dc_dup);
967       vst1q_u16(dest_u16 + 16, dc_dup);
968       vst1q_u16(dest_u16 + 24, dc_dup);
969       dest_u16 += stride_u16;
970     } while (--i != 0);
971     vst1q_u16(dest_u16, dc_dup);
972     vst1q_u16(dest_u16 + 8, dc_dup);
973     vst1q_u16(dest_u16 + 16, dc_dup);
974     vst1q_u16(dest_u16 + 24, dc_dup);
975   } else {
976     assert(width == 64);
977     int i = height - 1;
978     do {
979       vst1q_u16(dest_u16, dc_dup);
980       vst1q_u16(dest_u16 + 8, dc_dup);
981       vst1q_u16(dest_u16 + 16, dc_dup);
982       vst1q_u16(dest_u16 + 24, dc_dup);
983       vst1q_u16(dest_u16 + 32, dc_dup);
984       vst1q_u16(dest_u16 + 40, dc_dup);
985       vst1q_u16(dest_u16 + 48, dc_dup);
986       vst1q_u16(dest_u16 + 56, dc_dup);
987       dest_u16 += stride_u16;
988     } while (--i != 0);
989     vst1q_u16(dest_u16, dc_dup);
990     vst1q_u16(dest_u16 + 8, dc_dup);
991     vst1q_u16(dest_u16 + 16, dc_dup);
992     vst1q_u16(dest_u16 + 24, dc_dup);
993     vst1q_u16(dest_u16 + 32, dc_dup);
994     vst1q_u16(dest_u16 + 40, dc_dup);
995     vst1q_u16(dest_u16 + 48, dc_dup);
996     vst1q_u16(dest_u16 + 56, dc_dup);
997   }
998 }
999 
1000 struct DcDefs {
1001   DcDefs() = delete;
1002 
1003   using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
1004   using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
1005   using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
1006   using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
1007   using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
1008   using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
1009   using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
1010   using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
1011   using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
1012   using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
1013   using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
1014   using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
1015   using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
1016   using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
1017   using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
1018   using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
1019   using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
1020   using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
1021   using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
1022 };
1023 
1024 // IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
1025 
1026 template <int block_height>
Horizontal4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1027 void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1028                         const void* /*top_row*/,
1029                         const void* LIBGAV1_RESTRICT const left_column) {
1030   const auto* const left = static_cast<const uint16_t*>(left_column);
1031   auto* dst = static_cast<uint8_t*>(dest);
1032   int y = 0;
1033   do {
1034     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1035     const uint16x4_t row = vld1_dup_u16(left + y);
1036     vst1_u16(dst16, row);
1037     dst += stride;
1038   } while (++y < block_height);
1039 }
1040 
1041 template <int block_height>
Horizontal8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1042 void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1043                         const void* /*top_row*/,
1044                         const void* LIBGAV1_RESTRICT const left_column) {
1045   const auto* const left = static_cast<const uint16_t*>(left_column);
1046   auto* dst = static_cast<uint8_t*>(dest);
1047   int y = 0;
1048   do {
1049     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1050     const uint16x8_t row = vld1q_dup_u16(left + y);
1051     vst1q_u16(dst16, row);
1052     dst += stride;
1053   } while (++y < block_height);
1054 }
1055 
1056 template <int block_height>
Horizontal16xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1057 void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1058                          const void* /*top_row*/,
1059                          const void* LIBGAV1_RESTRICT const left_column) {
1060   const auto* const left = static_cast<const uint16_t*>(left_column);
1061   auto* dst = static_cast<uint8_t*>(dest);
1062   int y = 0;
1063   do {
1064     const uint16x8_t row0 = vld1q_dup_u16(left + y);
1065     const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
1066     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1067     vst1q_u16(dst16, row0);
1068     vst1q_u16(dst16 + 8, row0);
1069     dst += stride;
1070     dst16 = reinterpret_cast<uint16_t*>(dst);
1071     vst1q_u16(dst16, row1);
1072     vst1q_u16(dst16 + 8, row1);
1073     dst += stride;
1074     y += 2;
1075   } while (y < block_height);
1076 }
1077 
1078 template <int block_height>
Horizontal32xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void *,const void * LIBGAV1_RESTRICT const left_column)1079 void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1080                          const void* /*top_row*/,
1081                          const void* LIBGAV1_RESTRICT const left_column) {
1082   const auto* const left = static_cast<const uint16_t*>(left_column);
1083   auto* dst = static_cast<uint8_t*>(dest);
1084   int y = 0;
1085   do {
1086     const uint16x8_t row0 = vld1q_dup_u16(left + y);
1087     const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
1088     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1089     vst1q_u16(dst16, row0);
1090     vst1q_u16(dst16 + 8, row0);
1091     vst1q_u16(dst16 + 16, row0);
1092     vst1q_u16(dst16 + 24, row0);
1093     dst += stride;
1094     dst16 = reinterpret_cast<uint16_t*>(dst);
1095     vst1q_u16(dst16, row1);
1096     vst1q_u16(dst16 + 8, row1);
1097     vst1q_u16(dst16 + 16, row1);
1098     vst1q_u16(dst16 + 24, row1);
1099     dst += stride;
1100     y += 2;
1101   } while (y < block_height);
1102 }
1103 
1104 // IntraPredFuncs_NEON::Vertical -- copy top row to all rows
1105 
1106 template <int block_height>
Vertical4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1107 void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1108                       const void* LIBGAV1_RESTRICT const top_row,
1109                       const void* const /*left_column*/) {
1110   const auto* const top = static_cast<const uint8_t*>(top_row);
1111   auto* dst = static_cast<uint8_t*>(dest);
1112   const uint8x8_t row = vld1_u8(top);
1113   int y = block_height;
1114   do {
1115     vst1_u8(dst, row);
1116     dst += stride;
1117   } while (--y != 0);
1118 }
1119 
1120 template <int block_height>
Vertical8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1121 void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1122                       const void* LIBGAV1_RESTRICT const top_row,
1123                       const void* const /*left_column*/) {
1124   const auto* const top = static_cast<const uint8_t*>(top_row);
1125   auto* dst = static_cast<uint8_t*>(dest);
1126   const uint8x16_t row = vld1q_u8(top);
1127   int y = block_height;
1128   do {
1129     vst1q_u8(dst, row);
1130     dst += stride;
1131   } while (--y != 0);
1132 }
1133 
1134 template <int block_height>
Vertical16xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1135 void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1136                        const void* LIBGAV1_RESTRICT const top_row,
1137                        const void* const /*left_column*/) {
1138   const auto* const top = static_cast<const uint8_t*>(top_row);
1139   auto* dst = static_cast<uint8_t*>(dest);
1140   const uint8x16_t row0 = vld1q_u8(top);
1141   const uint8x16_t row1 = vld1q_u8(top + 16);
1142   int y = block_height;
1143   do {
1144     vst1q_u8(dst, row0);
1145     vst1q_u8(dst + 16, row1);
1146     dst += stride;
1147     vst1q_u8(dst, row0);
1148     vst1q_u8(dst + 16, row1);
1149     dst += stride;
1150     y -= 2;
1151   } while (y != 0);
1152 }
1153 
1154 template <int block_height>
Vertical32xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1155 void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1156                        const void* LIBGAV1_RESTRICT const top_row,
1157                        const void* const /*left_column*/) {
1158   const auto* const top = static_cast<const uint8_t*>(top_row);
1159   auto* dst = static_cast<uint8_t*>(dest);
1160   const uint8x16_t row0 = vld1q_u8(top);
1161   const uint8x16_t row1 = vld1q_u8(top + 16);
1162   const uint8x16_t row2 = vld1q_u8(top + 32);
1163   const uint8x16_t row3 = vld1q_u8(top + 48);
1164   int y = block_height;
1165   do {
1166     vst1q_u8(dst, row0);
1167     vst1q_u8(dst + 16, row1);
1168     vst1q_u8(dst + 32, row2);
1169     vst1q_u8(dst + 48, row3);
1170     dst += stride;
1171     vst1q_u8(dst, row0);
1172     vst1q_u8(dst + 16, row1);
1173     vst1q_u8(dst + 32, row2);
1174     vst1q_u8(dst + 48, row3);
1175     dst += stride;
1176     y -= 2;
1177   } while (y != 0);
1178 }
1179 
1180 template <int block_height>
Vertical64xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_row,const void * const)1181 void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1182                        const void* LIBGAV1_RESTRICT const top_row,
1183                        const void* const /*left_column*/) {
1184   const auto* const top = static_cast<const uint8_t*>(top_row);
1185   auto* dst = static_cast<uint8_t*>(dest);
1186   const uint8x16_t row0 = vld1q_u8(top);
1187   const uint8x16_t row1 = vld1q_u8(top + 16);
1188   const uint8x16_t row2 = vld1q_u8(top + 32);
1189   const uint8x16_t row3 = vld1q_u8(top + 48);
1190   const uint8x16_t row4 = vld1q_u8(top + 64);
1191   const uint8x16_t row5 = vld1q_u8(top + 80);
1192   const uint8x16_t row6 = vld1q_u8(top + 96);
1193   const uint8x16_t row7 = vld1q_u8(top + 112);
1194   int y = block_height;
1195   do {
1196     vst1q_u8(dst, row0);
1197     vst1q_u8(dst + 16, row1);
1198     vst1q_u8(dst + 32, row2);
1199     vst1q_u8(dst + 48, row3);
1200     vst1q_u8(dst + 64, row4);
1201     vst1q_u8(dst + 80, row5);
1202     vst1q_u8(dst + 96, row6);
1203     vst1q_u8(dst + 112, row7);
1204     dst += stride;
1205     vst1q_u8(dst, row0);
1206     vst1q_u8(dst + 16, row1);
1207     vst1q_u8(dst + 32, row2);
1208     vst1q_u8(dst + 48, row3);
1209     vst1q_u8(dst + 64, row4);
1210     vst1q_u8(dst + 80, row5);
1211     vst1q_u8(dst + 96, row6);
1212     vst1q_u8(dst + 112, row7);
1213     dst += stride;
1214     y -= 2;
1215   } while (y != 0);
1216 }
1217 
1218 template <int height>
Paeth4xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_ptr,const void * LIBGAV1_RESTRICT const left_ptr)1219 inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1220                           const void* LIBGAV1_RESTRICT const top_ptr,
1221                           const void* LIBGAV1_RESTRICT const left_ptr) {
1222   auto* dst = static_cast<uint8_t*>(dest);
1223   const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1224   const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1225 
1226   const uint16x4_t top_left = vdup_n_u16(top_row[-1]);
1227   const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1);
1228   const uint16x4_t top = vld1_u16(top_row);
1229 
1230   for (int y = 0; y < height; ++y) {
1231     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1232     const uint16x4_t left = vdup_n_u16(left_col[y]);
1233 
1234     const uint16x4_t left_dist = vabd_u16(top, top_left);
1235     const uint16x4_t top_dist = vabd_u16(left, top_left);
1236     const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2);
1237 
1238     const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist);
1239     const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist);
1240     const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist);
1241 
1242     // if (left_dist <= top_dist && left_dist <= top_left_dist)
1243     const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left);
1244     //   dest[x] = left_column[y];
1245     // Fill all the unused spaces with 'top'. They will be overwritten when
1246     // the positions for top_left are known.
1247     uint16x4_t result = vbsl_u16(left_mask, left, top);
1248     // else if (top_dist <= top_left_dist)
1249     //   dest[x] = top_row[x];
1250     // Add these values to the mask. They were already set.
1251     const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left);
1252     // else
1253     //   dest[x] = top_left;
1254     result = vbsl_u16(left_or_top_mask, result, top_left);
1255 
1256     vst1_u16(dst16, result);
1257     dst += stride;
1258   }
1259 }
1260 
1261 template <int height>
Paeth8xH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_ptr,const void * LIBGAV1_RESTRICT const left_ptr)1262 inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1263                           const void* LIBGAV1_RESTRICT const top_ptr,
1264                           const void* LIBGAV1_RESTRICT const left_ptr) {
1265   auto* dst = static_cast<uint8_t*>(dest);
1266   const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1267   const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1268 
1269   const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
1270   const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
1271   const uint16x8_t top = vld1q_u16(top_row);
1272 
1273   for (int y = 0; y < height; ++y) {
1274     auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1275     const uint16x8_t left = vdupq_n_u16(left_col[y]);
1276 
1277     const uint16x8_t left_dist = vabdq_u16(top, top_left);
1278     const uint16x8_t top_dist = vabdq_u16(left, top_left);
1279     const uint16x8_t top_left_dist =
1280         vabdq_u16(vaddq_u16(top, left), top_left_x2);
1281 
1282     const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
1283     const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
1284     const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
1285 
1286     // if (left_dist <= top_dist && left_dist <= top_left_dist)
1287     const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
1288     //   dest[x] = left_column[y];
1289     // Fill all the unused spaces with 'top'. They will be overwritten when
1290     // the positions for top_left are known.
1291     uint16x8_t result = vbslq_u16(left_mask, left, top);
1292     // else if (top_dist <= top_left_dist)
1293     //   dest[x] = top_row[x];
1294     // Add these values to the mask. They were already set.
1295     const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
1296     // else
1297     //   dest[x] = top_left;
1298     result = vbslq_u16(left_or_top_mask, result, top_left);
1299 
1300     vst1q_u16(dst16, result);
1301     dst += stride;
1302   }
1303 }
1304 
1305 // For 16xH and above.
1306 template <int width, int height>
PaethWxH_NEON(void * LIBGAV1_RESTRICT const dest,ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_ptr,const void * LIBGAV1_RESTRICT const left_ptr)1307 inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1308                           const void* LIBGAV1_RESTRICT const top_ptr,
1309                           const void* LIBGAV1_RESTRICT const left_ptr) {
1310   auto* dst = static_cast<uint8_t*>(dest);
1311   const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1312   const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1313 
1314   const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
1315   const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
1316 
1317   uint16x8_t top[width >> 3];
1318   for (int i = 0; i < width >> 3; ++i) {
1319     top[i] = vld1q_u16(top_row + (i << 3));
1320   }
1321 
1322   for (int y = 0; y < height; ++y) {
1323     auto* dst_x = reinterpret_cast<uint16_t*>(dst);
1324     const uint16x8_t left = vdupq_n_u16(left_col[y]);
1325     const uint16x8_t top_dist = vabdq_u16(left, top_left);
1326 
1327     for (int i = 0; i < (width >> 3); ++i) {
1328       const uint16x8_t left_dist = vabdq_u16(top[i], top_left);
1329       const uint16x8_t top_left_dist =
1330           vabdq_u16(vaddq_u16(top[i], left), top_left_x2);
1331 
1332       const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
1333       const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
1334       const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
1335 
1336       // if (left_dist <= top_dist && left_dist <= top_left_dist)
1337       const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
1338       //   dest[x] = left_column[y];
1339       // Fill all the unused spaces with 'top'. They will be overwritten when
1340       // the positions for top_left are known.
1341       uint16x8_t result = vbslq_u16(left_mask, left, top[i]);
1342       // else if (top_dist <= top_left_dist)
1343       //   dest[x] = top_row[x];
1344       // Add these values to the mask. They were already set.
1345       const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
1346       // else
1347       //   dest[x] = top_left;
1348       result = vbslq_u16(left_or_top_mask, result, top_left);
1349 
1350       vst1q_u16(dst_x, result);
1351       dst_x += 8;
1352     }
1353     dst += stride;
1354   }
1355 }
1356 
Init10bpp()1357 void Init10bpp() {
1358   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
1359   assert(dsp != nullptr);
1360   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
1361       DcDefs::_4x4::DcTop;
1362   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
1363       DcDefs::_4x4::DcLeft;
1364   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
1365       DcDefs::_4x4::Dc;
1366   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
1367       Vertical4xH_NEON<4>;
1368   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
1369       Paeth4xH_NEON<4>;
1370 
1371   // 4x8
1372   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
1373       DcDefs::_4x8::DcTop;
1374   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
1375       DcDefs::_4x8::DcLeft;
1376   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
1377       DcDefs::_4x8::Dc;
1378   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
1379       Horizontal4xH_NEON<8>;
1380   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
1381       Vertical4xH_NEON<8>;
1382   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
1383       Paeth4xH_NEON<8>;
1384 
1385   // 4x16
1386   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
1387       DcDefs::_4x16::DcTop;
1388   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
1389       DcDefs::_4x16::DcLeft;
1390   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
1391       DcDefs::_4x16::Dc;
1392   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
1393       Horizontal4xH_NEON<16>;
1394   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
1395       Vertical4xH_NEON<16>;
1396   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
1397       Paeth4xH_NEON<16>;
1398 
1399   // 8x4
1400   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
1401       DcDefs::_8x4::DcTop;
1402   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
1403       DcDefs::_8x4::DcLeft;
1404   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
1405       DcDefs::_8x4::Dc;
1406   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
1407       Vertical8xH_NEON<4>;
1408   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
1409       Paeth8xH_NEON<4>;
1410 
1411   // 8x8
1412   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
1413       DcDefs::_8x8::DcTop;
1414   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
1415       DcDefs::_8x8::DcLeft;
1416   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
1417       DcDefs::_8x8::Dc;
1418   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
1419       Horizontal8xH_NEON<8>;
1420   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
1421       Vertical8xH_NEON<8>;
1422   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
1423       Paeth8xH_NEON<8>;
1424 
1425   // 8x16
1426   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
1427       DcDefs::_8x16::DcTop;
1428   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
1429       DcDefs::_8x16::DcLeft;
1430   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
1431       DcDefs::_8x16::Dc;
1432   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
1433       Vertical8xH_NEON<16>;
1434   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
1435       Paeth8xH_NEON<16>;
1436 
1437   // 8x32
1438   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
1439       DcDefs::_8x32::DcTop;
1440   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
1441       DcDefs::_8x32::DcLeft;
1442   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
1443       DcDefs::_8x32::Dc;
1444   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
1445       Horizontal8xH_NEON<32>;
1446   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
1447       Vertical8xH_NEON<32>;
1448   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
1449       Paeth8xH_NEON<32>;
1450 
1451   // 16x4
1452   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
1453       DcDefs::_16x4::DcTop;
1454   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
1455       DcDefs::_16x4::DcLeft;
1456   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
1457       DcDefs::_16x4::Dc;
1458   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
1459       Vertical16xH_NEON<4>;
1460   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
1461       PaethWxH_NEON<16, 4>;
1462 
1463   // 16x8
1464   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
1465       DcDefs::_16x8::DcTop;
1466   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
1467       DcDefs::_16x8::DcLeft;
1468   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
1469       DcDefs::_16x8::Dc;
1470   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
1471       Horizontal16xH_NEON<8>;
1472   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
1473       Vertical16xH_NEON<8>;
1474   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
1475       PaethWxH_NEON<16, 8>;
1476 
1477   // 16x16
1478   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
1479       DcDefs::_16x16::DcTop;
1480   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
1481       DcDefs::_16x16::DcLeft;
1482   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
1483       DcDefs::_16x16::Dc;
1484   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
1485       Vertical16xH_NEON<16>;
1486   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
1487       PaethWxH_NEON<16, 16>;
1488 
1489   // 16x32
1490   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
1491       DcDefs::_16x32::DcTop;
1492   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
1493       DcDefs::_16x32::DcLeft;
1494   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
1495       DcDefs::_16x32::Dc;
1496   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
1497       Vertical16xH_NEON<32>;
1498   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
1499       PaethWxH_NEON<16, 32>;
1500 
1501   // 16x64
1502   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
1503       DcDefs::_16x64::DcTop;
1504   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
1505       DcDefs::_16x64::DcLeft;
1506   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
1507       DcDefs::_16x64::Dc;
1508   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
1509       Vertical16xH_NEON<64>;
1510   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
1511       PaethWxH_NEON<16, 64>;
1512 
1513   // 32x8
1514   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
1515       DcDefs::_32x8::DcTop;
1516   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
1517       DcDefs::_32x8::DcLeft;
1518   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
1519       DcDefs::_32x8::Dc;
1520   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
1521       Vertical32xH_NEON<8>;
1522   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
1523       PaethWxH_NEON<32, 8>;
1524 
1525   // 32x16
1526   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
1527       DcDefs::_32x16::DcTop;
1528   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
1529       DcDefs::_32x16::DcLeft;
1530   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
1531       DcDefs::_32x16::Dc;
1532   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
1533       Vertical32xH_NEON<16>;
1534   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
1535       PaethWxH_NEON<32, 16>;
1536 
1537   // 32x32
1538   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
1539       DcDefs::_32x32::DcTop;
1540   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
1541       DcDefs::_32x32::DcLeft;
1542   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
1543       DcDefs::_32x32::Dc;
1544   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
1545       Vertical32xH_NEON<32>;
1546   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
1547       PaethWxH_NEON<32, 32>;
1548 
1549   // 32x64
1550   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
1551       DcDefs::_32x64::DcTop;
1552   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
1553       DcDefs::_32x64::DcLeft;
1554   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
1555       DcDefs::_32x64::Dc;
1556   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
1557       Horizontal32xH_NEON<64>;
1558   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
1559       Vertical32xH_NEON<64>;
1560   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
1561       PaethWxH_NEON<32, 64>;
1562 
1563   // 64x16
1564   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
1565       DcDefs::_64x16::DcTop;
1566   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
1567       DcDefs::_64x16::DcLeft;
1568   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
1569       DcDefs::_64x16::Dc;
1570   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
1571       Vertical64xH_NEON<16>;
1572   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
1573       PaethWxH_NEON<64, 16>;
1574 
1575   // 64x32
1576   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
1577       DcDefs::_64x32::DcTop;
1578   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
1579       DcDefs::_64x32::DcLeft;
1580   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
1581       DcDefs::_64x32::Dc;
1582   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
1583       Vertical64xH_NEON<32>;
1584   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
1585       PaethWxH_NEON<64, 32>;
1586 
1587   // 64x64
1588   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
1589       DcDefs::_64x64::DcTop;
1590   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
1591       DcDefs::_64x64::DcLeft;
1592   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
1593       DcDefs::_64x64::Dc;
1594   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
1595       Vertical64xH_NEON<64>;
1596   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
1597       PaethWxH_NEON<64, 64>;
1598 }
1599 
1600 }  // namespace
1601 }  // namespace high_bitdepth
1602 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
1603 
IntraPredInit_NEON()1604 void IntraPredInit_NEON() {
1605   low_bitdepth::Init8bpp();
1606 #if LIBGAV1_MAX_BITDEPTH >= 10
1607   high_bitdepth::Init10bpp();
1608 #endif
1609 }
1610 
1611 }  // namespace dsp
1612 }  // namespace libgav1
1613 
1614 #else   // !LIBGAV1_ENABLE_NEON
1615 namespace libgav1 {
1616 namespace dsp {
1617 
// Empty definition compiled in the !LIBGAV1_ENABLE_NEON branch, so that
// IntraPredInit_NEON() is still defined when the NEON code above is
// preprocessed out. Calling it does nothing.
void IntraPredInit_NEON() {}
1619 
1620 }  // namespace dsp
1621 }  // namespace libgav1
1622 #endif  // LIBGAV1_ENABLE_NEON
1623