/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/reinterpret_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/intrapred_common.h"

//------------------------------------------------------------------------------
// DC 4x4

static inline uint16x8_t dc_load_sum_4(const uint8_t *in) {
  const uint8x8_t a = load_u8_4x1(in);
  const uint16x4_t p0 = vpaddl_u8(a);
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  return vcombine_u16(p1, vdup_n_u16(0));
}

static inline void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t dc) {
  for (int i = 0; i < h; ++i) {
    store_u8_4x1(dst + i * stride, dc);
  }
}

void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}
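
// The four above and four left samples give eight values in total, so the
// rounded average is (sum + 4) >> 3, which vrshrn_n_u16(sum, 3) computes
// directly. The square DC predictors below follow the same pattern, with the
// shift equal to log2 of the number of reference samples summed.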

void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
  (void)above;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
  (void)left;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_4xh(dst, stride, 4, dc0);
}

//------------------------------------------------------------------------------
// DC 8x8

static inline uint16x8_t dc_load_sum_8(const uint8_t *in) {
  // This isn't used in the case where we want to load both above and left
  // vectors, since we want to avoid performing the reduction twice.
  const uint8x8_t a = vld1_u8(in);
  const uint16x4_t p0 = vpaddl_u8(a);
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x4_t p2 = vpadd_u16(p1, p1);
  return vcombine_u16(p2, vdup_n_u16(0));
}

static inline uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
#if AOM_ARCH_AARCH64
  // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
  // instruction, however the addv instruction is usually slightly more
  // expensive than a pairwise addition, so the need for immediately
  // broadcasting the result again seems to negate any benefit.
  const uint16x8_t b = vpaddq_u16(a, a);
  const uint16x8_t c = vpaddq_u16(b, b);
  return vpaddq_u16(c, c);
#else
  const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
  const uint16x4_t c = vpadd_u16(b, b);
  const uint16x4_t d = vpadd_u16(c, c);
  return vcombine_u16(d, d);
#endif
}

static inline void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1_u8(dst + i * stride, dc);
  }
}

void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t sum_top = vld1_u8(above);
  const uint8x8_t sum_left = vld1_u8(left);
  uint16x8_t sum = vaddl_u8(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_8(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
  (void)above;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_8(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
  (void)left;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_8xh(dst, stride, 8, dc0);
}

//------------------------------------------------------------------------------
// DC 16x16

static inline uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
  const uint8x16_t a = vld1q_u8(in);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpaddlq_u8(a);
}

static inline uint16x8_t dc_load_sum_16(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
}

static inline void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
  }
}

void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_16(above);
  const uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_16(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
  (void)above;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_16(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
  (void)left;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_16xh(dst, stride, 16, dc0);
}

//------------------------------------------------------------------------------
// DC 32x32

static inline uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpadalq_u8(vpaddlq_u8(a0), a1);
}

static inline uint16x8_t dc_load_sum_32(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
}

static inline void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
  }
}

void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_32(above);
  const uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_32(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
  (void)above;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_32(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
  (void)left;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 32, dc0);
}

//------------------------------------------------------------------------------
// DC 64x64

static inline uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  const uint8x16_t a2 = vld1q_u8(in + 32);
  const uint8x16_t a3 = vld1q_u8(in + 48);
  const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
  const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vaddq_u16(p01, p23);
}

static inline uint16x8_t dc_load_sum_64(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
}

static inline void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
    vst1q_u8(dst + i * stride + 32, dc);
    vst1q_u8(dst + i * stride + 48, dc);
  }
}

void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_64(above);
  const uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_64(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
  (void)above;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_64(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
  (void)left;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_64xh(dst, stride, 64, dc0);
}

//------------------------------------------------------------------------------
// DC rectangular cases

#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier, int shift2) {
  const int interm = num >> shift1;
  return interm * multiplier >> shift2;
}

static inline int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
                                        int shift1, int multiplier) {
  const int expected_dc = divide_using_multiply_shift(
      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
  assert(expected_dc < (1 << 8));
  return expected_dc;
}
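
// The rectangular DC predictors below divide by bw + bh, which is 3 or 5
// times a power of two, so a single shift is not enough. calculate_dc_from_sum
// adds (bw + bh) >> 1 for rounding, shifts out the power-of-two factor
// (shift1 = log2(min(bw, bh))), and then multiplies by a fixed-point
// reciprocal: DC_MULTIPLIER_1X2 = 0x5556 ~= 2^16 / 3 for 1:2 blocks and
// DC_MULTIPLIER_1X4 = 0x3334 ~= 2^16 / 5 for 1:4 blocks. For example, a 4x8
// block with sum = 1000 gives (1000 + 6) >> 2 = 251 and
// (251 * 0x5556) >> 16 = 83, matching the exact (1000 + 6) / 12 = 83.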

#undef DC_SHIFT2

void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = load_u8_4x1(above);
  uint8x8_t l = vld1_u8(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
}

void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = load_u8_4x1(above);
  uint8x16_t l = vld1q_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
}

void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x16_t l = vld1q_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
}

void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddw_u8(sum_left, a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
}

void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_top = dc_load_partial_sum_32(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(sum_top, l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
}

void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
}

void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
}

void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
}

#undef DC_MULTIPLIER_1X2
#undef DC_MULTIPLIER_1X4

#define DC_PREDICTOR_128(w, h, q)                                            \
  void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                             const uint8_t *above,           \
                                             const uint8_t *left) {          \
    (void)above;                                                             \
    (void)left;                                                              \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80));                \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_128(4, 16, )
DC_PREDICTOR_128(8, 32, )
DC_PREDICTOR_128(16, 4, q)
DC_PREDICTOR_128(16, 64, q)
DC_PREDICTOR_128(32, 8, q)
DC_PREDICTOR_128(64, 16, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_128(4, 8, )
DC_PREDICTOR_128(8, 4, )
DC_PREDICTOR_128(8, 16, )
DC_PREDICTOR_128(16, 8, q)
DC_PREDICTOR_128(16, 32, q)
DC_PREDICTOR_128(32, 16, q)
DC_PREDICTOR_128(32, 64, q)
DC_PREDICTOR_128(64, 32, q)

#undef DC_PREDICTOR_128

#define DC_PREDICTOR_LEFT(w, h, shift, q)                                     \
  void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                              const uint8_t *above,           \
                                              const uint8_t *left) {          \
    (void)above;                                                              \
    const uint16x8_t sum = dc_load_sum_##h(left);                             \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                         \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));            \
  }
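
// In the instantiations below the shift argument is log2(h): the left column
// contributes h samples, so vrshrn_n_u16(sum, log2(h)) forms their rounded
// average.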

DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(8, 4, 2, )
DC_PREDICTOR_LEFT(8, 16, 4, )
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_LEFT(4, 16, 4, )
DC_PREDICTOR_LEFT(16, 4, 2, q)
DC_PREDICTOR_LEFT(8, 32, 5, )
DC_PREDICTOR_LEFT(32, 8, 3, q)
DC_PREDICTOR_LEFT(16, 64, 6, q)
DC_PREDICTOR_LEFT(64, 16, 4, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_LEFT

#define DC_PREDICTOR_TOP(w, h, shift, q)                                     \
  void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                             const uint8_t *above,           \
                                             const uint8_t *left) {          \
    (void)left;                                                              \
    const uint16x8_t sum = dc_load_sum_##w(above);                           \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                        \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));           \
  }
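
// As above, but the shift argument is log2(w), since the top row contributes
// w samples.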

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(8, 32, 3, )
DC_PREDICTOR_TOP(4, 16, 2, )
DC_PREDICTOR_TOP(16, 4, 4, q)
DC_PREDICTOR_TOP(16, 64, 4, q)
DC_PREDICTOR_TOP(32, 8, 5, q)
DC_PREDICTOR_TOP(64, 16, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(8, 4, 3, )
DC_PREDICTOR_TOP(8, 16, 3, )
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 32, 6, q)

#undef DC_PREDICTOR_TOP

// -----------------------------------------------------------------------------

static inline void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                               uint8x8_t d0) {
  for (int i = 0; i < h; ++i) {
    store_u8_4x1(dst + i * stride, d0);
  }
}

static inline void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                               uint8x8_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1_u8(dst + i * stride, d0);
  }
}

static inline void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, d0);
  }
}

static inline void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    dst += stride;
  }
}

static inline void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
                                uint8x16_t d3) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    vst1q_u8(dst + 32, d2);
    vst1q_u8(dst + 48, d3);
    dst += stride;
  }
}

void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 4, load_u8_4x1(above));
}

void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 8, vld1_u8(above));
}

void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 16, vld1q_u8(above));
}

void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 32, d0, d1);
}

void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 8, load_u8_4x1(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 16, load_u8_4x1(above));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 4, vld1_u8(above));
}

void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 16, vld1_u8(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 32, vld1_u8(above));
}

void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 4, vld1q_u8(above));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 8, vld1q_u8(above));
}

void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 32, vld1q_u8(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 64, vld1q_u8(above));
}

void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 8, d0, d1);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 16, d0, d1);
}

void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 64, d0, d1);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
}

void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
}

// -----------------------------------------------------------------------------

static inline void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
  store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
  store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
  store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
  store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
}

static inline void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
  vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
  vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
  vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
  vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
}

static inline void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
}

static inline void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
}

static inline void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
}

void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
}

void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_8x8(dst, stride, d0);
}

void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_16x8(dst, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_4x8(dst, stride, d0);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
}

void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_16x8(dst, stride, d0);
}

void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  const uint8x16_t d2 = vld1q_u8(left + 32);
  const uint8x16_t d3 = vld1q_u8(left + 48);
  (void)above;
  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
  h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
  h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
  h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
  h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
}

void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_32x8(dst, stride, d0);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left + 0);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  const uint8x16_t d2 = vld1q_u8(left + 32);
  const uint8x16_t d3 = vld1q_u8(left + 48);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
  h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
  h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
  h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
  h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  for (int i = 0; i < 2; ++i) {
    const uint8x16_t d0 = vld1q_u8(left);
    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
    left += 16;
    dst += 16 * stride;
  }
}

void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  for (int i = 0; i < 4; ++i) {
    const uint8x16_t d0 = vld1q_u8(left);
    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
    left += 16;
    dst += 16 * stride;
  }
}

/* ---------------------P R E D I C T I O N   Z 1--------------------------- */

// Low bit depth functions
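// BaseMask[i] has its first i bytes set to 0xff and the remaining bytes set
// to zero, so it can serve as a byte-wise select mask for the directional
// (z1/z2/z3) predictors further down in this file.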
static DECLARE_ALIGNED(32, const uint8_t, BaseMask[33][32]) = {
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1154     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1155     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
1156   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1157     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1158     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
1159   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1160     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1161     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
1162   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1163     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1164     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
1165   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1166     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1167     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
1168   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1169     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1170     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
1171 };
1172 
dr_prediction_z1_HxW_internal_neon_64(int H,int W,uint8x8_t * dst,const uint8_t * above,int upsample_above,int dx)1173 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
1174     int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
1175     int dx) {
1176   const int frac_bits = 6 - upsample_above;
1177   const int max_base_x = ((W + H) - 1) << upsample_above;
1178 
1179   assert(dx > 0);
1180   // pre-filter above pixels
1181   // store in temp buffers:
1182   //   above[x] * 32 + 16
1183   //   above[x+1] - above[x]
1184   // final pixels will be calculated as:
1185   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
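  //
  // As a scalar sketch of one row in the no-upsample case (row[] here is just
  // an illustrative stand-in for the lanes of dst[r]):
  //   int shift = (x & 0x3f) >> 1;  // 5-bit fraction, 0..31
  //   for (int c = 0; c < 8; ++c) {
  //     int a0 = above[base + c], a1 = above[base + c + 1];
  //     row[c] = (a0 * 32 + 16 + (a1 - a0) * shift) >> 5;
  //   }
  // e.g. a0 = 100, a1 = 110, shift = 16 gives (3200 + 16 + 160) >> 5 = 105.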
1186 
1187   const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);
1188 
1189   int x = dx;
1190   for (int r = 0; r < W; r++) {
1191     int base = x >> frac_bits;
1192     int base_max_diff = (max_base_x - base) >> upsample_above;
1193     if (base_max_diff <= 0) {
1194       for (int i = r; i < W; ++i) {
1195         dst[i] = a_mbase_x;  // fill remaining rows with the last above pixel
1196       }
1197       return;
1198     }
1199 
1200     if (base_max_diff > H) base_max_diff = H;
1201 
1202     uint8x8x2_t a01_128;
1203     uint16x8_t shift;
1204     if (upsample_above) {
1205       a01_128 = vld2_u8(above + base);
1206       shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
1207     } else {
1208       a01_128.val[0] = vld1_u8(above + base);
1209       a01_128.val[1] = vld1_u8(above + base + 1);
1210       shift = vdupq_n_u16((x & 0x3f) >> 1);
1211     }
1212     uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
1213     uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
1214     uint16x8_t res = vmlaq_u16(a32, diff, shift);
1215 
1216     uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
1217     dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);
1218 
1219     x += dx;
1220   }
1221 }
1222 
dr_prediction_z1_4xN_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int upsample_above,int dx)1223 static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1224                                       const uint8_t *above, int upsample_above,
1225                                       int dx) {
1226   uint8x8_t dstvec[16];
1227 
1228   dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
1229                                         dx);
1230   for (int i = 0; i < N; i++) {
1231     vst1_lane_u32((uint32_t *)(dst + stride * i),
1232                   vreinterpret_u32_u8(dstvec[i]), 0);
1233   }
1234 }
1235 
dr_prediction_z1_8xN_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int upsample_above,int dx)1236 static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1237                                       const uint8_t *above, int upsample_above,
1238                                       int dx) {
1239   uint8x8_t dstvec[32];
1240 
1241   dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
1242                                         dx);
1243   for (int i = 0; i < N; i++) {
1244     vst1_u8(dst + stride * i, dstvec[i]);
1245   }
1246 }
1247 
dr_prediction_z1_HxW_internal_neon(int H,int W,uint8x16_t * dst,const uint8_t * above,int upsample_above,int dx)1248 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
1249     int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
1250     int dx) {
1251   const int frac_bits = 6 - upsample_above;
1252   const int max_base_x = ((W + H) - 1) << upsample_above;
1253 
1254   assert(dx > 0);
1255   // pre-filter above pixels
1256   // store in temp buffers:
1257   //   above[x] * 32 + 16
1258   //   above[x+1] - above[x]
1259   // final pixels will be calculated as:
1260   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1261 
1262   const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1263 
1264   int x = dx;
1265   for (int r = 0; r < W; r++) {
1266     int base = x >> frac_bits;
1267     int base_max_diff = (max_base_x - base) >> upsample_above;
1268     if (base_max_diff <= 0) {
1269       for (int i = r; i < W; ++i) {
1270         dst[i] = a_mbase_x;  // fill remaining rows with the last above pixel
1271       }
1272       return;
1273     }
1274 
1275     if (base_max_diff > H) base_max_diff = H;
1276 
1277     uint16x8_t shift;
1278     uint8x16_t a0_128, a1_128;
1279     if (upsample_above) {
1280       uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
1281       a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
1282       a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
1283       shift = vdupq_n_u16(x & 0x1f);
1284     } else {
1285       a0_128 = vld1q_u8(above + base);
1286       a1_128 = vld1q_u8(above + base + 1);
1287       shift = vdupq_n_u16((x & 0x3f) >> 1);
1288     }
1289     uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1290     uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1291     uint16x8_t a32_lo =
1292         vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1293     uint16x8_t a32_hi =
1294         vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1295     uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1296     uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1297     uint8x16_t v_temp =
1298         vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1299 
1300     uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
1301     dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);
1302 
1303     x += dx;
1304   }
1305 }
1306 
dr_prediction_z1_16xN_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int upsample_above,int dx)1307 static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1308                                        const uint8_t *above, int upsample_above,
1309                                        int dx) {
1310   uint8x16_t dstvec[64];
1311 
1312   dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
1313   for (int i = 0; i < N; i++) {
1314     vst1q_u8(dst + stride * i, dstvec[i]);
1315   }
1316 }
1317 
dr_prediction_z1_32xN_internal_neon(int N,uint8x16x2_t * dstvec,const uint8_t * above,int dx)1318 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
1319     int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
1320   const int frac_bits = 6;
1321   const int max_base_x = ((32 + N) - 1);
1322 
1323   // pre-filter above pixels
1324   // store in temp buffers:
1325   //   above[x] * 32 + 16
1326   //   above[x+1] - above[x]
1327   // final pixels will be calculated as:
1328   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1329 
1330   const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1331 
1332   int x = dx;
1333   for (int r = 0; r < N; r++) {
1334     int base = x >> frac_bits;
1335     int base_max_diff = (max_base_x - base);
1336     if (base_max_diff <= 0) {
1337       for (int i = r; i < N; ++i) {
1338         dstvec[i].val[0] = a_mbase_x;  // save 32 values
1339         dstvec[i].val[1] = a_mbase_x;
1340       }
1341       return;
1342     }
1343     if (base_max_diff > 32) base_max_diff = 32;
1344 
1345     uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1346 
1347     uint8x16_t res16[2];
1348     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
1349       int mdiff = base_max_diff - j;
1350       if (mdiff <= 0) {
1351         res16[jj] = a_mbase_x;
1352       } else {
1353         uint8x16_t a0_128 = vld1q_u8(above + base + j);
1354         uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
1355         uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1356         uint16x8_t diff_hi =
1357             vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1358         uint16x8_t a32_lo =
1359             vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1360         uint16x8_t a32_hi =
1361             vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1362         uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1363         uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1364 
1365         res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1366       }
1367     }
1368 
1369     uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
1370     uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
1371     dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
1372     dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
1373     x += dx;
1374   }
1375 }
1376 
dr_prediction_z1_32xN_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int dx)1377 static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1378                                        const uint8_t *above, int dx) {
1379   uint8x16x2_t dstvec[64];
1380 
1381   dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
1382   for (int i = 0; i < N; i++) {
1383     vst1q_u8(dst + stride * i, dstvec[i].val[0]);
1384     vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
1385   }
1386 }
1387 
1388 // clang-format off
1389 static const uint8_t kLoadMaxShuffles[] = {
1390   15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1391   14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1392   13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1393   12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1394   11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1395   10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1396    9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1397    8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1398    7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
1399    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
1400    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
1401    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
1402    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
1403    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15,
1404    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15,
1405    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
1406 };
1407 // clang-format on
1408 
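// Row n of kLoadMaxShuffles moves the n + 1 source bytes ending at src[15] to
// the start of the vector and replicates src[15] into the remaining lanes.
// z1_load_masked_neon uses it with ptr = above + max_base_x - 15 so that loads
// near the end of the above edge are clamped at above[max_base_x] instead of
// reading past the buffer, e.g. shuffle_idx = 1 yields
// { above[max_base_x - 1], above[max_base_x], above[max_base_x], ... }.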
z1_load_masked_neon(const uint8_t * ptr,int shuffle_idx)1409 static inline uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
1410                                              int shuffle_idx) {
1411   uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
1412   uint8x16_t src = vld1q_u8(ptr);
1413 #if AOM_ARCH_AARCH64
1414   return vqtbl1q_u8(src, shuffle);
1415 #else
1416   uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
1417   uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
1418   uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
1419   return vcombine_u8(lo, hi);
1420 #endif
1421 }
1422 
dr_prediction_z1_64xN_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int dx)1423 static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1424                                        const uint8_t *above, int dx) {
1425   const int frac_bits = 6;
1426   const int max_base_x = ((64 + N) - 1);
1427 
1428   // pre-filter above pixels
1429   // store in temp buffers:
1430   //   above[x] * 32 + 16
1431   //   above[x+1] - above[x]
1432   // final pixels will be calculated as:
1433   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1434 
1435   const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1436 
1437   int x = dx;
1438   for (int r = 0; r < N; r++, dst += stride) {
1439     int base = x >> frac_bits;
1440     if (base >= max_base_x) {
1441       for (int i = r; i < N; ++i) {
1442         vst1q_u8(dst, a_mbase_x);
1443         vst1q_u8(dst + 16, a_mbase_x);
1444         vst1q_u8(dst + 32, a_mbase_x);
1445         vst1q_u8(dst + 48, a_mbase_x);
1446         dst += stride;
1447       }
1448       return;
1449     }
1450 
1451     uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1452     uint8x16_t base_inc128 =
1453         vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
1454                                                vcreate_u8(0x0F0E0D0C0B0A0908)));
1455 
1456     for (int j = 0; j < 64; j += 16) {
1457       if (base + j >= max_base_x) {
1458         vst1q_u8(dst + j, a_mbase_x);
1459       } else {
1460         uint8x16_t a0_128;
1461         uint8x16_t a1_128;
1462         if (base + j + 15 >= max_base_x) {
1463           int shuffle_idx = max_base_x - base - j;
1464           a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1465         } else {
1466           a0_128 = vld1q_u8(above + base + j);
1467         }
1468         if (base + j + 16 >= max_base_x) {
1469           int shuffle_idx = max_base_x - base - j - 1;
1470           a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1471         } else {
1472           a1_128 = vld1q_u8(above + base + j + 1);
1473         }
1474 
1475         uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1476         uint16x8_t diff_hi =
1477             vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1478         uint16x8_t a32_lo =
1479             vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1480         uint16x8_t a32_hi =
1481             vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1482         uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1483         uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1484         vst1q_u8(dst + j,
1485                  vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
1486 
1487         base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
1488       }
1489     }
1490     x += dx;
1491   }
1492 }
1493 
1494 // Directional prediction, zone 1: 0 < angle < 90
av1_dr_prediction_z1_neon(uint8_t * dst,ptrdiff_t stride,int bw,int bh,const uint8_t * above,const uint8_t * left,int upsample_above,int dx,int dy)1495 void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1496                                const uint8_t *above, const uint8_t *left,
1497                                int upsample_above, int dx, int dy) {
1498   (void)left;
1499   (void)dy;
1500 
1501   switch (bw) {
1502     case 4:
1503       dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
1504       break;
1505     case 8:
1506       dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
1507       break;
1508     case 16:
1509       dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
1510       break;
1511     case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
1512     case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
1513     default: break;
1514   }
1515 }
1516 
1517 /* ---------------------P R E D I C T I O N   Z 2--------------------------- */
1518 
1519 // TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
1520 #if AOM_ARCH_AARCH64
1521 #if !AOM_ARCH_AARCH64
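// LoadMaskz2[i] enables the first 4 * (i + 1) bytes of a 16-byte load. It is
// only referenced on the armv7 (!AOM_ARCH_AARCH64) path of
// dr_prediction_z2_NxW_left_neon, hence the nested guard while the whole z2
// section is temporarily AArch64-only (see the TODO above).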
1522 static DECLARE_ALIGNED(16, const uint8_t, LoadMaskz2[4][16]) = {
1523   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1524   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1525   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1526     0, 0, 0 },
1527   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1528     0xff, 0xff, 0xff, 0xff }
1529 };
1530 #endif  // !AOM_ARCH_AARCH64
1531 
dr_prediction_z2_Nx4_above_neon(const uint8_t * above,int upsample_above,int dx,int base_x,int y,uint8x8_t * a0_x,uint8x8_t * a1_x,uint16x4_t * shift0)1532 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
1533     const uint8_t *above, int upsample_above, int dx, int base_x, int y,
1534     uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
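  // r6 holds the per-column positions {0, 1, 2, 3} << 6 (64ths of a pixel);
  // subtracting y * dx gives each column's x position, whose low bits form the
  // interpolation fraction (shift0).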
1535   uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
1536   uint16x4_t ydx = vdup_n_u16(y * dx);
1537   if (upsample_above) {
1538     // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
1539     // only load 16 or 32 bytes.
1540     uint8x8_t v_tmp = vld1_u8(above + base_x);
1541     *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
1542     *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
1543     *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
1544   } else {
1545     *a0_x = load_unaligned_u8_4x1(above + base_x);
1546     *a1_x = load_unaligned_u8_4x1(above + base_x + 1);
1547     *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
1548   }
1549 }
1550 
dr_prediction_z2_Nx4_left_neon(uint8x16x2_t left_vals,int upsample_left,int dy,int r,int min_base_y,int frac_bits_y,uint16x4_t * a0_y,uint16x4_t * a1_y,uint16x4_t * shift1)1551 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
1552 #if AOM_ARCH_AARCH64
1553     uint8x16x2_t left_vals,
1554 #else
1555     const uint8_t *left,
1556 #endif
1557     int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
1558     uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
1559   int16x4_t dy64 = vdup_n_s16(dy);
1560   int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
1561   int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
1562   int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
1563   int16x4_t v_r6 = vdup_n_s16(r << 6);
1564   int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
1565   int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
1566 
1567   // Values in base_y_c64 range from -2 through 14 inclusive.
1568   base_y_c64 = vmax_s16(base_y_c64, min_base_y64);
1569 
1570 #if AOM_ARCH_AARCH64
1571   uint8x8_t left_idx0 =
1572       vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2)));  // [0, 16]
1573   uint8x8_t left_idx1 =
1574       vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3)));  // [1, 17]
1575 
1576   *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
1577   *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
1578 #else   // !AOM_ARCH_AARCH64
1579   DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
1580 
1581   vst1_s16(base_y_c, base_y_c64);
1582   uint8x8_t a0_y_u8 = vdup_n_u8(0);
1583   a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
1584   a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
1585   a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
1586   a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);
1587 
1588   base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
1589   vst1_s16(base_y_c, base_y_c64);
1590   uint8x8_t a1_y_u8 = vdup_n_u8(0);
1591   a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
1592   a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
1593   a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
1594   a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);
1595 
1596   *a0_y = vreinterpret_u16_u8(a0_y_u8);
1597   *a1_y = vreinterpret_u16_u8(a1_y_u8);
1598 #endif  // AOM_ARCH_AARCH64
1599 
1600   if (upsample_left) {
1601     *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
1602   } else {
1603     *shift1 =
1604         vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
1605   }
1606 }
1607 
dr_prediction_z2_Nx8_above_neon(const uint8_t * above,int upsample_above,int dx,int base_x,int y)1608 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
1609     const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
1610   uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1611                                   vcreate_u16(0x0008000700060005));
1612   uint16x8_t ydx = vdupq_n_u16(y * dx);
1613   uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
1614 
1615   uint16x8_t shift0;
1616   uint8x8_t a0_x0;
1617   uint8x8_t a1_x0;
1618   if (upsample_above) {
1619     uint8x8x2_t v_tmp = vld2_u8(above + base_x);
1620     a0_x0 = v_tmp.val[0];
1621     a1_x0 = v_tmp.val[1];
1622     shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1623   } else {
1624     a0_x0 = vld1_u8(above + base_x);
1625     a1_x0 = vld1_u8(above + base_x + 1);
1626     shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1627   }
1628 
1629   uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0);  // a[x+1] - a[x]
1630   uint16x8_t a32 =
1631       vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32));  // a[x] * 32 + 16
1632   uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
1633   return vshrn_n_u16(res, 5);
1634 }
1635 
dr_prediction_z2_Nx8_left_neon(uint8x16x3_t left_vals,int upsample_left,int dy,int r,int min_base_y,int frac_bits_y)1636 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
1637 #if AOM_ARCH_AARCH64
1638     uint8x16x3_t left_vals,
1639 #else
1640     const uint8_t *left,
1641 #endif
1642     int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
1643   int16x8_t v_r6 = vdupq_n_s16(r << 6);
1644   int16x8_t dy128 = vdupq_n_s16(dy);
1645   int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1646   int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
1647 
1648   uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1649                                   vcreate_u16(0x0008000700060005));
1650   int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1651   int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1652 
1653   // Values in base_y_c128 range from -2 through 31 inclusive.
1654   base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);
1655 
1656 #if AOM_ARCH_AARCH64
1657   uint8x16_t left_idx0 =
1658       vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2)));  // [0, 33]
1659   uint8x16_t left_idx1 =
1660       vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3)));  // [1, 34]
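  // uzp1 keeps the even-numbered bytes, i.e. the low byte of each 16-bit
  // index, packing both index vectors into one so that a single TBL gathers
  // the a0 and a1 samples together.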
1661   uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1662 
1663   uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
1664   uint8x8_t a0_x1 = vget_low_u8(a01_x);
1665   uint8x8_t a1_x1 = vget_high_u8(a01_x);
1666 #else   // !AOM_ARCH_AARCH64
1667   uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
1668   uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
1669 #endif  // AOM_ARCH_AARCH64
1670 
1671   uint16x8_t shift1;
1672   if (upsample_left) {
1673     shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
1674   } else {
1675     shift1 = vshrq_n_u16(
1676         vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
1677   }
1678 
1679   uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
1680   uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
1681   uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
1682   return vshrn_n_u16(res, 5);
1683 }
1684 
dr_prediction_z2_NxW_above_neon(const uint8_t * above,int dx,int base_x,int y,int j)1685 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
1686     const uint8_t *above, int dx, int base_x, int y, int j) {
1687   uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1688                                         vcreate_u16(0x0007000600050004)),
1689                            vcombine_u16(vcreate_u16(0x000B000A00090008),
1690                                         vcreate_u16(0x000F000E000D000C)) } };
1691   uint16x8_t j256 = vdupq_n_u16(j);
1692   uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
1693 
1694   const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
1695   const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
1696   uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1697   uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1698   uint16x8_t shift0 =
1699       vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
1700   uint16x8_t shift1 =
1701       vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
1702   // a[x+1] - a[x]
1703   uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
1704   uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
1705   // a[x] * 32 + 16
1706   uint16x8_t a32_0 =
1707       vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
1708   uint16x8_t a32_1 =
1709       vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
1710   uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
1711   uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
1712   return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1713 }
1714 
dr_prediction_z2_NxW_left_neon(uint8x16x4_t left_vals0,uint8x16x4_t left_vals1,int dy,int r,int j)1715 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
1716 #if AOM_ARCH_AARCH64
1717     uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
1718 #else
1719     const uint8_t *left,
1720 #endif
1721     int dy, int r, int j) {
1722   // here upsample_above and upsample_left are 0 by design of
1723   // av1_use_intra_edge_upsample
1724   const int min_base_y = -1;
1725 
1726   int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
1727   int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
1728   int16x8_t dy256 = vdupq_n_s16(dy);
1729   uint16x8_t j256 = vdupq_n_u16(j);
1730 
1731   uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1732                                         vcreate_u16(0x0007000600050004)),
1733                            vcombine_u16(vcreate_u16(0x000B000A00090008),
1734                                         vcreate_u16(0x000F000E000D000C)) } };
1735   uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
1736                            vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };
1737 
1738   int16x8_t v_r6 = vdupq_n_s16(r << 6);
1739 
1740   int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
1741   int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
1742   int16x8_t mul16_lo = vreinterpretq_s16_u16(
1743       vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
1744                 vreinterpretq_u16_s16(half_min_base_y256)));
1745   int16x8_t mul16_hi = vreinterpretq_s16_u16(
1746       vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
1747                 vreinterpretq_u16_s16(half_min_base_y256)));
1748   int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
1749   int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);
1750 
1751   int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
1752   int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);
1753 
1754   base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
1755   base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);
1756 
1757 #if !AOM_ARCH_AARCH64
1758   int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
1759   int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
1760   int16_t offset_diff = max_y - min_y;
1761 
1762   uint8x8_t a0_y0;
1763   uint8x8_t a0_y1;
1764   uint8x8_t a1_y0;
1765   uint8x8_t a1_y1;
1766   if (offset_diff < 16) {
1767     // Avoid gathers where the data we want is close together in memory.
1768     // We don't need this for AArch64 since we can already use TBL to cover the
1769     // full range of possible values.
1770     assert(offset_diff >= 0);
1771     int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);
1772 
1773     int16x8x2_t base_y_offset;
1774     base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
1775     base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);
1776 
1777     int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1778                                              vqmovn_s16(base_y_offset.val[1]));
1779 
1780     uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1781     uint8x16_t a0_y128 = vld1q_u8(left + min_y);
1782     uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
1783     a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1784     a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
1785 
1786     uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1787     uint8x8_t v_index_high =
1788         vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1789     uint8x8x2_t v_tmp, v_res;
1790     v_tmp.val[0] = vget_low_u8(a0_y128);
1791     v_tmp.val[1] = vget_high_u8(a0_y128);
1792     v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1793     v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1794     a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1795     v_tmp.val[0] = vget_low_u8(a1_y128);
1796     v_tmp.val[1] = vget_high_u8(a1_y128);
1797     v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1798     v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1799     a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1800 
1801     a0_y0 = vget_low_u8(a0_y128);
1802     a0_y1 = vget_high_u8(a0_y128);
1803     a1_y0 = vget_low_u8(a1_y128);
1804     a1_y1 = vget_high_u8(a1_y128);
1805   } else {
1806     a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
1807     a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
1808     a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
1809     a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
1810   }
1811 #else
1812   // Values in left_idx{0,1} range from 0 through 63 inclusive.
1813   uint8x16_t left_idx0 =
1814       vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
1815   uint8x16_t left_idx1 =
1816       vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
1817   uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1818 
1819   uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
1820   uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
1821 
1822   uint8x8_t a0_y0 = vget_low_u8(a0_y01);
1823   uint8x8_t a0_y1 = vget_high_u8(a0_y01);
1824   uint8x8_t a1_y0 = vget_low_u8(a1_y01);
1825   uint8x8_t a1_y1 = vget_high_u8(a1_y01);
1826 #endif  // !AOM_ARCH_AARCH64
1827 
1828   uint16x8_t shifty_lo = vshrq_n_u16(
1829       vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
1830   uint16x8_t shifty_hi = vshrq_n_u16(
1831       vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);
1832 
1833   // a[x+1] - a[x]
1834   uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
1835   uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
1836   // a[x] * 32 + 16
1837   uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
1838   uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));
1839 
1840   uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
1841   uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);
1842 
1843   return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1844 }
1845 
dr_prediction_z2_Nx4_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)1846 static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
1847                                       const uint8_t *above, const uint8_t *left,
1848                                       int upsample_above, int upsample_left,
1849                                       int dx, int dy) {
1850   const int min_base_x = -(1 << upsample_above);
1851   const int min_base_y = -(1 << upsample_left);
1852   const int frac_bits_x = 6 - upsample_above;
1853   const int frac_bits_y = 6 - upsample_left;
1854 
1855   assert(dx > 0);
1856   // pre-filter above pixels
1857   // store in temp buffers:
1858   //   above[x] * 32 + 16
1859   //   above[x+1] - above[x]
1860   // final pixels will be calculated as:
1861   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1862 
1863 #if AOM_ARCH_AARCH64
1864   // Use ext rather than loading left + 14 directly to avoid over-read.
1865   const uint8x16_t left_m2 = vld1q_u8(left - 2);
1866   const uint8x16_t left_0 = vld1q_u8(left);
1867   const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
1868   const uint8x16x2_t left_vals = { { left_m2, left_14 } };
1869 #define LEFT left_vals
1870 #else  // !AOM_ARCH_AARCH64
1871 #define LEFT left
1872 #endif  // AOM_ARCH_AARCH64
1873 
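  // For each row r (y = r + 1), base_min_diff is the number of leading columns
  // whose reference sample falls left of min_base_x. Those columns are
  // predicted from the left edge (resy below), the remaining columns from the
  // above edge (resx), and BaseMask[base_min_diff] splices the two.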
1874   for (int r = 0; r < N; r++) {
1875     int y = r + 1;
1876     int base_x = (-y * dx) >> frac_bits_x;
1877     const int base_min_diff =
1878         (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
1879         upsample_above;
1880 
1881     if (base_min_diff <= 0) {
1882       uint8x8_t a0_x_u8, a1_x_u8;
1883       uint16x4_t shift0;
1884       dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1885                                       &a0_x_u8, &a1_x_u8, &shift0);
1886       uint8x8_t a0_x = a0_x_u8;
1887       uint8x8_t a1_x = a1_x_u8;
1888 
1889       uint16x8_t diff = vsubl_u8(a1_x, a0_x);  // a[x+1] - a[x]
1890       uint16x8_t a32 =
1891           vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32));  // a[x] * 32 + 16
1892       uint16x8_t res =
1893           vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
1894       uint8x8_t resx = vshrn_n_u16(res, 5);
1895       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
1896     } else if (base_min_diff < 4) {
1897       uint8x8_t a0_x_u8, a1_x_u8;
1898       uint16x4_t shift0;
1899       dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1900                                       &a0_x_u8, &a1_x_u8, &shift0);
1901       uint16x8_t a0_x = vmovl_u8(a0_x_u8);
1902       uint16x8_t a1_x = vmovl_u8(a1_x_u8);
1903 
1904       uint16x4_t a0_y;
1905       uint16x4_t a1_y;
1906       uint16x4_t shift1;
1907       dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1908                                      frac_bits_y, &a0_y, &a1_y, &shift1);
1909       a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
1910       a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);
1911 
1912       uint16x8_t shift = vcombine_u16(shift0, shift1);
1913       uint16x8_t diff = vsubq_u16(a1_x, a0_x);  // a[x+1] - a[x]
1914       uint16x8_t a32 =
1915           vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32);  // a[x] * 32 + 16
1916       uint16x8_t res = vmlaq_u16(a32, diff, shift);
1917       uint8x8_t resx = vshrn_n_u16(res, 5);
1918       uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);
1919 
1920       uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1921       uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
1922       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
1923     } else {
1924       uint16x4_t a0_y, a1_y;
1925       uint16x4_t shift1;
1926       dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1927                                      frac_bits_y, &a0_y, &a1_y, &shift1);
1928       uint16x4_t diff = vsub_u16(a1_y, a0_y);                 // a[x+1] - a[x]
1929       uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32);  // a[x] * 32 + 16
1930       uint16x4_t res = vmla_u16(a32, diff, shift1);
1931       uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);
1932 
1933       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
1934     }
1935 
1936     dst += stride;
1937   }
1938 #undef LEFT
1939 }
1940 
dr_prediction_z2_Nx8_neon(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)1941 static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
1942                                       const uint8_t *above, const uint8_t *left,
1943                                       int upsample_above, int upsample_left,
1944                                       int dx, int dy) {
1945   const int min_base_x = -(1 << upsample_above);
1946   const int min_base_y = -(1 << upsample_left);
1947   const int frac_bits_x = 6 - upsample_above;
1948   const int frac_bits_y = 6 - upsample_left;
1949 
1950   // pre-filter above pixels
1951   // store in temp buffers:
1952   //   above[x] * 32 + 16
1953   //   above[x+1] - above[x]
1954   // final pixels will be calculated as:
1955   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1956 
1957 #if AOM_ARCH_AARCH64
1958   // Use ext rather than loading left + 30 directly to avoid over-read.
1959   const uint8x16_t left_m2 = vld1q_u8(left - 2);
1960   const uint8x16_t left_0 = vld1q_u8(left + 0);
1961   const uint8x16_t left_16 = vld1q_u8(left + 16);
1962   const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
1963   const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
1964   const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
1965 #define LEFT left_vals
1966 #else  // !AOM_ARCH_AARCH64
1967 #define LEFT left
1968 #endif  // AOM_ARCH_AARCH64
1969 
1970   for (int r = 0; r < N; r++) {
1971     int y = r + 1;
1972     int base_x = (-y * dx) >> frac_bits_x;
1973     int base_min_diff =
1974         (min_base_x - base_x + upsample_above) >> upsample_above;
1975 
1976     if (base_min_diff <= 0) {
1977       uint8x8_t resx =
1978           dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1979       vst1_u8(dst, resx);
1980     } else if (base_min_diff < 8) {
1981       uint8x8_t resx =
1982           dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1983       uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1984           LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1985       uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1986       uint8x8_t resxy = vbsl_u8(mask, resy, resx);
1987       vst1_u8(dst, resxy);
1988     } else {
1989       uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1990           LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1991       vst1_u8(dst, resy);
1992     }
1993 
1994     dst += stride;
1995   }
1996 #undef LEFT
1997 }
1998 
dr_prediction_z2_HxW_neon(int H,int W,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int dx,int dy)1999 static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
2000                                       ptrdiff_t stride, const uint8_t *above,
2001                                       const uint8_t *left, int dx, int dy) {
2002   // here upsample_above and upsample_left are 0 by design of
2003   // av1_use_intra_edge_upsample
2004   const int min_base_x = -1;
2005 
2006 #if AOM_ARCH_AARCH64
2007   const uint8x16_t left_m1 = vld1q_u8(left - 1);
2008   const uint8x16_t left_0 = vld1q_u8(left + 0);
2009   const uint8x16_t left_16 = vld1q_u8(left + 16);
2010   const uint8x16_t left_32 = vld1q_u8(left + 32);
2011   const uint8x16_t left_48 = vld1q_u8(left + 48);
2012   const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
2013   const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
2014   const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
2015   const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
2016   const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
2017 #define LEFT left_vals0, left_vals1
2018 #else  // !AOM_ARCH_AARCH64
2019 #define LEFT left
2020 #endif  // AOM_ARCH_AARCH64
2021 
2022   for (int r = 0; r < H; r++) {
2023     int y = r + 1;
2024     int base_x = (-y * dx) >> 6;
2025     for (int j = 0; j < W; j += 16) {
2026       const int base_min_diff = min_base_x - base_x - j;
2027 
2028       if (base_min_diff <= 0) {
2029         uint8x16_t resx =
2030             dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2031         vst1q_u8(dst + j, resx);
2032       } else if (base_min_diff < 16) {
2033         uint8x16_t resx =
2034             dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2035         uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2036         uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
2037         uint8x16_t resxy = vbslq_u8(mask, resy, resx);
2038         vst1q_u8(dst + j, resxy);
2039       } else {
2040         uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2041         vst1q_u8(dst + j, resy);
2042       }
2043     }  // for j
2044     dst += stride;
2045   }
2046 #undef LEFT
2047 }
2048 
2049 // Directional prediction, zone 2: 90 < angle < 180
av1_dr_prediction_z2_neon(uint8_t * dst,ptrdiff_t stride,int bw,int bh,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)2050 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2051                                const uint8_t *above, const uint8_t *left,
2052                                int upsample_above, int upsample_left, int dx,
2053                                int dy) {
2054   assert(dx > 0);
2055   assert(dy > 0);
2056 
2057   switch (bw) {
2058     case 4:
2059       dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
2060                                 upsample_left, dx, dy);
2061       break;
2062     case 8:
2063       dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
2064                                 upsample_left, dx, dy);
2065       break;
2066     default:
2067       dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
2068       break;
2069   }
2070 }
2071 #endif  // AOM_ARCH_AARCH64
2072 
2073 /* ---------------------P R E D I C T I O N   Z 3--------------------------- */
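// Zone 3 (180 < angle < 270) predicts purely from the left edge, so these
// kernels reuse the zone 1 helpers with left in place of above and then
// transpose the result into the destination block.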
2074 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
z3_transpose_arrays_u8_16x4(const uint8x16_t * x,uint8x16x2_t * d)2075 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
2076                                                          uint8x16x2_t *d) {
2077   uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
2078   uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
2079 
2080   d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2081                                               vreinterpretq_u16_u8(w1.val[0])));
2082   d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2083                                               vreinterpretq_u16_u8(w1.val[1])));
2084 }
2085 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2086 
z3_transpose_arrays_u8_4x4(const uint8x8_t * x,uint8x8x2_t * d)2087 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
2088                                                         uint8x8x2_t *d) {
2089   uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2090   uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2091 
2092   *d = aom_reinterpret_u8_u16_x2(
2093       vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2094 }
2095 
z3_transpose_arrays_u8_8x4(const uint8x8_t * x,uint8x8x2_t * d)2096 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
2097                                                         uint8x8x2_t *d) {
2098   uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2099   uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2100 
2101   d[0] = aom_reinterpret_u8_u16_x2(
2102       vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2103   d[1] = aom_reinterpret_u8_u16_x2(
2104       vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
2105 }
2106 
z3_transpose_arrays_u8_16x16(const uint8_t * src,ptrdiff_t pitchSrc,uint8_t * dst,ptrdiff_t pitchDst)2107 static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
2108                                          uint8_t *dst, ptrdiff_t pitchDst) {
2109   // The same as the normal transposes in transpose_neon.h, but with a stride
2110   // between consecutive vectors of elements.
2111   uint8x16_t r[16];
2112   uint8x16_t d[16];
2113   for (int i = 0; i < 16; i++) {
2114     r[i] = vld1q_u8(src + i * pitchSrc);
2115   }
2116   transpose_arrays_u8_16x16(r, d);
2117   for (int i = 0; i < 16; i++) {
2118     vst1q_u8(dst + i * pitchDst, d[i]);
2119   }
2120 }
2121 
z3_transpose_arrays_u8_16nx16n(const uint8_t * src,ptrdiff_t pitchSrc,uint8_t * dst,ptrdiff_t pitchDst,int width,int height)2122 static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
2123                                            ptrdiff_t pitchSrc, uint8_t *dst,
2124                                            ptrdiff_t pitchDst, int width,
2125                                            int height) {
2126   for (int j = 0; j < height; j += 16) {
2127     for (int i = 0; i < width; i += 16) {
2128       z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
2129                                    dst + j * pitchDst + i, pitchDst);
2130     }
2131   }
2132 }
2133 
dr_prediction_z3_4x4_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2134 static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2135                                       const uint8_t *left, int upsample_left,
2136                                       int dy) {
2137   uint8x8_t dstvec[4];
2138   uint8x8x2_t dest;
2139 
2140   dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2141   z3_transpose_arrays_u8_4x4(dstvec, &dest);
2142   store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
2143   store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
2144 }
2145 
dr_prediction_z3_8x8_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2146 static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2147                                       const uint8_t *left, int upsample_left,
2148                                       int dy) {
2149   uint8x8_t dstvec[8];
2150   uint8x8_t d[8];
2151 
2152   dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2153   transpose_arrays_u8_8x8(dstvec, d);
2154   store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
2155 }
2156 
dr_prediction_z3_4x8_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2157 static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2158                                       const uint8_t *left, int upsample_left,
2159                                       int dy) {
2160   uint8x8_t dstvec[4];
2161   uint8x8x2_t d[2];
2162 
2163   dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2164   z3_transpose_arrays_u8_8x4(dstvec, d);
2165   store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
2166   store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
2167   store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
2168   store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
2169 }
2170 
dr_prediction_z3_8x4_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2171 static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2172                                       const uint8_t *left, int upsample_left,
2173                                       int dy) {
2174   uint8x8_t dstvec[8];
2175   uint8x8_t d[8];
2176 
2177   dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2178   transpose_arrays_u8_8x8(dstvec, d);
2179   store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
2180 }
2181 
dr_prediction_z3_8x16_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2182 static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2183                                        const uint8_t *left, int upsample_left,
2184                                        int dy) {
2185   uint8x16_t dstvec[8];
2186   uint8x8_t d[16];
2187 
2188   dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2189   transpose_arrays_u8_16x8(dstvec, d);
2190   for (int i = 0; i < 16; i++) {
2191     vst1_u8(dst + i * stride, d[i]);
2192   }
2193 }
2194 
dr_prediction_z3_16x8_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2195 static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2196                                        const uint8_t *left, int upsample_left,
2197                                        int dy) {
2198   uint8x8_t dstvec[16];
2199   uint8x16_t d[8];
2200 
2201   dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2202   transpose_arrays_u8_8x16(dstvec, d);
2203   for (int i = 0; i < 8; i++) {
2204     vst1q_u8(dst + i * stride, d[i]);
2205   }
2206 }
2207 
2208 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
dr_prediction_z3_4x16_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2209 static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2210                                        const uint8_t *left, int upsample_left,
2211                                        int dy) {
2212   uint8x16_t dstvec[4];
2213   uint8x16x2_t d[2];
2214 
2215   dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2216   z3_transpose_arrays_u8_16x4(dstvec, d);
2217   store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
2218   store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
2219   store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
2220   store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
2221 }
2222 
dr_prediction_z3_16x4_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2223 static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2224                                        const uint8_t *left, int upsample_left,
2225                                        int dy) {
2226   uint8x8_t dstvec[16];
2227   uint8x16_t d[8];
2228 
2229   dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2230   transpose_arrays_u8_8x16(dstvec, d);
2231   for (int i = 0; i < 4; i++) {
2232     vst1q_u8(dst + i * stride, d[i]);
2233   }
2234 }
2235 
dr_prediction_z3_8x32_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2236 static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2237                                        const uint8_t *left, int upsample_left,
2238                                        int dy) {
2239   (void)upsample_left;
2240   uint8x16x2_t dstvec[16];
2241   uint8x16_t d[32];
2242   uint8x16_t v_zero = vdupq_n_u8(0);
2243 
2244   dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
2245   for (int i = 8; i < 16; i++) {
2246     dstvec[i].val[0] = v_zero;
2247     dstvec[i].val[1] = v_zero;
2248   }
2249   transpose_arrays_u8_32x16(dstvec, d);
2250   for (int i = 0; i < 32; i++) {
2251     vst1_u8(dst + i * stride, vget_low_u8(d[i]));
2252   }
2253 }
2254 
dr_prediction_z3_32x8_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2255 static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2256                                        const uint8_t *left, int upsample_left,
2257                                        int dy) {
2258   uint8x8_t dstvec[32];
2259   uint8x16_t d[16];
2260 
2261   dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2262   transpose_arrays_u8_8x16(dstvec, d);
2263   transpose_arrays_u8_8x16(dstvec + 16, d + 8);
2264   for (int i = 0; i < 8; i++) {
2265     vst1q_u8(dst + i * stride, d[i]);
2266     vst1q_u8(dst + i * stride + 16, d[i + 8]);
2267   }
2268 }
2269 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2270 
dr_prediction_z3_16x16_neon(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)2271 static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2272                                         const uint8_t *left, int upsample_left,
2273                                         int dy) {
2274   uint8x16_t dstvec[16];
2275   uint8x16_t d[16];
2276 
2277   dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2278   transpose_arrays_u8_16x16(dstvec, d);
2279   for (int i = 0; i < 16; i++) {
2280     vst1q_u8(dst + i * stride, d[i]);
2281   }
2282 }
2283 
2284 static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2285                                         const uint8_t *left, int upsample_left,
2286                                         int dy) {
2287   (void)upsample_left;
2288   uint8x16x2_t dstvec[32];
2289   uint8x16_t d[64];
2290 
2291   dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
2292   transpose_arrays_u8_32x16(dstvec, d);
2293   transpose_arrays_u8_32x16(dstvec + 16, d + 32);
2294   for (int i = 0; i < 32; i++) {
2295     vst1q_u8(dst + i * stride, d[i]);
2296     vst1q_u8(dst + i * stride + 16, d[i + 32]);
2297   }
2298 }
2299 
2300 static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2301                                         const uint8_t *left, int upsample_left,
2302                                         int dy) {
2303   (void)upsample_left;
2304   DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2305 
2306   dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
2307   z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
2308 }
2309 
2310 static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2311                                         const uint8_t *left, int upsample_left,
2312                                         int dy) {
2313   (void)upsample_left;
2314   uint8x16x2_t dstvec[16];
2315   uint8x16_t d[32];
2316 
2317   dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
2318   transpose_arrays_u8_32x16(dstvec, d);
2319   for (int i = 0; i < 16; i++) {
2320     vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
2321     vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
2322   }
2323 }
2324 
2325 static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2326                                         const uint8_t *left, int upsample_left,
2327                                         int dy) {
2328   uint8x16_t dstvec[32];
2329 
2330   dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2331   for (int i = 0; i < 32; i += 16) {
2332     uint8x16_t d[16];
2333     transpose_arrays_u8_16x16(dstvec + i, d);
2334     for (int j = 0; j < 16; j++) {
2335       vst1q_u8(dst + j * stride + i, d[j]);
2336     }
2337   }
2338 }
2339 
2340 static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2341                                         const uint8_t *left, int upsample_left,
2342                                         int dy) {
2343   (void)upsample_left;
2344   uint8_t dstT[64 * 32];
2345 
2346   dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
2347   z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
2348 }
2349 
2350 static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2351                                         const uint8_t *left, int upsample_left,
2352                                         int dy) {
2353   (void)upsample_left;
2354   uint8_t dstT[32 * 64];
2355 
2356   dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
2357   z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
2358 }
2359 
2360 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2361 static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2362                                         const uint8_t *left, int upsample_left,
2363                                         int dy) {
2364   (void)upsample_left;
2365   uint8_t dstT[64 * 16];
2366 
2367   dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
2368   z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
2369 }
2370 
2371 static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2372                                         const uint8_t *left, int upsample_left,
2373                                         int dy) {
2374   uint8x16_t dstvec[64];
2375 
2376   dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2377   for (int i = 0; i < 64; i += 16) {
2378     uint8x16_t d[16];
2379     transpose_arrays_u8_16x16(dstvec + i, d);
2380     for (int j = 0; j < 16; ++j) {
2381       vst1q_u8(dst + j * stride + i, d[j]);
2382     }
2383   }
2384 }
2385 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2386 
2387 typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
2388                                     const uint8_t *left, int upsample_left,
2389                                     int dy);
2390 
2391 #if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
2392 static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
2393   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2394   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2395   { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, NULL,
2396     NULL, NULL },
2397   { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
2398     dr_prediction_z3_8x16_neon, NULL, NULL },
2399   { NULL, NULL, NULL, dr_prediction_z3_16x8_neon, dr_prediction_z3_16x16_neon,
2400     dr_prediction_z3_16x32_neon, NULL },
2401   { NULL, NULL, NULL, NULL, dr_prediction_z3_32x16_neon,
2402     dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
2403   { NULL, NULL, NULL, NULL, NULL, dr_prediction_z3_64x32_neon,
2404     dr_prediction_z3_64x64_neon },
2405 };
2406 #else
2407 static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
2408   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2409   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2410   { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
2411     dr_prediction_z3_4x16_neon, NULL, NULL },
2412   { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
2413     dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
2414   { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
2415     dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
2416     dr_prediction_z3_16x64_neon },
2417   { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
2418     dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
2419   { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
2420     dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
2421 };
2422 #endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
2423 
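// The table is indexed by log2 of each block dimension: for example a 16x32
// block gives get_msb(16) == 4 and get_msb(32) == 5 and so selects
// dr_prediction_z3_16x32_neon. Shapes with no NEON kernel are NULL and are
// caught by the assert below.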
2424 void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2425                                const uint8_t *above, const uint8_t *left,
2426                                int upsample_left, int dx, int dy) {
2427   (void)above;
2428   (void)dx;
2429   assert(dx == 1);
2430   assert(dy > 0);
2431 
2432   dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
2433   assert(f != NULL);
2434   f(dst, stride, left, upsample_left, dy);
2435 }
2436 
2437 // -----------------------------------------------------------------------------
2438 // SMOOTH_PRED
2439 
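// In scalar form (cf. the C reference smooth predictor), the kernels below
// compute, with wy[]/wx[] taken from smooth_weights:
//   pred(x, y) = (wy[y] * top[x] + (256 - wy[y]) * bottom_left +
//                 wx[x] * left[y] + (256 - wx[x]) * top_right + 256) >> 9
// The (256 - w) factors are formed with negate_s8()/negate_s8q(): the
// two's-complement negation of an 8-bit weight equals 256 - w modulo 256,
// which is exact here because the smooth weights are never zero.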
2440 // 256 - v = vneg_s8(v)
2441 static inline uint8x8_t negate_s8(const uint8x8_t v) {
2442   return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
2443 }
2444 
2445 static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
2446                             const uint8_t *const top_row,
2447                             const uint8_t *const left_column,
2448                             const int height) {
2449   const uint8_t top_right = top_row[3];
2450   const uint8_t bottom_left = left_column[height - 1];
2451   const uint8_t *const weights_y = smooth_weights + height - 4;
2452 
2453   uint8x8_t top_v = load_u8_4x1(top_row);
2454   const uint8x8_t top_right_v = vdup_n_u8(top_right);
2455   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2456   uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
2457   const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2458   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2459 
2460   assert(height > 0);
2461   int y = 0;
2462   do {
2463     const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2464     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2465     const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2466     const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
2467     const uint16x8_t weighted_top_bl =
2468         vmlal_u8(weighted_bl, weights_y_v, top_v);
2469     const uint16x8_t weighted_left_tr =
2470         vmlal_u8(weighted_tr, weights_x_v, left_v);
2471     // Maximum value of each parameter: 0xFF00
2472     const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2473     const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2474 
2475     vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
2476     dst += stride;
2477   } while (++y != height);
2478 }
2479 
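// Each of the two weighted sums is bounded by 255 * 256 = 0xFF00, so adding
// them directly could overflow 16 bits. vhaddq_u16 adds and halves in one
// step, and the rounding narrow by SMOOTH_WEIGHT_LOG2_SCALE then matches the
// scalar (sum + 256) >> 9 exactly.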
2480 static inline uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
2481                                        const uint16x8_t weighted_left_tr) {
2482   // Maximum value of each parameter: 0xFF00
2483   const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2484   return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2485 }
2486 
2487 static inline uint8x8_t calculate_weights_and_pred(
2488     const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
2489     const uint8x8_t bottom_left, const uint8x8_t weights_x,
2490     const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
2491   const uint16x8_t weighted_top = vmull_u8(weights_y, top);
2492   const uint16x8_t weighted_top_bl =
2493       vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
2494   const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
2495   return calculate_pred(weighted_top_bl, weighted_left_tr);
2496 }
2497 
2498 static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
2499                             const uint8_t *const top_row,
2500                             const uint8_t *const left_column,
2501                             const int height) {
2502   const uint8_t top_right = top_row[7];
2503   const uint8_t bottom_left = left_column[height - 1];
2504   const uint8_t *const weights_y = smooth_weights + height - 4;
2505 
2506   const uint8x8_t top_v = vld1_u8(top_row);
2507   const uint8x8_t top_right_v = vdup_n_u8(top_right);
2508   const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2509   const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
2510   const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2511   const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2512 
2513   assert(height > 0);
2514   int y = 0;
2515   do {
2516     const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2517     const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2518     const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2519     const uint8x8_t result =
2520         calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
2521                                    weights_x_v, scaled_weights_y, weights_y_v);
2522 
2523     vst1_u8(dst, result);
2524     dst += stride;
2525   } while (++y != height);
2526 }
2527 
2528 #define SMOOTH_NXM(W, H)                                                       \
2529   void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2530                                              const uint8_t *above,             \
2531                                              const uint8_t *left) {            \
2532     smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
2533   }
2534 
2535 SMOOTH_NXM(4, 4)
2536 SMOOTH_NXM(4, 8)
2537 SMOOTH_NXM(8, 4)
2538 SMOOTH_NXM(8, 8)
2539 SMOOTH_NXM(8, 16)
2540 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2541 SMOOTH_NXM(4, 16)
2542 SMOOTH_NXM(8, 32)
2543 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2544 
2545 #undef SMOOTH_NXM
2546 
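// 16-lane variant of calculate_weights_and_pred(): vmull_u8/vmlal_u8 widen
// only 8 lanes at a time, so the 16 pixels are handled as low and high halves
// and recombined with vcombine_u8.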
2547 static inline uint8x16_t calculate_weights_and_predq(
2548     const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
2549     const uint8x8_t weights_y, const uint8x16_t weights_x,
2550     const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
2551   const uint16x8_t weighted_top_bl_low =
2552       vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2553   const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2554   const uint16x8_t weighted_left_tr_low =
2555       vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2556   const uint8x8_t result_low =
2557       calculate_pred(weighted_top_bl_low, weighted_left_tr_low);
2558 
2559   const uint16x8_t weighted_top_bl_high =
2560       vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2561   const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2562   const uint16x8_t weighted_left_tr_high =
2563       vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2564   const uint8x8_t result_high =
2565       calculate_pred(weighted_top_bl_high, weighted_left_tr_high);
2566 
2567   return vcombine_u8(result_low, result_high);
2568 }
2569 
2570 // 256 - v = vneg_s8(v)
2571 static inline uint8x16_t negate_s8q(const uint8x16_t v) {
2572   return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
2573 }
2574 
2575 // For width 16 and above.
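// W is a literal at every expansion, so the (W) > 16 and (W) == 64 branches
// below are constant and can be folded away; each instantiation keeps only
// the loads and stores it actually needs.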
2576 #define SMOOTH_PREDICTOR(W)                                                 \
2577   static void smooth_##W##xh_neon(                                          \
2578       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
2579       const uint8_t *const left_column, const int height) {                 \
2580     const uint8_t top_right = top_row[(W)-1];                               \
2581     const uint8_t bottom_left = left_column[height - 1];                    \
2582     const uint8_t *const weights_y = smooth_weights + height - 4;           \
2583                                                                             \
2584     uint8x16_t top_v[4];                                                    \
2585     top_v[0] = vld1q_u8(top_row);                                           \
2586     if ((W) > 16) {                                                         \
2587       top_v[1] = vld1q_u8(top_row + 16);                                    \
2588       if ((W) == 64) {                                                      \
2589         top_v[2] = vld1q_u8(top_row + 32);                                  \
2590         top_v[3] = vld1q_u8(top_row + 48);                                  \
2591       }                                                                     \
2592     }                                                                       \
2593                                                                             \
2594     const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
2595     const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);                 \
2596                                                                             \
2597     uint8x16_t weights_x_v[4];                                              \
2598     weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4);                      \
2599     if ((W) > 16) {                                                         \
2600       weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);             \
2601       if ((W) == 64) {                                                      \
2602         weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);           \
2603         weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);           \
2604       }                                                                     \
2605     }                                                                       \
2606                                                                             \
2607     uint8x16_t scaled_weights_x[4];                                         \
2608     scaled_weights_x[0] = negate_s8q(weights_x_v[0]);                       \
2609     if ((W) > 16) {                                                         \
2610       scaled_weights_x[1] = negate_s8q(weights_x_v[1]);                     \
2611       if ((W) == 64) {                                                      \
2612         scaled_weights_x[2] = negate_s8q(weights_x_v[2]);                   \
2613         scaled_weights_x[3] = negate_s8q(weights_x_v[3]);                   \
2614       }                                                                     \
2615     }                                                                       \
2616                                                                             \
2617     for (int y = 0; y < height; ++y) {                                      \
2618       const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
2619       const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);                \
2620       const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);            \
2621       const uint16x8_t weighted_bl =                                        \
2622           vmull_u8(scaled_weights_y, bottom_left_v);                        \
2623                                                                             \
2624       vst1q_u8(dst, calculate_weights_and_predq(                            \
2625                         top_v[0], left_v, top_right_v, weights_y_v,         \
2626                         weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
2627                                                                             \
2628       if ((W) > 16) {                                                       \
2629         vst1q_u8(dst + 16,                                                  \
2630                  calculate_weights_and_predq(                               \
2631                      top_v[1], left_v, top_right_v, weights_y_v,            \
2632                      weights_x_v[1], scaled_weights_x[1], weighted_bl));    \
2633         if ((W) == 64) {                                                    \
2634           vst1q_u8(dst + 32,                                                \
2635                    calculate_weights_and_predq(                             \
2636                        top_v[2], left_v, top_right_v, weights_y_v,          \
2637                        weights_x_v[2], scaled_weights_x[2], weighted_bl));  \
2638           vst1q_u8(dst + 48,                                                \
2639                    calculate_weights_and_predq(                             \
2640                        top_v[3], left_v, top_right_v, weights_y_v,          \
2641                        weights_x_v[3], scaled_weights_x[3], weighted_bl));  \
2642         }                                                                   \
2643       }                                                                     \
2644                                                                             \
2645       dst += stride;                                                        \
2646     }                                                                       \
2647   }
2648 
2649 SMOOTH_PREDICTOR(16)
2650 SMOOTH_PREDICTOR(32)
2651 SMOOTH_PREDICTOR(64)
2652 
2653 #undef SMOOTH_PREDICTOR
2654 
2655 #define SMOOTH_NXM_WIDE(W, H)                                                  \
2656   void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2657                                              const uint8_t *above,             \
2658                                              const uint8_t *left) {            \
2659     smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
2660   }
2661 
2662 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2663 SMOOTH_NXM_WIDE(16, 4)
2664 SMOOTH_NXM_WIDE(16, 64)
2665 SMOOTH_NXM_WIDE(32, 8)
2666 SMOOTH_NXM_WIDE(64, 16)
2667 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2668 SMOOTH_NXM_WIDE(16, 8)
2669 SMOOTH_NXM_WIDE(16, 16)
2670 SMOOTH_NXM_WIDE(16, 32)
2671 SMOOTH_NXM_WIDE(32, 16)
2672 SMOOTH_NXM_WIDE(32, 32)
2673 SMOOTH_NXM_WIDE(32, 64)
2674 SMOOTH_NXM_WIDE(64, 32)
2675 SMOOTH_NXM_WIDE(64, 64)
2676 
2677 #undef SMOOTH_NXM_WIDE
2678 
2679 // -----------------------------------------------------------------------------
2680 // SMOOTH_V_PRED
2681 
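// Scalar form of the vertical-only smooth predictor (cf. the C reference):
//   pred(x, y) = (wy[y] * top[x] + (256 - wy[y]) * bottom_left + 128) >> 8
// There is no horizontal blend, so the 16-bit accumulator cannot overflow and
// a single rounding narrow by SMOOTH_WEIGHT_LOG2_SCALE completes the divide.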
2682 // For widths 4 and 8.
2683 #define SMOOTH_V_PREDICTOR(W)                                         \
2684   static void smooth_v_##W##xh_neon(                                  \
2685       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,   \
2686       const uint8_t *const left_column, const int height) {           \
2687     const uint8_t bottom_left = left_column[height - 1];              \
2688     const uint8_t *const weights_y = smooth_weights + height - 4;     \
2689                                                                       \
2690     uint8x8_t top_v;                                                  \
2691     if ((W) == 4) {                                                   \
2692       top_v = load_u8_4x1(top_row);                                   \
2693     } else { /* width == 8 */                                         \
2694       top_v = vld1_u8(top_row);                                       \
2695     }                                                                 \
2696                                                                       \
2697     const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);           \
2698                                                                       \
2699     assert(height > 0);                                               \
2700     int y = 0;                                                        \
2701     do {                                                              \
2702       const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);          \
2703       const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);      \
2704                                                                       \
2705       const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);   \
2706       const uint16x8_t weighted_top_bl =                              \
2707           vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);    \
2708       const uint8x8_t pred =                                          \
2709           vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE);    \
2710                                                                       \
2711       if ((W) == 4) {                                                 \
2712         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
2713       } else { /* width == 8 */                                       \
2714         vst1_u8(dst, pred);                                           \
2715       }                                                               \
2716       dst += stride;                                                  \
2717     } while (++y != height);                                          \
2718   }
2719 
2720 SMOOTH_V_PREDICTOR(4)
2721 SMOOTH_V_PREDICTOR(8)
2722 
2723 #undef SMOOTH_V_PREDICTOR
2724 
2725 #define SMOOTH_V_NXM(W, H)                                    \
2726   void aom_smooth_v_predictor_##W##x##H##_neon(               \
2727       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2728       const uint8_t *left) {                                  \
2729     smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
2730   }
2731 
2732 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2733 SMOOTH_V_NXM(4, 16)
2734 SMOOTH_V_NXM(8, 32)
2735 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2736 SMOOTH_V_NXM(4, 4)
2737 SMOOTH_V_NXM(4, 8)
2738 SMOOTH_V_NXM(8, 4)
2739 SMOOTH_V_NXM(8, 8)
2740 SMOOTH_V_NXM(8, 16)
2741 
2742 #undef SMOOTH_V_NXM
2743 
2744 static inline uint8x16_t calculate_vertical_weights_and_pred(
2745     const uint8x16_t top, const uint8x8_t weights_y,
2746     const uint16x8_t weighted_bl) {
2747   const uint16x8_t pred_low =
2748       vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2749   const uint16x8_t pred_high =
2750       vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2751   const uint8x8_t pred_scaled_low =
2752       vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
2753   const uint8x8_t pred_scaled_high =
2754       vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
2755   return vcombine_u8(pred_scaled_low, pred_scaled_high);
2756 }
2757 
2758 // For width 16 and above.
2759 #define SMOOTH_V_PREDICTOR(W)                                            \
2760   static void smooth_v_##W##xh_neon(                                     \
2761       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,      \
2762       const uint8_t *const left_column, const int height) {              \
2763     const uint8_t bottom_left = left_column[height - 1];                 \
2764     const uint8_t *const weights_y = smooth_weights + height - 4;        \
2765                                                                          \
2766     uint8x16_t top_v[4];                                                 \
2767     top_v[0] = vld1q_u8(top_row);                                        \
2768     if ((W) > 16) {                                                      \
2769       top_v[1] = vld1q_u8(top_row + 16);                                 \
2770       if ((W) == 64) {                                                   \
2771         top_v[2] = vld1q_u8(top_row + 32);                               \
2772         top_v[3] = vld1q_u8(top_row + 48);                               \
2773       }                                                                  \
2774     }                                                                    \
2775                                                                          \
2776     const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);              \
2777                                                                          \
2778     assert(height > 0);                                                  \
2779     int y = 0;                                                           \
2780     do {                                                                 \
2781       const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);             \
2782       const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);         \
2783       const uint16x8_t weighted_bl =                                     \
2784           vmull_u8(scaled_weights_y, bottom_left_v);                     \
2785                                                                          \
2786       const uint8x16_t pred_0 = calculate_vertical_weights_and_pred(     \
2787           top_v[0], weights_y_v, weighted_bl);                           \
2788       vst1q_u8(dst, pred_0);                                             \
2789                                                                          \
2790       if ((W) > 16) {                                                    \
2791         const uint8x16_t pred_1 = calculate_vertical_weights_and_pred(   \
2792             top_v[1], weights_y_v, weighted_bl);                         \
2793         vst1q_u8(dst + 16, pred_1);                                      \
2794                                                                          \
2795         if ((W) == 64) {                                                 \
2796           const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
2797               top_v[2], weights_y_v, weighted_bl);                       \
2798           vst1q_u8(dst + 32, pred_2);                                    \
2799                                                                          \
2800           const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
2801               top_v[3], weights_y_v, weighted_bl);                       \
2802           vst1q_u8(dst + 48, pred_3);                                    \
2803         }                                                                \
2804       }                                                                  \
2805                                                                          \
2806       dst += stride;                                                     \
2807     } while (++y != height);                                             \
2808   }
2809 
2810 SMOOTH_V_PREDICTOR(16)
2811 SMOOTH_V_PREDICTOR(32)
2812 SMOOTH_V_PREDICTOR(64)
2813 
2814 #undef SMOOTH_V_PREDICTOR
2815 
2816 #define SMOOTH_V_NXM_WIDE(W, H)                               \
2817   void aom_smooth_v_predictor_##W##x##H##_neon(               \
2818       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2819       const uint8_t *left) {                                  \
2820     smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
2821   }
2822 
2823 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2824 SMOOTH_V_NXM_WIDE(16, 4)
2825 SMOOTH_V_NXM_WIDE(32, 8)
2826 SMOOTH_V_NXM_WIDE(64, 16)
2827 SMOOTH_V_NXM_WIDE(16, 64)
2828 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2829 SMOOTH_V_NXM_WIDE(16, 8)
2830 SMOOTH_V_NXM_WIDE(16, 16)
2831 SMOOTH_V_NXM_WIDE(16, 32)
2832 SMOOTH_V_NXM_WIDE(32, 16)
2833 SMOOTH_V_NXM_WIDE(32, 32)
2834 SMOOTH_V_NXM_WIDE(32, 64)
2835 SMOOTH_V_NXM_WIDE(64, 32)
2836 SMOOTH_V_NXM_WIDE(64, 64)
2837 
2838 #undef SMOOTH_V_NXM_WIDE
2839 
2840 // -----------------------------------------------------------------------------
2841 // SMOOTH_H_PRED
2842 
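// Scalar form of the horizontal-only smooth predictor (cf. the C reference):
//   pred(x, y) = (wx[x] * left[y] + (256 - wx[x]) * top_right + 128) >> 8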
2843 // For widths 4 and 8.
2844 #define SMOOTH_H_PREDICTOR(W)                                               \
2845   static void smooth_h_##W##xh_neon(                                        \
2846       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
2847       const uint8_t *const left_column, const int height) {                 \
2848     const uint8_t top_right = top_row[(W)-1];                               \
2849                                                                             \
2850     const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
2851     /* Over-reads for 4xN but still within the array. */                    \
2852     const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4);            \
2853     const uint8x8_t scaled_weights_x = negate_s8(weights_x);                \
2854     const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
2855                                                                             \
2856     assert(height > 0);                                                     \
2857     int y = 0;                                                              \
2858     do {                                                                    \
2859       const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
2860       const uint16x8_t weighted_left_tr =                                   \
2861           vmlal_u8(weighted_tr, weights_x, left_v);                         \
2862       const uint8x8_t pred =                                                \
2863           vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE);         \
2864                                                                             \
2865       if ((W) == 4) {                                                       \
2866         vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0);       \
2867       } else { /* width == 8 */                                             \
2868         vst1_u8(dst, pred);                                                 \
2869       }                                                                     \
2870       dst += stride;                                                        \
2871     } while (++y != height);                                                \
2872   }
2873 
2874 SMOOTH_H_PREDICTOR(4)
2875 SMOOTH_H_PREDICTOR(8)
2876 
2877 #undef SMOOTH_H_PREDICTOR
2878 
2879 #define SMOOTH_H_NXM(W, H)                                    \
2880   void aom_smooth_h_predictor_##W##x##H##_neon(               \
2881       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2882       const uint8_t *left) {                                  \
2883     smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
2884   }
2885 
2886 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2887 SMOOTH_H_NXM(4, 16)
2888 SMOOTH_H_NXM(8, 32)
2889 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2890 SMOOTH_H_NXM(4, 4)
2891 SMOOTH_H_NXM(4, 8)
2892 SMOOTH_H_NXM(8, 4)
2893 SMOOTH_H_NXM(8, 8)
2894 SMOOTH_H_NXM(8, 16)
2895 
2896 #undef SMOOTH_H_NXM
2897 
2898 static inline uint8x16_t calculate_horizontal_weights_and_pred(
2899     const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
2900     const uint8x16_t scaled_weights_x) {
2901   const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2902   const uint16x8_t weighted_left_tr_low =
2903       vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2904   const uint8x8_t pred_scaled_low =
2905       vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
2906 
2907   const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2908   const uint16x8_t weighted_left_tr_high =
2909       vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2910   const uint8x8_t pred_scaled_high =
2911       vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
2912 
2913   return vcombine_u8(pred_scaled_low, pred_scaled_high);
2914 }
2915 
2916 // For width 16 and above.
2917 #define SMOOTH_H_PREDICTOR(W)                                              \
2918   static void smooth_h_##W##xh_neon(                                       \
2919       uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,        \
2920       const uint8_t *const left_column, const int height) {                \
2921     const uint8_t top_right = top_row[(W)-1];                              \
2922                                                                            \
2923     const uint8x8_t top_right_v = vdup_n_u8(top_right);                    \
2924                                                                            \
2925     uint8x16_t weights_x[4];                                               \
2926     weights_x[0] = vld1q_u8(smooth_weights + (W)-4);                       \
2927     if ((W) > 16) {                                                        \
2928       weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);              \
2929       if ((W) == 64) {                                                     \
2930         weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);            \
2931         weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);            \
2932       }                                                                    \
2933     }                                                                      \
2934                                                                            \
2935     uint8x16_t scaled_weights_x[4];                                        \
2936     scaled_weights_x[0] = negate_s8q(weights_x[0]);                        \
2937     if ((W) > 16) {                                                        \
2938       scaled_weights_x[1] = negate_s8q(weights_x[1]);                      \
2939       if ((W) == 64) {                                                     \
2940         scaled_weights_x[2] = negate_s8q(weights_x[2]);                    \
2941         scaled_weights_x[3] = negate_s8q(weights_x[3]);                    \
2942       }                                                                    \
2943     }                                                                      \
2944                                                                            \
2945     assert(height > 0);                                                    \
2946     int y = 0;                                                             \
2947     do {                                                                   \
2948       const uint8x8_t left_v = vdup_n_u8(left_column[y]);                  \
2949                                                                            \
2950       const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred(     \
2951           left_v, top_right_v, weights_x[0], scaled_weights_x[0]);         \
2952       vst1q_u8(dst, pred_0);                                               \
2953                                                                            \
2954       if ((W) > 16) {                                                      \
2955         const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred(   \
2956             left_v, top_right_v, weights_x[1], scaled_weights_x[1]);       \
2957         vst1q_u8(dst + 16, pred_1);                                        \
2958                                                                            \
2959         if ((W) == 64) {                                                   \
2960           const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
2961               left_v, top_right_v, weights_x[2], scaled_weights_x[2]);     \
2962           vst1q_u8(dst + 32, pred_2);                                      \
2963                                                                            \
2964           const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
2965               left_v, top_right_v, weights_x[3], scaled_weights_x[3]);     \
2966           vst1q_u8(dst + 48, pred_3);                                      \
2967         }                                                                  \
2968       }                                                                    \
2969       dst += stride;                                                       \
2970     } while (++y != height);                                               \
2971   }
2972 
2973 SMOOTH_H_PREDICTOR(16)
2974 SMOOTH_H_PREDICTOR(32)
2975 SMOOTH_H_PREDICTOR(64)
2976 
2977 #undef SMOOTH_H_PREDICTOR
2978 
2979 #define SMOOTH_H_NXM_WIDE(W, H)                               \
2980   void aom_smooth_h_predictor_##W##x##H##_neon(               \
2981       uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2982       const uint8_t *left) {                                  \
2983     smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
2984   }
2985 
2986 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2987 SMOOTH_H_NXM_WIDE(16, 4)
2988 SMOOTH_H_NXM_WIDE(16, 64)
2989 SMOOTH_H_NXM_WIDE(32, 8)
2990 SMOOTH_H_NXM_WIDE(64, 16)
2991 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2992 SMOOTH_H_NXM_WIDE(16, 8)
2993 SMOOTH_H_NXM_WIDE(16, 16)
2994 SMOOTH_H_NXM_WIDE(16, 32)
2995 SMOOTH_H_NXM_WIDE(32, 16)
2996 SMOOTH_H_NXM_WIDE(32, 32)
2997 SMOOTH_H_NXM_WIDE(32, 64)
2998 SMOOTH_H_NXM_WIDE(64, 32)
2999 SMOOTH_H_NXM_WIDE(64, 64)
3000 
3001 #undef SMOOTH_H_NXM_WIDE
3002 
3003 // -----------------------------------------------------------------------------
3004 // PAETH
3005 
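// Scalar form of the Paeth predictor: with base = top[x] + left[y] - top_left,
// pick whichever of left[y], top[x] and top_left is closest to base, breaking
// ties in that order. The distances simplify to
//   |base - left|     == |top[x] - top_left|
//   |base - top|      == |left[y] - top_left|
//   |base - top_left| == |top[x] + left[y] - 2 * top_left|
// which is what the kernels below compute with vabd/vabdq.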
3006 static inline void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
3007                                        const uint8_t *const top_row,
3008                                        const uint8_t *const left_column,
3009                                        int width, int height) {
3010   const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
3011   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
3012   uint8x8_t top;
3013   if (width == 4) {
3014     top = load_u8_4x1(top_row);
3015   } else {  // width == 8
3016     top = vld1_u8(top_row);
3017   }
3018 
3019   assert(height > 0);
3020   int y = 0;
3021   do {
3022     const uint8x8_t left = vdup_n_u8(left_column[y]);
3023 
3024     const uint8x8_t left_dist = vabd_u8(top, top_left);
3025     const uint8x8_t top_dist = vabd_u8(left, top_left);
3026     const uint16x8_t top_left_dist =
3027         vabdq_u16(vaddl_u8(top, left), top_left_x2);
3028 
3029     const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
3030     const uint8x8_t left_le_top_left =
3031         vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
3032     const uint8x8_t top_le_top_left =
3033         vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
3034 
3035     // if (left_dist <= top_dist && left_dist <= top_left_dist)
3036     const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
3037     //   dest[x] = left_column[y];
3038     // Fill all the unused spaces with 'top'. They will be overwritten when
3039     // the positions for top_left are known.
3040     uint8x8_t result = vbsl_u8(left_mask, left, top);
3041     // else if (top_dist <= top_left_dist)
3042     //   dest[x] = top_row[x];
3043     // Add these values to the mask. They were already set.
3044     const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
3045     // else
3046     //   dest[x] = top_left;
3047     result = vbsl_u8(left_or_top_mask, result, top_left);
3048 
3049     if (width == 4) {
3050       store_u8_4x1(dest, result);
3051     } else {  // width == 8
3052       vst1_u8(dest, result);
3053     }
3054     dest += stride;
3055   } while (++y != height);
3056 }
3057 
3058 #define PAETH_NXM(W, H)                                                     \
3059   void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
3060                                             const uint8_t *above,           \
3061                                             const uint8_t *left) {          \
3062     paeth_4or8_x_h_neon(dst, stride, above, left, W, H);                    \
3063   }
3064 
3065 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3066 PAETH_NXM(4, 16)
3067 PAETH_NXM(8, 32)
3068 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3069 PAETH_NXM(4, 4)
3070 PAETH_NXM(4, 8)
3071 PAETH_NXM(8, 4)
3072 PAETH_NXM(8, 8)
3073 PAETH_NXM(8, 16)
3074 
3075 // Calculate X distance <= TopLeft distance and pack the resulting mask into
3076 // a uint8x16_t.
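// x_dist is an 8-bit absolute difference (at most 255), while the TopLeft
// distance can reach 510 and is kept in 16 bits. The saturating narrow clamps
// it to 255, which cannot change the result of the <= comparison: any clamped
// value already exceeds every possible x_dist.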
3077 static inline uint8x16_t x_le_top_left(const uint8x16_t x_dist,
3078                                        const uint16x8_t top_left_dist_low,
3079                                        const uint16x8_t top_left_dist_high) {
3080   const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
3081                                                vqmovn_u16(top_left_dist_high));
3082   return vcleq_u8(x_dist, top_left_dist);
3083 }
3084 
3085 // Select the closest values and collect them.
3086 static inline uint8x16_t select_paeth(const uint8x16_t top,
3087                                       const uint8x16_t left,
3088                                       const uint8x16_t top_left,
3089                                       const uint8x16_t left_le_top,
3090                                       const uint8x16_t left_le_top_left,
3091                                       const uint8x16_t top_le_top_left) {
3092   // if (left_dist <= top_dist && left_dist <= top_left_dist)
3093   const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
3094   //   dest[x] = left_column[y];
3095   // Fill all the unused spaces with 'top'. They will be overwritten when
3096   // the positions for top_left are known.
3097   uint8x16_t result = vbslq_u8(left_mask, left, top);
3098   // else if (top_dist <= top_left_dist)
3099   //   dest[x] = top_row[x];
3100   // Add these values to the mask. They were already set.
3101   const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
3102   // else
3103   //   dest[x] = top_left;
3104   return vbslq_u8(left_or_top_mask, result, top_left);
3105 }
3106 
3107 // Generate numbered and high/low versions of top_left_dist.
3108 #define TOP_LEFT_DIST(num)                                              \
3109   const uint16x8_t top_left_##num##_dist_low = vabdq_u16(               \
3110       vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
3111   const uint16x8_t top_left_##num##_dist_high = vabdq_u16(              \
3112       vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
3113 
3114 // Generate numbered versions of XLeTopLeft with x = left.
3115 #define LEFT_LE_TOP_LEFT(num)                                     \
3116   const uint8x16_t left_le_top_left_##num =                       \
3117       x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
3118                     top_left_##num##_dist_high)
3119 
3120 // Generate numbered versions of XLeTopLeft with x = top.
3121 #define TOP_LE_TOP_LEFT(num)                              \
3122   const uint8x16_t top_le_top_left_##num = x_le_top_left( \
3123       top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
3124 
3125 static inline void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
3126                                          const uint8_t *const top_row,
3127                                          const uint8_t *const left_column,
3128                                          int width, int height) {
3129   const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
3130   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
3131   uint8x16_t top[4];
3132   top[0] = vld1q_u8(top_row);
3133   if (width > 16) {
3134     top[1] = vld1q_u8(top_row + 16);
3135     if (width == 64) {
3136       top[2] = vld1q_u8(top_row + 32);
3137       top[3] = vld1q_u8(top_row + 48);
3138     }
3139   }
3140 
3141   assert(height > 0);
3142   int y = 0;
3143   do {
3144     const uint8x16_t left = vdupq_n_u8(left_column[y]);
3145 
3146     const uint8x16_t top_dist = vabdq_u8(left, top_left);
3147 
3148     const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
3149     TOP_LEFT_DIST(0);
3150     const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
3151     LEFT_LE_TOP_LEFT(0);
3152     TOP_LE_TOP_LEFT(0);
3153 
3154     const uint8x16_t result_0 =
3155         select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
3156                      top_le_top_left_0);
3157     vst1q_u8(dest, result_0);
3158 
3159     if (width > 16) {
3160       const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
3161       TOP_LEFT_DIST(1);
3162       const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
3163       LEFT_LE_TOP_LEFT(1);
3164       TOP_LE_TOP_LEFT(1);
3165 
3166       const uint8x16_t result_1 =
3167           select_paeth(top[1], left, top_left, left_1_le_top,
3168                        left_le_top_left_1, top_le_top_left_1);
3169       vst1q_u8(dest + 16, result_1);
3170 
3171       if (width == 64) {
3172         const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
3173         TOP_LEFT_DIST(2);
3174         const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
3175         LEFT_LE_TOP_LEFT(2);
3176         TOP_LE_TOP_LEFT(2);
3177 
3178         const uint8x16_t result_2 =
3179             select_paeth(top[2], left, top_left, left_2_le_top,
3180                          left_le_top_left_2, top_le_top_left_2);
3181         vst1q_u8(dest + 32, result_2);
3182 
3183         const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
3184         TOP_LEFT_DIST(3);
3185         const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
3186         LEFT_LE_TOP_LEFT(3);
3187         TOP_LE_TOP_LEFT(3);
3188 
3189         const uint8x16_t result_3 =
3190             select_paeth(top[3], left, top_left, left_3_le_top,
3191                          left_le_top_left_3, top_le_top_left_3);
3192         vst1q_u8(dest + 48, result_3);
3193       }
3194     }
3195 
3196     dest += stride;
3197   } while (++y != height);
3198 }
3199 
3200 #define PAETH_NXM_WIDE(W, H)                                                \
3201   void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
3202                                             const uint8_t *above,           \
3203                                             const uint8_t *left) {          \
3204     paeth16_plus_x_h_neon(dst, stride, above, left, W, H);                  \
3205   }
3206 
3207 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3208 PAETH_NXM_WIDE(16, 4)
3209 PAETH_NXM_WIDE(16, 64)
3210 PAETH_NXM_WIDE(32, 8)
3211 PAETH_NXM_WIDE(64, 16)
3212 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3213 PAETH_NXM_WIDE(16, 8)
3214 PAETH_NXM_WIDE(16, 16)
3215 PAETH_NXM_WIDE(16, 32)
3216 PAETH_NXM_WIDE(32, 16)
3217 PAETH_NXM_WIDE(32, 32)
3218 PAETH_NXM_WIDE(32, 64)
3219 PAETH_NXM_WIDE(64, 32)
3220 PAETH_NXM_WIDE(64, 64)
3221