1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13 #include <assert.h>
14 #include <stdint.h>
15
16 #include "config/aom_config.h"
17 #include "config/aom_dsp_rtcd.h"
18 #include "config/av1_rtcd.h"
19
20 #include "aom/aom_integer.h"
21 #include "aom_dsp/arm/mem_neon.h"
22 #include "aom_dsp/arm/reinterpret_neon.h"
23 #include "aom_dsp/arm/sum_neon.h"
24 #include "aom_dsp/arm/transpose_neon.h"
25 #include "aom_dsp/intrapred_common.h"
26
27 //------------------------------------------------------------------------------
28 // DC 4x4
29
30 static inline uint16x8_t dc_load_sum_4(const uint8_t *in) {
31 const uint8x8_t a = load_u8_4x1(in);
32 const uint16x4_t p0 = vpaddl_u8(a);
33 const uint16x4_t p1 = vpadd_u16(p0, p0);
34 return vcombine_u16(p1, vdup_n_u16(0));
35 }
36
37 static inline void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
38 uint8x8_t dc) {
39 for (int i = 0; i < h; ++i) {
40 store_u8_4x1(dst + i * stride, dc);
41 }
42 }
43
44 void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
45 const uint8_t *above, const uint8_t *left) {
46 const uint16x8_t sum_top = dc_load_sum_4(above);
47 const uint16x8_t sum_left = dc_load_sum_4(left);
48 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
49 const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
50 dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
51 }
52
53 void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
54 const uint8_t *above, const uint8_t *left) {
55 const uint16x8_t sum_left = dc_load_sum_4(left);
56 const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
57 (void)above;
58 dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
59 }
60
61 void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
62 const uint8_t *above, const uint8_t *left) {
63 const uint16x8_t sum_top = dc_load_sum_4(above);
64 const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
65 (void)left;
66 dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
67 }
68
69 void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
70 const uint8_t *above, const uint8_t *left) {
71 const uint8x8_t dc0 = vdup_n_u8(0x80);
72 (void)above;
73 (void)left;
74 dc_store_4xh(dst, stride, 4, dc0);
75 }
76
77 //------------------------------------------------------------------------------
78 // DC 8x8
79
80 static inline uint16x8_t dc_load_sum_8(const uint8_t *in) {
81 // This isn't used in the case where we want to load both above and left
82 // vectors, since we want to avoid performing the reduction twice.
83 const uint8x8_t a = vld1_u8(in);
84 const uint16x4_t p0 = vpaddl_u8(a);
85 const uint16x4_t p1 = vpadd_u16(p0, p0);
86 const uint16x4_t p2 = vpadd_u16(p1, p1);
87 return vcombine_u16(p2, vdup_n_u16(0));
88 }
89
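// Reduces all eight u16 lanes of 'a' to their total and broadcasts it, so the
// result can be narrowed or read from any lane. As a rough illustration, an
// input of {1, 2, 3, 4, 5, 6, 7, 8} ends up as 36 in every lane after the
// reduction steps below.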
90 static inline uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
91 #if AOM_ARCH_AARCH64
92 // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
93 // instruction, however the addv instruction is usually slightly more
94 // expensive than a pairwise addition, so the need for immediately
95 // broadcasting the result again seems to negate any benefit.
96 const uint16x8_t b = vpaddq_u16(a, a);
97 const uint16x8_t c = vpaddq_u16(b, b);
98 return vpaddq_u16(c, c);
99 #else
100 const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
101 const uint16x4_t c = vpadd_u16(b, b);
102 const uint16x4_t d = vpadd_u16(c, c);
103 return vcombine_u16(d, d);
104 #endif
105 }
106
107 static inline void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
108 uint8x8_t dc) {
109 for (int i = 0; i < h; ++i) {
110 vst1_u8(dst + i * stride, dc);
111 }
112 }
113
114 void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
115 const uint8_t *above, const uint8_t *left) {
116 const uint8x8_t sum_top = vld1_u8(above);
117 const uint8x8_t sum_left = vld1_u8(left);
118 uint16x8_t sum = vaddl_u8(sum_left, sum_top);
119 sum = horizontal_add_and_broadcast_u16x8(sum);
120 const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
121 dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
122 }
123
124 void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
125 const uint8_t *above, const uint8_t *left) {
126 const uint16x8_t sum_left = dc_load_sum_8(left);
127 const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
128 (void)above;
129 dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
130 }
131
132 void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
133 const uint8_t *above, const uint8_t *left) {
134 const uint16x8_t sum_top = dc_load_sum_8(above);
135 const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
136 (void)left;
137 dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
138 }
139
140 void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
141 const uint8_t *above, const uint8_t *left) {
142 const uint8x8_t dc0 = vdup_n_u8(0x80);
143 (void)above;
144 (void)left;
145 dc_store_8xh(dst, stride, 8, dc0);
146 }
147
148 //------------------------------------------------------------------------------
149 // DC 16x16
150
151 static inline uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
152 const uint8x16_t a = vld1q_u8(in);
153 // delay the remainder of the reduction until
154 // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
155 // than twice in the case we are loading both above and left.
156 return vpaddlq_u8(a);
157 }
158
159 static inline uint16x8_t dc_load_sum_16(const uint8_t *in) {
160 return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
161 }
162
163 static inline void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
164 uint8x16_t dc) {
165 for (int i = 0; i < h; ++i) {
166 vst1q_u8(dst + i * stride, dc);
167 }
168 }
169
170 void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
171 const uint8_t *above, const uint8_t *left) {
172 const uint16x8_t sum_top = dc_load_partial_sum_16(above);
173 const uint16x8_t sum_left = dc_load_partial_sum_16(left);
174 uint16x8_t sum = vaddq_u16(sum_left, sum_top);
175 sum = horizontal_add_and_broadcast_u16x8(sum);
176 const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
177 dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
178 }
179
180 void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
181 const uint8_t *above,
182 const uint8_t *left) {
183 const uint16x8_t sum_left = dc_load_sum_16(left);
184 const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
185 (void)above;
186 dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
187 }
188
189 void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
190 const uint8_t *above,
191 const uint8_t *left) {
192 const uint16x8_t sum_top = dc_load_sum_16(above);
193 const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
194 (void)left;
195 dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
196 }
197
198 void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
199 const uint8_t *above,
200 const uint8_t *left) {
201 const uint8x16_t dc0 = vdupq_n_u8(0x80);
202 (void)above;
203 (void)left;
204 dc_store_16xh(dst, stride, 16, dc0);
205 }
206
207 //------------------------------------------------------------------------------
208 // DC 32x32
209
210 static inline uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
211 const uint8x16_t a0 = vld1q_u8(in);
212 const uint8x16_t a1 = vld1q_u8(in + 16);
213 // delay the remainder of the reduction until
214 // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
215 // than twice in the case we are loading both above and left.
216 return vpadalq_u8(vpaddlq_u8(a0), a1);
217 }
218
219 static inline uint16x8_t dc_load_sum_32(const uint8_t *in) {
220 return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
221 }
222
223 static inline void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
224 uint8x16_t dc) {
225 for (int i = 0; i < h; ++i) {
226 vst1q_u8(dst + i * stride, dc);
227 vst1q_u8(dst + i * stride + 16, dc);
228 }
229 }
230
231 void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
232 const uint8_t *above, const uint8_t *left) {
233 const uint16x8_t sum_top = dc_load_partial_sum_32(above);
234 const uint16x8_t sum_left = dc_load_partial_sum_32(left);
235 uint16x8_t sum = vaddq_u16(sum_left, sum_top);
236 sum = horizontal_add_and_broadcast_u16x8(sum);
237 const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
238 dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
239 }
240
241 void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
242 const uint8_t *above,
243 const uint8_t *left) {
244 const uint16x8_t sum_left = dc_load_sum_32(left);
245 const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
246 (void)above;
247 dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
248 }
249
250 void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
251 const uint8_t *above,
252 const uint8_t *left) {
253 const uint16x8_t sum_top = dc_load_sum_32(above);
254 const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
255 (void)left;
256 dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
257 }
258
259 void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
260 const uint8_t *above,
261 const uint8_t *left) {
262 const uint8x16_t dc0 = vdupq_n_u8(0x80);
263 (void)above;
264 (void)left;
265 dc_store_32xh(dst, stride, 32, dc0);
266 }
267
268 //------------------------------------------------------------------------------
269 // DC 64x64
270
271 static inline uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
272 const uint8x16_t a0 = vld1q_u8(in);
273 const uint8x16_t a1 = vld1q_u8(in + 16);
274 const uint8x16_t a2 = vld1q_u8(in + 32);
275 const uint8x16_t a3 = vld1q_u8(in + 48);
276 const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
277 const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
278 // delay the remainder of the reduction until
279 // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
280 // than twice in the case we are loading both above and left.
281 return vaddq_u16(p01, p23);
282 }
283
284 static inline uint16x8_t dc_load_sum_64(const uint8_t *in) {
285 return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
286 }
287
288 static inline void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
289 uint8x16_t dc) {
290 for (int i = 0; i < h; ++i) {
291 vst1q_u8(dst + i * stride, dc);
292 vst1q_u8(dst + i * stride + 16, dc);
293 vst1q_u8(dst + i * stride + 32, dc);
294 vst1q_u8(dst + i * stride + 48, dc);
295 }
296 }
297
298 void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
299 const uint8_t *above, const uint8_t *left) {
300 const uint16x8_t sum_top = dc_load_partial_sum_64(above);
301 const uint16x8_t sum_left = dc_load_partial_sum_64(left);
302 uint16x8_t sum = vaddq_u16(sum_left, sum_top);
303 sum = horizontal_add_and_broadcast_u16x8(sum);
304 const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
305 dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
306 }
307
308 void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
309 const uint8_t *above,
310 const uint8_t *left) {
311 const uint16x8_t sum_left = dc_load_sum_64(left);
312 const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
313 (void)above;
314 dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
315 }
316
317 void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
318 const uint8_t *above,
319 const uint8_t *left) {
320 const uint16x8_t sum_top = dc_load_sum_64(above);
321 const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
322 (void)left;
323 dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
324 }
325
326 void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
327 const uint8_t *above,
328 const uint8_t *left) {
329 const uint8x16_t dc0 = vdupq_n_u8(0x80);
330 (void)above;
331 (void)left;
332 dc_store_64xh(dst, stride, 64, dc0);
333 }
334
335 //------------------------------------------------------------------------------
336 // DC rectangular cases
337
338 #define DC_MULTIPLIER_1X2 0x5556
339 #define DC_MULTIPLIER_1X4 0x3334
340
341 #define DC_SHIFT2 16
342
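// Division by the non-power-of-two pixel count is done with a fixed-point
// reciprocal: shift out the power-of-two factor first, then multiply by a Q16
// approximation of the remaining reciprocal. Roughly, DC_MULTIPLIER_1X2
// (0x5556) approximates (1/3) << 16 and DC_MULTIPLIER_1X4 (0x3334)
// approximates (1/5) << 16, so divide_using_multiply_shift(n, 2, 0x5556, 16)
// is approximately n / 12.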
343 static inline int divide_using_multiply_shift(int num, int shift1,
344 int multiplier, int shift2) {
345 const int interm = num >> shift1;
346 return interm * multiplier >> shift2;
347 }
348
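// For example, with a 4x8 block, calculate_dc_from_sum(4, 8, sum, 2,
// DC_MULTIPLIER_1X2) evaluates to approximately (sum + 6) / 12, i.e. the
// rounded average of the 4 above and 8 left edge pixels; the assert below
// checks that the result fits in 8 bits.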
349 static inline int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
350 int shift1, int multiplier) {
351 const int expected_dc = divide_using_multiply_shift(
352 sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
353 assert(expected_dc < (1 << 8));
354 return expected_dc;
355 }
356
357 #undef DC_SHIFT2
358
359 void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
360 const uint8_t *above, const uint8_t *left) {
361 uint8x8_t a = load_u8_4x1(above);
362 uint8x8_t l = vld1_u8(left);
363 uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
364 uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
365 dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
366 }
367
368 void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
369 const uint8_t *above, const uint8_t *left) {
370 uint8x8_t a = vld1_u8(above);
371 uint8x8_t l = load_u8_4x1(left);
372 uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
373 uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
374 dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
375 }
376
377 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
378 void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
379 const uint8_t *above, const uint8_t *left) {
380 uint8x8_t a = load_u8_4x1(above);
381 uint8x16_t l = vld1q_u8(left);
382 uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
383 uint32_t sum = horizontal_add_u16x8(sum_al);
384 uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
385 dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
386 }
387
388 void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
389 const uint8_t *above, const uint8_t *left) {
390 uint8x16_t a = vld1q_u8(above);
391 uint8x8_t l = load_u8_4x1(left);
392 uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
393 uint32_t sum = horizontal_add_u16x8(sum_al);
394 uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
395 dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
396 }
397 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
398
399 void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
400 const uint8_t *above, const uint8_t *left) {
401 uint8x8_t a = vld1_u8(above);
402 uint8x16_t l = vld1q_u8(left);
403 uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
404 uint32_t sum = horizontal_add_u16x8(sum_al);
405 uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
406 dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
407 }
408
409 void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
410 const uint8_t *above, const uint8_t *left) {
411 uint8x16_t a = vld1q_u8(above);
412 uint8x8_t l = vld1_u8(left);
413 uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
414 uint32_t sum = horizontal_add_u16x8(sum_al);
415 uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
416 dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
417 }
418
419 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
420 void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
421 const uint8_t *above, const uint8_t *left) {
422 uint8x8_t a = vld1_u8(above);
423 uint16x8_t sum_left = dc_load_partial_sum_32(left);
424 uint16x8_t sum_al = vaddw_u8(sum_left, a);
425 uint32_t sum = horizontal_add_u16x8(sum_al);
426 uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
427 dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
428 }
429
430 void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
431 const uint8_t *above, const uint8_t *left) {
432 uint16x8_t sum_top = dc_load_partial_sum_32(above);
433 uint8x8_t l = vld1_u8(left);
434 uint16x8_t sum_al = vaddw_u8(sum_top, l);
435 uint32_t sum = horizontal_add_u16x8(sum_al);
436 uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
437 dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
438 }
439 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
440
441 void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
442 const uint8_t *above, const uint8_t *left) {
443 uint16x8_t sum_above = dc_load_partial_sum_16(above);
444 uint16x8_t sum_left = dc_load_partial_sum_32(left);
445 uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
446 uint32_t sum = horizontal_add_u16x8(sum_al);
447 uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
448 dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
449 }
450
451 void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
452 const uint8_t *above, const uint8_t *left) {
453 uint16x8_t sum_above = dc_load_partial_sum_32(above);
454 uint16x8_t sum_left = dc_load_partial_sum_16(left);
455 uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
456 uint32_t sum = horizontal_add_u16x8(sum_al);
457 uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
458 dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
459 }
460
461 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
462 void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
463 const uint8_t *above, const uint8_t *left) {
464 uint16x8_t sum_above = dc_load_partial_sum_16(above);
465 uint16x8_t sum_left = dc_load_partial_sum_64(left);
466 uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
467 uint32_t sum = horizontal_add_u16x8(sum_al);
468 uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
469 dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
470 }
471
472 void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
473 const uint8_t *above, const uint8_t *left) {
474 uint16x8_t sum_above = dc_load_partial_sum_64(above);
475 uint16x8_t sum_left = dc_load_partial_sum_16(left);
476 uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
477 uint32_t sum = horizontal_add_u16x8(sum_al);
478 uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
479 dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
480 }
481 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
482
483 void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
484 const uint8_t *above, const uint8_t *left) {
485 uint16x8_t sum_above = dc_load_partial_sum_32(above);
486 uint16x8_t sum_left = dc_load_partial_sum_64(left);
487 uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
488 uint32_t sum = horizontal_add_u16x8(sum_al);
489 uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
490 dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
491 }
492
493 void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
494 const uint8_t *above, const uint8_t *left) {
495 uint16x8_t sum_above = dc_load_partial_sum_64(above);
496 uint16x8_t sum_left = dc_load_partial_sum_32(left);
497 uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
498 uint32_t sum = horizontal_add_u16x8(sum_al);
499 uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
500 dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
501 }
502
503 #undef DC_MULTIPLIER_1X2
504 #undef DC_MULTIPLIER_1X4
505
506 #define DC_PREDICTOR_128(w, h, q) \
507 void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
508 const uint8_t *above, \
509 const uint8_t *left) { \
510 (void)above; \
511 (void)left; \
512 dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80)); \
513 }
514
515 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
516 DC_PREDICTOR_128(4, 16, )
517 DC_PREDICTOR_128(8, 32, )
518 DC_PREDICTOR_128(16, 4, q)
519 DC_PREDICTOR_128(16, 64, q)
520 DC_PREDICTOR_128(32, 8, q)
521 DC_PREDICTOR_128(64, 16, q)
522 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
523 DC_PREDICTOR_128(4, 8, )
524 DC_PREDICTOR_128(8, 4, )
525 DC_PREDICTOR_128(8, 16, )
526 DC_PREDICTOR_128(16, 8, q)
527 DC_PREDICTOR_128(16, 32, q)
528 DC_PREDICTOR_128(32, 16, q)
529 DC_PREDICTOR_128(32, 64, q)
530 DC_PREDICTOR_128(64, 32, q)
531
532 #undef DC_PREDICTOR_128
533
534 #define DC_PREDICTOR_LEFT(w, h, shift, q) \
535 void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
536 const uint8_t *above, \
537 const uint8_t *left) { \
538 (void)above; \
539 const uint16x8_t sum = dc_load_sum_##h(left); \
540 const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
541 dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
542 }
543
544 DC_PREDICTOR_LEFT(4, 8, 3, )
545 DC_PREDICTOR_LEFT(8, 4, 2, )
546 DC_PREDICTOR_LEFT(8, 16, 4, )
547 DC_PREDICTOR_LEFT(16, 8, 3, q)
548 DC_PREDICTOR_LEFT(16, 32, 5, q)
549 DC_PREDICTOR_LEFT(32, 16, 4, q)
550 DC_PREDICTOR_LEFT(32, 64, 6, q)
551 DC_PREDICTOR_LEFT(64, 32, 5, q)
552 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
553 DC_PREDICTOR_LEFT(4, 16, 4, )
554 DC_PREDICTOR_LEFT(16, 4, 2, q)
555 DC_PREDICTOR_LEFT(8, 32, 5, )
556 DC_PREDICTOR_LEFT(32, 8, 3, q)
557 DC_PREDICTOR_LEFT(16, 64, 6, q)
558 DC_PREDICTOR_LEFT(64, 16, 4, q)
559 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
560
561 #undef DC_PREDICTOR_LEFT
562
563 #define DC_PREDICTOR_TOP(w, h, shift, q) \
564 void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
565 const uint8_t *above, \
566 const uint8_t *left) { \
567 (void)left; \
568 const uint16x8_t sum = dc_load_sum_##w(above); \
569 const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
570 dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
571 }
572
573 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
574 DC_PREDICTOR_TOP(8, 32, 3, )
575 DC_PREDICTOR_TOP(4, 16, 2, )
576 DC_PREDICTOR_TOP(16, 4, 4, q)
577 DC_PREDICTOR_TOP(16, 64, 4, q)
578 DC_PREDICTOR_TOP(32, 8, 5, q)
579 DC_PREDICTOR_TOP(64, 16, 6, q)
580 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
581 DC_PREDICTOR_TOP(4, 8, 2, )
582 DC_PREDICTOR_TOP(8, 4, 3, )
583 DC_PREDICTOR_TOP(8, 16, 3, )
584 DC_PREDICTOR_TOP(16, 8, 4, q)
585 DC_PREDICTOR_TOP(16, 32, 4, q)
586 DC_PREDICTOR_TOP(32, 16, 5, q)
587 DC_PREDICTOR_TOP(32, 64, 5, q)
588 DC_PREDICTOR_TOP(64, 32, 6, q)
589
590 #undef DC_PREDICTOR_TOP
591
592 // -----------------------------------------------------------------------------
593
594 static inline void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
595 uint8x8_t d0) {
596 for (int i = 0; i < h; ++i) {
597 store_u8_4x1(dst + i * stride, d0);
598 }
599 }
600
601 static inline void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
602 uint8x8_t d0) {
603 for (int i = 0; i < h; ++i) {
604 vst1_u8(dst + i * stride, d0);
605 }
606 }
607
608 static inline void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
609 uint8x16_t d0) {
610 for (int i = 0; i < h; ++i) {
611 vst1q_u8(dst + i * stride, d0);
612 }
613 }
614
615 static inline void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
616 uint8x16_t d0, uint8x16_t d1) {
617 for (int i = 0; i < h; ++i) {
618 vst1q_u8(dst + 0, d0);
619 vst1q_u8(dst + 16, d1);
620 dst += stride;
621 }
622 }
623
624 static inline void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
625 uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
626 uint8x16_t d3) {
627 for (int i = 0; i < h; ++i) {
628 vst1q_u8(dst + 0, d0);
629 vst1q_u8(dst + 16, d1);
630 vst1q_u8(dst + 32, d2);
631 vst1q_u8(dst + 48, d3);
632 dst += stride;
633 }
634 }
635
636 void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
637 const uint8_t *above, const uint8_t *left) {
638 (void)left;
639 v_store_4xh(dst, stride, 4, load_u8_4x1(above));
640 }
641
642 void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
643 const uint8_t *above, const uint8_t *left) {
644 (void)left;
645 v_store_8xh(dst, stride, 8, vld1_u8(above));
646 }
647
648 void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
649 const uint8_t *above, const uint8_t *left) {
650 (void)left;
651 v_store_16xh(dst, stride, 16, vld1q_u8(above));
652 }
653
654 void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
655 const uint8_t *above, const uint8_t *left) {
656 const uint8x16_t d0 = vld1q_u8(above);
657 const uint8x16_t d1 = vld1q_u8(above + 16);
658 (void)left;
659 v_store_32xh(dst, stride, 32, d0, d1);
660 }
661
662 void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
663 const uint8_t *above, const uint8_t *left) {
664 (void)left;
665 v_store_4xh(dst, stride, 8, load_u8_4x1(above));
666 }
667
668 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
669 void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
670 const uint8_t *above, const uint8_t *left) {
671 (void)left;
672 v_store_4xh(dst, stride, 16, load_u8_4x1(above));
673 }
674 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
675
676 void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
677 const uint8_t *above, const uint8_t *left) {
678 (void)left;
679 v_store_8xh(dst, stride, 4, vld1_u8(above));
680 }
681
682 void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
683 const uint8_t *above, const uint8_t *left) {
684 (void)left;
685 v_store_8xh(dst, stride, 16, vld1_u8(above));
686 }
687
688 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
689 void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
690 const uint8_t *above, const uint8_t *left) {
691 (void)left;
692 v_store_8xh(dst, stride, 32, vld1_u8(above));
693 }
694
695 void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
696 const uint8_t *above, const uint8_t *left) {
697 (void)left;
698 v_store_16xh(dst, stride, 4, vld1q_u8(above));
699 }
700 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
701
702 void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
703 const uint8_t *above, const uint8_t *left) {
704 (void)left;
705 v_store_16xh(dst, stride, 8, vld1q_u8(above));
706 }
707
708 void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
709 const uint8_t *above, const uint8_t *left) {
710 (void)left;
711 v_store_16xh(dst, stride, 32, vld1q_u8(above));
712 }
713
714 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
715 void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
716 const uint8_t *above, const uint8_t *left) {
717 (void)left;
718 v_store_16xh(dst, stride, 64, vld1q_u8(above));
719 }
720
721 void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
722 const uint8_t *above, const uint8_t *left) {
723 const uint8x16_t d0 = vld1q_u8(above);
724 const uint8x16_t d1 = vld1q_u8(above + 16);
725 (void)left;
726 v_store_32xh(dst, stride, 8, d0, d1);
727 }
728 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
729
730 void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
731 const uint8_t *above, const uint8_t *left) {
732 const uint8x16_t d0 = vld1q_u8(above);
733 const uint8x16_t d1 = vld1q_u8(above + 16);
734 (void)left;
735 v_store_32xh(dst, stride, 16, d0, d1);
736 }
737
738 void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
739 const uint8_t *above, const uint8_t *left) {
740 const uint8x16_t d0 = vld1q_u8(above);
741 const uint8x16_t d1 = vld1q_u8(above + 16);
742 (void)left;
743 v_store_32xh(dst, stride, 64, d0, d1);
744 }
745
746 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
747 void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
748 const uint8_t *above, const uint8_t *left) {
749 const uint8x16_t d0 = vld1q_u8(above);
750 const uint8x16_t d1 = vld1q_u8(above + 16);
751 const uint8x16_t d2 = vld1q_u8(above + 32);
752 const uint8x16_t d3 = vld1q_u8(above + 48);
753 (void)left;
754 v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
755 }
756 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
757
758 void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
759 const uint8_t *above, const uint8_t *left) {
760 const uint8x16_t d0 = vld1q_u8(above);
761 const uint8x16_t d1 = vld1q_u8(above + 16);
762 const uint8x16_t d2 = vld1q_u8(above + 32);
763 const uint8x16_t d3 = vld1q_u8(above + 48);
764 (void)left;
765 v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
766 }
767
768 void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
769 const uint8_t *above, const uint8_t *left) {
770 const uint8x16_t d0 = vld1q_u8(above);
771 const uint8x16_t d1 = vld1q_u8(above + 16);
772 const uint8x16_t d2 = vld1q_u8(above + 32);
773 const uint8x16_t d3 = vld1q_u8(above + 48);
774 (void)left;
775 v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
776 }
777
778 // -----------------------------------------------------------------------------
779
780 static inline void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
781 store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
782 store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
783 store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
784 store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
785 store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
786 store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
787 store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
788 store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
789 }
790
791 static inline void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
792 vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
793 vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
794 vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
795 vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
796 vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
797 vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
798 vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
799 vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
800 }
801
802 static inline void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
803 vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
804 vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
805 vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
806 vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
807 vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
808 vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
809 vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
810 vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
811 }
812
813 static inline void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
814 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
815 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
816 dst += stride;
817 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
818 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
819 dst += stride;
820 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
821 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
822 dst += stride;
823 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
824 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
825 dst += stride;
826 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
827 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
828 dst += stride;
829 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
830 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
831 dst += stride;
832 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
833 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
834 dst += stride;
835 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
836 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
837 }
838
839 static inline void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
840 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
841 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
842 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
843 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
844 dst += stride;
845 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
846 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
847 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
848 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
849 dst += stride;
850 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
851 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
852 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
853 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
854 dst += stride;
855 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
856 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
857 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
858 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
859 dst += stride;
860 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
861 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
862 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
863 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
864 dst += stride;
865 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
866 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
867 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
868 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
869 dst += stride;
870 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
871 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
872 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
873 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
874 dst += stride;
875 vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
876 vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
877 vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
878 vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
879 }
880
881 void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
882 const uint8_t *above, const uint8_t *left) {
883 const uint8x8_t d0 = load_u8_4x1(left);
884 (void)above;
885 store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
886 store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
887 store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
888 store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
889 }
890
891 void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
892 const uint8_t *above, const uint8_t *left) {
893 const uint8x8_t d0 = vld1_u8(left);
894 (void)above;
895 h_store_8x8(dst, stride, d0);
896 }
897
898 void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
899 const uint8_t *above, const uint8_t *left) {
900 const uint8x16_t d0 = vld1q_u8(left);
901 (void)above;
902 h_store_16x8(dst, stride, vget_low_u8(d0));
903 h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
904 }
905
906 void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
907 const uint8_t *above, const uint8_t *left) {
908 const uint8x16_t d0 = vld1q_u8(left);
909 const uint8x16_t d1 = vld1q_u8(left + 16);
910 (void)above;
911 h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
912 h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
913 h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
914 h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
915 }
916
917 void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
918 const uint8_t *above, const uint8_t *left) {
919 const uint8x8_t d0 = vld1_u8(left);
920 (void)above;
921 h_store_4x8(dst, stride, d0);
922 }
923
924 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
925 void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
926 const uint8_t *above, const uint8_t *left) {
927 const uint8x16_t d0 = vld1q_u8(left);
928 (void)above;
929 h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
930 h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
931 }
932 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
933
934 void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
935 const uint8_t *above, const uint8_t *left) {
936 const uint8x8_t d0 = load_u8_4x1(left);
937 (void)above;
938 vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
939 vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
940 vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
941 vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
942 }
943
944 void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
945 const uint8_t *above, const uint8_t *left) {
946 const uint8x16_t d0 = vld1q_u8(left);
947 (void)above;
948 h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
949 h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
950 }
951
952 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
953 void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
954 const uint8_t *above, const uint8_t *left) {
955 const uint8x16_t d0 = vld1q_u8(left);
956 const uint8x16_t d1 = vld1q_u8(left + 16);
957 (void)above;
958 h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
959 h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
960 h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
961 h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
962 }
963
964 void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
965 const uint8_t *above, const uint8_t *left) {
966 const uint8x8_t d0 = load_u8_4x1(left);
967 (void)above;
968 vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
969 vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
970 vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
971 vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
972 }
973 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
974
975 void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
976 const uint8_t *above, const uint8_t *left) {
977 const uint8x8_t d0 = vld1_u8(left);
978 (void)above;
979 h_store_16x8(dst, stride, d0);
980 }
981
982 void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
983 const uint8_t *above, const uint8_t *left) {
984 const uint8x16_t d0 = vld1q_u8(left);
985 const uint8x16_t d1 = vld1q_u8(left + 16);
986 (void)above;
987 h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
988 h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
989 h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
990 h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
991 }
992
993 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
994 void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
995 const uint8_t *above, const uint8_t *left) {
996 const uint8x16_t d0 = vld1q_u8(left);
997 const uint8x16_t d1 = vld1q_u8(left + 16);
998 const uint8x16_t d2 = vld1q_u8(left + 32);
999 const uint8x16_t d3 = vld1q_u8(left + 48);
1000 (void)above;
1001 h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
1002 h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
1003 h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
1004 h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
1005 h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
1006 h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
1007 h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
1008 h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
1009 }
1010
1011 void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
1012 const uint8_t *above, const uint8_t *left) {
1013 const uint8x8_t d0 = vld1_u8(left);
1014 (void)above;
1015 h_store_32x8(dst, stride, d0);
1016 }
1017 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1018
1019 void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
1020 const uint8_t *above, const uint8_t *left) {
1021 const uint8x16_t d0 = vld1q_u8(left);
1022 (void)above;
1023 h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
1024 h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
1025 }
1026
1027 void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
1028 const uint8_t *above, const uint8_t *left) {
1029 const uint8x16_t d0 = vld1q_u8(left + 0);
1030 const uint8x16_t d1 = vld1q_u8(left + 16);
1031 const uint8x16_t d2 = vld1q_u8(left + 32);
1032 const uint8x16_t d3 = vld1q_u8(left + 48);
1033 (void)above;
1034 h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
1035 h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
1036 h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
1037 h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
1038 h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
1039 h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
1040 h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
1041 h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
1042 }
1043
1044 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1045 void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
1046 const uint8_t *above, const uint8_t *left) {
1047 const uint8x16_t d0 = vld1q_u8(left);
1048 (void)above;
1049 h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
1050 h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
1051 }
1052 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1053
1054 void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
1055 const uint8_t *above, const uint8_t *left) {
1056 (void)above;
1057 for (int i = 0; i < 2; ++i) {
1058 const uint8x16_t d0 = vld1q_u8(left);
1059 h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
1060 h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
1061 left += 16;
1062 dst += 16 * stride;
1063 }
1064 }
1065
1066 void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
1067 const uint8_t *above, const uint8_t *left) {
1068 (void)above;
1069 for (int i = 0; i < 4; ++i) {
1070 const uint8x16_t d0 = vld1q_u8(left);
1071 h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
1072 h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
1073 left += 16;
1074 dst += 16 * stride;
1075 }
1076 }
1077
1078 /* ---------------------P R E D I C T I O N Z 1--------------------------- */
1079
1080 // Low bit depth functions
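// BaseMask[i] has its first i bytes set to 0xff and the rest zeroed. It is
// used below to select, per output pixel, between the interpolated value and
// the replicated right-most ("max base") reference pixel once the projected
// position runs past the end of the above row.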
1081 static DECLARE_ALIGNED(32, const uint8_t, BaseMask[33][32]) = {
1082 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1083 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1084 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1085 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1086 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1087 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1088 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1089 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1090 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1091 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1092 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1093 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1094 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1095 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1096 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1097 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1098 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
1099 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1100 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
1101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1102 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1105 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1108 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1109 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1111 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1112 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1114 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1115 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
1116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1117 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1118 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
1119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1120 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1121 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
1122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1123 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1124 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
1125 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1126 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1127 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
1128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1129 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1130 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
1131 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1132 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1133 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
1134 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1135 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1136 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1137 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1138 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1139 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1140 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1141 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1142 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1143 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1144 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1145 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1146 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1147 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1148 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1149 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
1150 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1151 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1152 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
1153 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1154 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1155 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
1156 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1157 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1158 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
1159 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1160 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1161 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
1162 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1163 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1164 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
1165 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1166 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1167 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
1168 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1169 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1170 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
1171 };
1172
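// For the non-upsampled path, each row r (fractional position x = (r + 1) * dx)
// is roughly the scalar computation below, vectorised across the row:
//
//   base  = x >> 6;
//   shift = (x & 0x3f) >> 1;  // 0..31
//   dst[r][c] = (base + c < max_base_x)
//                   ? (above[base + c] * (32 - shift) +
//                      above[base + c + 1] * shift + 16) >> 5
//                   : above[max_base_x];
//
// The vector code forms the equivalent a * 32 + 16 + (b - a) * shift and uses
// BaseMask to blend in above[max_base_x] once the row runs off the end.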
1173 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
1174 int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
1175 int dx) {
1176 const int frac_bits = 6 - upsample_above;
1177 const int max_base_x = ((W + H) - 1) << upsample_above;
1178
1179 assert(dx > 0);
1180 // pre-filter above pixels
1181 // store in temp buffers:
1182 // above[x] * 32 + 16
1183 // above[x+1] - above[x]
1184 // final pixels will be calculated as:
1185 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1186
1187 const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);
1188
1189 int x = dx;
1190 for (int r = 0; r < W; r++) {
1191 int base = x >> frac_bits;
1192 int base_max_diff = (max_base_x - base) >> upsample_above;
1193 if (base_max_diff <= 0) {
1194 for (int i = r; i < W; ++i) {
1195 dst[i] = a_mbase_x; // save 4 values
1196 }
1197 return;
1198 }
1199
1200 if (base_max_diff > H) base_max_diff = H;
1201
1202 uint8x8x2_t a01_128;
1203 uint16x8_t shift;
1204 if (upsample_above) {
1205 a01_128 = vld2_u8(above + base);
1206 shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
1207 } else {
1208 a01_128.val[0] = vld1_u8(above + base);
1209 a01_128.val[1] = vld1_u8(above + base + 1);
1210 shift = vdupq_n_u16((x & 0x3f) >> 1);
1211 }
1212 uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
1213 uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
1214 uint16x8_t res = vmlaq_u16(a32, diff, shift);
1215
1216 uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
1217 dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);
1218
1219 x += dx;
1220 }
1221 }
1222
1223 static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1224 const uint8_t *above, int upsample_above,
1225 int dx) {
1226 uint8x8_t dstvec[16];
1227
1228 dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
1229 dx);
1230 for (int i = 0; i < N; i++) {
1231 vst1_lane_u32((uint32_t *)(dst + stride * i),
1232 vreinterpret_u32_u8(dstvec[i]), 0);
1233 }
1234 }
1235
1236 static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1237 const uint8_t *above, int upsample_above,
1238 int dx) {
1239 uint8x8_t dstvec[32];
1240
1241 dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
1242 dx);
1243 for (int i = 0; i < N; i++) {
1244 vst1_u8(dst + stride * i, dstvec[i]);
1245 }
1246 }
1247
1248 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
1249 int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
1250 int dx) {
1251 const int frac_bits = 6 - upsample_above;
1252 const int max_base_x = ((W + H) - 1) << upsample_above;
1253
1254 assert(dx > 0);
1255 // Each output pixel is a two-tap interpolation of two neighbouring
1256 // above samples, computed directly in registers (no temporary buffers):
1257 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1258 // where shift is the fractional part of the projected position along the
1259 // above row, in 1/32 units, and x is the integer part (the variable
1260 // `base` below).
1261
1262 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1263
1264 int x = dx;
1265 for (int r = 0; r < W; r++) {
1266 int base = x >> frac_bits;
1267 int base_max_diff = (max_base_x - base) >> upsample_above;
1268 if (base_max_diff <= 0) {
1269 for (int i = r; i < W; ++i) {
1270 dst[i] = a_mbase_x;  // Fill the remaining rows with the last above sample.
1271 }
1272 return;
1273 }
1274
1275 if (base_max_diff > H) base_max_diff = H;
1276
1277 uint16x8_t shift;
1278 uint8x16_t a0_128, a1_128;
1279 if (upsample_above) {
1280 uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
1281 a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
1282 a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
1283 shift = vdupq_n_u16(x & 0x1f);
1284 } else {
1285 a0_128 = vld1q_u8(above + base);
1286 a1_128 = vld1q_u8(above + base + 1);
1287 shift = vdupq_n_u16((x & 0x3f) >> 1);
1288 }
1289 uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1290 uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1291 uint16x8_t a32_lo =
1292 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1293 uint16x8_t a32_hi =
1294 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1295 uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1296 uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1297 uint8x16_t v_temp =
1298 vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1299
1300 uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
1301 dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);
1302
1303 x += dx;
1304 }
1305 }
1306
1307 static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1308 const uint8_t *above, int upsample_above,
1309 int dx) {
1310 uint8x16_t dstvec[64];
1311
1312 dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
1313 for (int i = 0; i < N; i++) {
1314 vst1q_u8(dst + stride * i, dstvec[i]);
1315 }
1316 }
1317
1318 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
1319 int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
1320 const int frac_bits = 6;
1321 const int max_base_x = ((32 + N) - 1);
1322
1323 // Each output pixel is a two-tap interpolation of two neighbouring
1324 // above samples, computed directly in registers (no temporary buffers):
1325 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1326 // where shift is the fractional part of the projected position along the
1327 // above row, in 1/32 units, and x is the integer part (the variable
1328 // `base` below).
1329
1330 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1331
1332 int x = dx;
1333 for (int r = 0; r < N; r++) {
1334 int base = x >> frac_bits;
1335 int base_max_diff = (max_base_x - base);
1336 if (base_max_diff <= 0) {
1337 for (int i = r; i < N; ++i) {
1338 dstvec[i].val[0] = a_mbase_x; // save 32 values
1339 dstvec[i].val[1] = a_mbase_x;
1340 }
1341 return;
1342 }
1343 if (base_max_diff > 32) base_max_diff = 32;
1344
1345 uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1346
1347 uint8x16_t res16[2];
1348 for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
1349 int mdiff = base_max_diff - j;
1350 if (mdiff <= 0) {
1351 res16[jj] = a_mbase_x;
1352 } else {
1353 uint8x16_t a0_128 = vld1q_u8(above + base + j);
1354 uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
1355 uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1356 uint16x8_t diff_hi =
1357 vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1358 uint16x8_t a32_lo =
1359 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1360 uint16x8_t a32_hi =
1361 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1362 uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1363 uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1364
1365 res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
1366 }
1367 }
1368
1369 uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
1370 uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
1371 dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
1372 dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
1373 x += dx;
1374 }
1375 }
1376
1377 static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1378 const uint8_t *above, int dx) {
1379 uint8x16x2_t dstvec[64];
1380
1381 dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
1382 for (int i = 0; i < N; i++) {
1383 vst1q_u8(dst + stride * i, dstvec[i].val[0]);
1384 vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
1385 }
1386 }
1387
1388 // clang-format off
1389 static const uint8_t kLoadMaxShuffles[] = {
1390 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1391 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1392 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1393 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1394 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1395 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1396 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1397 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1398 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
1399 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
1400 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
1401 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
1402 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
1403 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15,
1404 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15,
1405 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1406 };
1407 // clang-format on
1408
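// Row i of kLoadMaxShuffles holds { 15 - i, 16 - i, ..., 15, 15, ... }, an
// ascending run of byte indices that saturates at lane 15.
// z1_load_masked_neon() loads the 16 bytes ending at above[max_base_x] and
// permutes them with row shuffle_idx = max_base_x - base, which is equivalent
// to loading from above[base] with every element past max_base_x clamped to
// above[max_base_x], but without reading beyond the end of the above buffer.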
1409 static inline uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
1410 int shuffle_idx) {
1411 uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
1412 uint8x16_t src = vld1q_u8(ptr);
1413 #if AOM_ARCH_AARCH64
1414 return vqtbl1q_u8(src, shuffle);
1415 #else
1416 uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
1417 uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
1418 uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
1419 return vcombine_u8(lo, hi);
1420 #endif
1421 }
1422
1423 static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
1424 const uint8_t *above, int dx) {
1425 const int frac_bits = 6;
1426 const int max_base_x = ((64 + N) - 1);
1427
1428 // Each output pixel is a two-tap interpolation of two neighbouring
1429 // above samples, computed directly in registers (no temporary buffers):
1430 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1431 // where shift is the fractional part of the projected position along the
1432 // above row, in 1/32 units, and x is the integer part (the variable
1433 // `base` below).
1434
1435 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
1436
1437 int x = dx;
1438 for (int r = 0; r < N; r++, dst += stride) {
1439 int base = x >> frac_bits;
1440 if (base >= max_base_x) {
1441 for (int i = r; i < N; ++i) {
1442 vst1q_u8(dst, a_mbase_x);
1443 vst1q_u8(dst + 16, a_mbase_x);
1444 vst1q_u8(dst + 32, a_mbase_x);
1445 vst1q_u8(dst + 48, a_mbase_x);
1446 dst += stride;
1447 }
1448 return;
1449 }
1450
1451 uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
1452 uint8x16_t base_inc128 =
1453 vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
1454 vcreate_u8(0x0F0E0D0C0B0A0908)));
1455
1456 for (int j = 0; j < 64; j += 16) {
1457 if (base + j >= max_base_x) {
1458 vst1q_u8(dst + j, a_mbase_x);
1459 } else {
1460 uint8x16_t a0_128;
1461 uint8x16_t a1_128;
1462 if (base + j + 15 >= max_base_x) {
1463 int shuffle_idx = max_base_x - base - j;
1464 a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1465 } else {
1466 a0_128 = vld1q_u8(above + base + j);
1467 }
1468 if (base + j + 16 >= max_base_x) {
1469 int shuffle_idx = max_base_x - base - j - 1;
1470 a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
1471 } else {
1472 a1_128 = vld1q_u8(above + base + j + 1);
1473 }
1474
1475 uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
1476 uint16x8_t diff_hi =
1477 vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
1478 uint16x8_t a32_lo =
1479 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
1480 uint16x8_t a32_hi =
1481 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
1482 uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
1483 uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
1484 vst1q_u8(dst + j,
1485 vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));
1486
1487 base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
1488 }
1489 }
1490 x += dx;
1491 }
1492 }
1493
1494 // Directional prediction, zone 1: 0 < angle < 90
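// bw selects the specialised row kernel and bh gives the number of rows. dx
// is the per-row horizontal step, interpreted with (6 - upsample_above)
// fractional bits; dy is unused in zone 1.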
1495 void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
1496 const uint8_t *above, const uint8_t *left,
1497 int upsample_above, int dx, int dy) {
1498 (void)left;
1499 (void)dy;
1500
1501 switch (bw) {
1502 case 4:
1503 dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
1504 break;
1505 case 8:
1506 dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
1507 break;
1508 case 16:
1509 dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
1510 break;
1511 case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
1512 case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
1513 default: break;
1514 }
1515 }
1516
1517 /* ---------------------P R E D I C T I O N Z 2--------------------------- */
1518
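// In zone 2 each output row mixes two sources: columns whose projection lands
// at or beyond min_base_x are interpolated from the above row, the remaining
// columns are interpolated from the left column, and the two partial rows are
// blended with BaseMask[base_min_diff]. The *_above_neon and *_left_neon
// helpers below each compute one such partial row.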
1519 // TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
1520 #if AOM_ARCH_AARCH64
1521 #if !AOM_ARCH_AARCH64
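// LoadMaskz2 is only needed by the Armv7 (!AOM_ARCH_AARCH64) gather fallback
// in dr_prediction_z2_NxW_left_neon. While the enclosing AOM_ARCH_AARCH64
// guard above remains in place (see the TODO), this table is compiled out on
// all targets.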
1522 static DECLARE_ALIGNED(16, const uint8_t, LoadMaskz2[4][16]) = {
1523 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1524 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
1525 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
1526 0, 0, 0 },
1527 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
1528 0xff, 0xff, 0xff, 0xff }
1529 };
1530 #endif // !AOM_ARCH_AARCH64
1531
1532 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
1533 const uint8_t *above, int upsample_above, int dx, int base_x, int y,
1534 uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
1535 uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
1536 uint16x4_t ydx = vdup_n_u16(y * dx);
1537 if (upsample_above) {
1538 // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
1539 // only load either 16 or 32.
1540 uint8x8_t v_tmp = vld1_u8(above + base_x);
1541 *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
1542 *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
1543 *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
1544 } else {
1545 *a0_x = load_unaligned_u8_4x1(above + base_x);
1546 *a1_x = load_unaligned_u8_4x1(above + base_x + 1);
1547 *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
1548 }
1549 }
1550
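// Computes the left-column contribution for one row of an Nx4 zone 2 block.
// The per-lane index base_y = (r * 64 - c * dy) >> frac_bits_y is clamped to
// min_base_y. On AArch64 the required left samples are preloaded into TBL
// registers (left_vals), so the gather becomes a single table lookup; the
// Armv7 path loads each lane individually.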
1551 static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
1552 #if AOM_ARCH_AARCH64
1553 uint8x16x2_t left_vals,
1554 #else
1555 const uint8_t *left,
1556 #endif
1557 int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
1558 uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
1559 int16x4_t dy64 = vdup_n_s16(dy);
1560 int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
1561 int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
1562 int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
1563 int16x4_t v_r6 = vdup_n_s16(r << 6);
1564 int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
1565 int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
1566
1567 // Values in base_y_c64 range from -2 through 14 inclusive.
1568 base_y_c64 = vmax_s16(base_y_c64, min_base_y64);
1569
1570 #if AOM_ARCH_AARCH64
1571 uint8x8_t left_idx0 =
1572 vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16]
1573 uint8x8_t left_idx1 =
1574 vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17]
1575
1576 *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
1577 *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
1578 #else // !AOM_ARCH_AARCH64
1579 DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
1580
1581 vst1_s16(base_y_c, base_y_c64);
1582 uint8x8_t a0_y_u8 = vdup_n_u8(0);
1583 a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
1584 a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
1585 a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
1586 a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);
1587
1588 base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
1589 vst1_s16(base_y_c, base_y_c64);
1590 uint8x8_t a1_y_u8 = vdup_n_u8(0);
1591 a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
1592 a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
1593 a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
1594 a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);
1595
1596 *a0_y = vreinterpret_u16_u8(a0_y_u8);
1597 *a1_y = vreinterpret_u16_u8(a1_y_u8);
1598 #endif // AOM_ARCH_AARCH64
1599
1600 if (upsample_left) {
1601 *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
1602 } else {
1603 *shift1 =
1604 vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
1605 }
1606 }
1607
1608 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
1609 const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
1610 uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1611 vcreate_u16(0x0008000700060005));
1612 uint16x8_t ydx = vdupq_n_u16(y * dx);
1613 uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
1614
1615 uint16x8_t shift0;
1616 uint8x8_t a0_x0;
1617 uint8x8_t a1_x0;
1618 if (upsample_above) {
1619 uint8x8x2_t v_tmp = vld2_u8(above + base_x);
1620 a0_x0 = v_tmp.val[0];
1621 a1_x0 = v_tmp.val[1];
1622 shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1623 } else {
1624 a0_x0 = vld1_u8(above + base_x);
1625 a1_x0 = vld1_u8(above + base_x + 1);
1626 shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
1627 }
1628
1629 uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x]
1630 uint16x8_t a32 =
1631 vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16
1632 uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
1633 return vshrn_n_u16(res, 5);
1634 }
1635
1636 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
1637 #if AOM_ARCH_AARCH64
1638 uint8x16x3_t left_vals,
1639 #else
1640 const uint8_t *left,
1641 #endif
1642 int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
1643 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1644 int16x8_t dy128 = vdupq_n_s16(dy);
1645 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
1646 int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
1647
1648 uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
1649 vcreate_u16(0x0008000700060005));
1650 int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
1651 int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
1652
1653 // Values in base_y_c128 range from -2 through 31 inclusive.
1654 base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);
1655
1656 #if AOM_ARCH_AARCH64
1657 uint8x16_t left_idx0 =
1658 vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33]
1659 uint8x16_t left_idx1 =
1660 vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34]
1661 uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1662
1663 uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
1664 uint8x8_t a0_x1 = vget_low_u8(a01_x);
1665 uint8x8_t a1_x1 = vget_high_u8(a01_x);
1666 #else // !AOM_ARCH_AARCH64
1667 uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
1668 uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
1669 #endif // AOM_ARCH_AARCH64
1670
1671 uint16x8_t shift1;
1672 if (upsample_left) {
1673 shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
1674 } else {
1675 shift1 = vshrq_n_u16(
1676 vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
1677 }
1678
1679 uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
1680 uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
1681 uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
1682 return vshrn_n_u16(res, 5);
1683 }
1684
1685 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
1686 const uint8_t *above, int dx, int base_x, int y, int j) {
1687 uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1688 vcreate_u16(0x0007000600050004)),
1689 vcombine_u16(vcreate_u16(0x000B000A00090008),
1690 vcreate_u16(0x000F000E000D000C)) } };
1691 uint16x8_t j256 = vdupq_n_u16(j);
1692 uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
1693
1694 const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
1695 const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
1696 uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
1697 uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
1698 uint16x8_t shift0 =
1699 vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
1700 uint16x8_t shift1 =
1701 vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
1702 // a[x+1] - a[x]
1703 uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
1704 uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
1705 // a[x] * 32 + 16
1706 uint16x8_t a32_0 =
1707 vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
1708 uint16x8_t a32_1 =
1709 vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
1710 uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
1711 uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
1712 return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1713 }
1714
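// Left-column contribution for 16 output columns of a wide (>= 16 columns)
// zone 2 block. On AArch64 the first 64 left samples are preloaded into two
// sets of four TBL registers (left_vals0 covers left[-1..62], left_vals1
// covers left[0..63]) so a0/a1 come from two table lookups. The Armv7 path
// uses a single 16-byte load plus VTBL when the wanted samples span fewer
// than 16 bytes (offset_diff < 16), and per-lane gathers otherwise.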
1715 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
1716 #if AOM_ARCH_AARCH64
1717 uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
1718 #else
1719 const uint8_t *left,
1720 #endif
1721 int dy, int r, int j) {
1722 // here upsample_above and upsample_left are 0 by design of
1723 // av1_use_intra_edge_upsample
1724 const int min_base_y = -1;
1725
1726 int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
1727 int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
1728 int16x8_t dy256 = vdupq_n_s16(dy);
1729 uint16x8_t j256 = vdupq_n_u16(j);
1730
1731 uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
1732 vcreate_u16(0x0007000600050004)),
1733 vcombine_u16(vcreate_u16(0x000B000A00090008),
1734 vcreate_u16(0x000F000E000D000C)) } };
1735 uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
1736 vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };
1737
1738 int16x8_t v_r6 = vdupq_n_s16(r << 6);
1739
1740 int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
1741 int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
1742 int16x8_t mul16_lo = vreinterpretq_s16_u16(
1743 vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
1744 vreinterpretq_u16_s16(half_min_base_y256)));
1745 int16x8_t mul16_hi = vreinterpretq_s16_u16(
1746 vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
1747 vreinterpretq_u16_s16(half_min_base_y256)));
1748 int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
1749 int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);
1750
1751 int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
1752 int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);
1753
1754 base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
1755 base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);
1756
1757 #if !AOM_ARCH_AARCH64
1758 int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
1759 int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
1760 int16_t offset_diff = max_y - min_y;
1761
1762 uint8x8_t a0_y0;
1763 uint8x8_t a0_y1;
1764 uint8x8_t a1_y0;
1765 uint8x8_t a1_y1;
1766 if (offset_diff < 16) {
1767 // Avoid gathers where the data we want is close together in memory.
1768 // We don't need this for AArch64 since we can already use TBL to cover the
1769 // full range of possible values.
1770 assert(offset_diff >= 0);
1771 int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);
1772
1773 int16x8x2_t base_y_offset;
1774 base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
1775 base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);
1776
1777 int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
1778 vqmovn_s16(base_y_offset.val[1]));
1779
1780 uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
1781 uint8x16_t a0_y128 = vld1q_u8(left + min_y);
1782 uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
1783 a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
1784 a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
1785
1786 uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
1787 uint8x8_t v_index_high =
1788 vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
1789 uint8x8x2_t v_tmp, v_res;
1790 v_tmp.val[0] = vget_low_u8(a0_y128);
1791 v_tmp.val[1] = vget_high_u8(a0_y128);
1792 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1793 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1794 a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1795 v_tmp.val[0] = vget_low_u8(a1_y128);
1796 v_tmp.val[1] = vget_high_u8(a1_y128);
1797 v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
1798 v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
1799 a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
1800
1801 a0_y0 = vget_low_u8(a0_y128);
1802 a0_y1 = vget_high_u8(a0_y128);
1803 a1_y0 = vget_low_u8(a1_y128);
1804 a1_y1 = vget_high_u8(a1_y128);
1805 } else {
1806 a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
1807 a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
1808 a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
1809 a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
1810 }
1811 #else
1812 // Values in left_idx{0,1} range from 0 through 63 inclusive.
1813 uint8x16_t left_idx0 =
1814 vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
1815 uint8x16_t left_idx1 =
1816 vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
1817 uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
1818
1819 uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
1820 uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
1821
1822 uint8x8_t a0_y0 = vget_low_u8(a0_y01);
1823 uint8x8_t a0_y1 = vget_high_u8(a0_y01);
1824 uint8x8_t a1_y0 = vget_low_u8(a1_y01);
1825 uint8x8_t a1_y1 = vget_high_u8(a1_y01);
1826 #endif // !AOM_ARCH_AARCH64
1827
1828 uint16x8_t shifty_lo = vshrq_n_u16(
1829 vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
1830 uint16x8_t shifty_hi = vshrq_n_u16(
1831 vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);
1832
1833 // a[x+1] - a[x]
1834 uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
1835 uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
1836 // a[x] * 32 + 16
1837 uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
1838 uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));
1839
1840 uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
1841 uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);
1842
1843 return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
1844 }
1845
1846 static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
1847 const uint8_t *above, const uint8_t *left,
1848 int upsample_above, int upsample_left,
1849 int dx, int dy) {
1850 const int min_base_x = -(1 << upsample_above);
1851 const int min_base_y = -(1 << upsample_left);
1852 const int frac_bits_x = 6 - upsample_above;
1853 const int frac_bits_y = 6 - upsample_left;
1854
1855 assert(dx > 0);
1856 // Each output pixel is a two-tap interpolation of two neighbouring
1857 // reference samples taken from either the above row or the left column,
1858 // computed directly in registers (no temporary buffers):
1859 // (ref[x] * 32 + 16 + (ref[x+1] - ref[x]) * shift) >> 5
1860 // where shift is the fractional part of the projected position in 1/32
1861 // units.
1862
1863 #if AOM_ARCH_AARCH64
1864 // Use ext rather than loading left + 14 directly to avoid over-read.
1865 const uint8x16_t left_m2 = vld1q_u8(left - 2);
1866 const uint8x16_t left_0 = vld1q_u8(left);
1867 const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
1868 const uint8x16x2_t left_vals = { { left_m2, left_14 } };
1869 #define LEFT left_vals
1870 #else // !AOM_ARCH_AARCH64
1871 #define LEFT left
1872 #endif // AOM_ARCH_AARCH64
1873
1874 for (int r = 0; r < N; r++) {
1875 int y = r + 1;
1876 int base_x = (-y * dx) >> frac_bits_x;
1877 const int base_min_diff =
1878 (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
1879 upsample_above;
1880
1881 if (base_min_diff <= 0) {
1882 uint8x8_t a0_x_u8, a1_x_u8;
1883 uint16x4_t shift0;
1884 dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1885 &a0_x_u8, &a1_x_u8, &shift0);
1886 uint8x8_t a0_x = a0_x_u8;
1887 uint8x8_t a1_x = a1_x_u8;
1888
1889 uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x]
1890 uint16x8_t a32 =
1891 vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16
1892 uint16x8_t res =
1893 vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
1894 uint8x8_t resx = vshrn_n_u16(res, 5);
1895 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
1896 } else if (base_min_diff < 4) {
1897 uint8x8_t a0_x_u8, a1_x_u8;
1898 uint16x4_t shift0;
1899 dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
1900 &a0_x_u8, &a1_x_u8, &shift0);
1901 uint16x8_t a0_x = vmovl_u8(a0_x_u8);
1902 uint16x8_t a1_x = vmovl_u8(a1_x_u8);
1903
1904 uint16x4_t a0_y;
1905 uint16x4_t a1_y;
1906 uint16x4_t shift1;
1907 dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1908 frac_bits_y, &a0_y, &a1_y, &shift1);
1909 a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
1910 a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);
1911
1912 uint16x8_t shift = vcombine_u16(shift0, shift1);
1913 uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x]
1914 uint16x8_t a32 =
1915 vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16
1916 uint16x8_t res = vmlaq_u16(a32, diff, shift);
1917 uint8x8_t resx = vshrn_n_u16(res, 5);
1918 uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);
1919
1920 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1921 uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
1922 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
1923 } else {
1924 uint16x4_t a0_y, a1_y;
1925 uint16x4_t shift1;
1926 dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
1927 frac_bits_y, &a0_y, &a1_y, &shift1);
1928 uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x]
1929 uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16
1930 uint16x4_t res = vmla_u16(a32, diff, shift1);
1931 uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);
1932
1933 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
1934 }
1935
1936 dst += stride;
1937 }
1938 #undef LEFT
1939 }
1940
1941 static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
1942 const uint8_t *above, const uint8_t *left,
1943 int upsample_above, int upsample_left,
1944 int dx, int dy) {
1945 const int min_base_x = -(1 << upsample_above);
1946 const int min_base_y = -(1 << upsample_left);
1947 const int frac_bits_x = 6 - upsample_above;
1948 const int frac_bits_y = 6 - upsample_left;
1949
1950 // Each output pixel is a two-tap interpolation of two neighbouring
1951 // reference samples taken from either the above row or the left column,
1952 // computed directly in registers (no temporary buffers):
1953 // (ref[x] * 32 + 16 + (ref[x+1] - ref[x]) * shift) >> 5
1954 // where shift is the fractional part of the projected position in 1/32
1955 // units.
1956
1957 #if AOM_ARCH_AARCH64
1958 // Use ext rather than loading left + 30 directly to avoid over-read.
1959 const uint8x16_t left_m2 = vld1q_u8(left - 2);
1960 const uint8x16_t left_0 = vld1q_u8(left + 0);
1961 const uint8x16_t left_16 = vld1q_u8(left + 16);
1962 const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
1963 const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
1964 const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
1965 #define LEFT left_vals
1966 #else // !AOM_ARCH_AARCH64
1967 #define LEFT left
1968 #endif // AOM_ARCH_AARCH64
1969
1970 for (int r = 0; r < N; r++) {
1971 int y = r + 1;
1972 int base_x = (-y * dx) >> frac_bits_x;
1973 int base_min_diff =
1974 (min_base_x - base_x + upsample_above) >> upsample_above;
1975
1976 if (base_min_diff <= 0) {
1977 uint8x8_t resx =
1978 dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1979 vst1_u8(dst, resx);
1980 } else if (base_min_diff < 8) {
1981 uint8x8_t resx =
1982 dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
1983 uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1984 LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1985 uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
1986 uint8x8_t resxy = vbsl_u8(mask, resy, resx);
1987 vst1_u8(dst, resxy);
1988 } else {
1989 uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
1990 LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
1991 vst1_u8(dst, resy);
1992 }
1993
1994 dst += stride;
1995 }
1996 #undef LEFT
1997 }
1998
1999 static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
2000 ptrdiff_t stride, const uint8_t *above,
2001 const uint8_t *left, int dx, int dy) {
2002 // here upsample_above and upsample_left are 0 by design of
2003 // av1_use_intra_edge_upsample
2004 const int min_base_x = -1;
2005
2006 #if AOM_ARCH_AARCH64
2007 const uint8x16_t left_m1 = vld1q_u8(left - 1);
2008 const uint8x16_t left_0 = vld1q_u8(left + 0);
2009 const uint8x16_t left_16 = vld1q_u8(left + 16);
2010 const uint8x16_t left_32 = vld1q_u8(left + 32);
2011 const uint8x16_t left_48 = vld1q_u8(left + 48);
2012 const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
2013 const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
2014 const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
2015 const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
2016 const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
2017 #define LEFT left_vals0, left_vals1
2018 #else // !AOM_ARCH_AARCH64
2019 #define LEFT left
2020 #endif // AOM_ARCH_AARCH64
2021
2022 for (int r = 0; r < H; r++) {
2023 int y = r + 1;
2024 int base_x = (-y * dx) >> 6;
2025 for (int j = 0; j < W; j += 16) {
2026 const int base_min_diff = min_base_x - base_x - j;
2027
2028 if (base_min_diff <= 0) {
2029 uint8x16_t resx =
2030 dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2031 vst1q_u8(dst + j, resx);
2032 } else if (base_min_diff < 16) {
2033 uint8x16_t resx =
2034 dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
2035 uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2036 uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
2037 uint8x16_t resxy = vbslq_u8(mask, resy, resx);
2038 vst1q_u8(dst + j, resxy);
2039 } else {
2040 uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
2041 vst1q_u8(dst + j, resy);
2042 }
2043 } // for j
2044 dst += stride;
2045 }
2046 #undef LEFT
2047 }
2048
2049 // Directional prediction, zone 2: 90 < angle < 180
2050 void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2051 const uint8_t *above, const uint8_t *left,
2052 int upsample_above, int upsample_left, int dx,
2053 int dy) {
2054 assert(dx > 0);
2055 assert(dy > 0);
2056
2057 switch (bw) {
2058 case 4:
2059 dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
2060 upsample_left, dx, dy);
2061 break;
2062 case 8:
2063 dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
2064 upsample_left, dx, dy);
2065 break;
2066 default:
2067 dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
2068 break;
2069 }
2070 }
2071 #endif // AOM_ARCH_AARCH64
2072
2073 /* ---------------------P R E D I C T I O N Z 3--------------------------- */
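// Zone 3 (180 < angle < 270) is implemented by running the zone 1 kernels
// along the left column (with dy taking the role of dx) and then transposing
// the result into the destination block.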
2074 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2075 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
2076 uint8x16x2_t *d) {
2077 uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
2078 uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
2079
2080 d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
2081 vreinterpretq_u16_u8(w1.val[0])));
2082 d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
2083 vreinterpretq_u16_u8(w1.val[1])));
2084 }
2085 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2086
2087 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
2088 uint8x8x2_t *d) {
2089 uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2090 uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2091
2092 *d = aom_reinterpret_u8_u16_x2(
2093 vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2094 }
2095
2096 static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
2097 uint8x8x2_t *d) {
2098 uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
2099 uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
2100
2101 d[0] = aom_reinterpret_u8_u16_x2(
2102 vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
2103 d[1] = aom_reinterpret_u8_u16_x2(
2104 vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
2105 }
2106
2107 static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
2108 uint8_t *dst, ptrdiff_t pitchDst) {
2109 // The same as the normal transposes in transpose_neon.h, but with a stride
2110 // between consecutive vectors of elements.
2111 uint8x16_t r[16];
2112 uint8x16_t d[16];
2113 for (int i = 0; i < 16; i++) {
2114 r[i] = vld1q_u8(src + i * pitchSrc);
2115 }
2116 transpose_arrays_u8_16x16(r, d);
2117 for (int i = 0; i < 16; i++) {
2118 vst1q_u8(dst + i * pitchDst, d[i]);
2119 }
2120 }
2121
2122 static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
2123 ptrdiff_t pitchSrc, uint8_t *dst,
2124 ptrdiff_t pitchDst, int width,
2125 int height) {
2126 for (int j = 0; j < height; j += 16) {
2127 for (int i = 0; i < width; i += 16) {
2128 z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
2129 dst + j * pitchDst + i, pitchDst);
2130 }
2131 }
2132 }
2133
2134 static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
2135 const uint8_t *left, int upsample_left,
2136 int dy) {
2137 uint8x8_t dstvec[4];
2138 uint8x8x2_t dest;
2139
2140 dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
2141 z3_transpose_arrays_u8_4x4(dstvec, &dest);
2142 store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
2143 store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
2144 }
2145
2146 static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
2147 const uint8_t *left, int upsample_left,
2148 int dy) {
2149 uint8x8_t dstvec[8];
2150 uint8x8_t d[8];
2151
2152 dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
2153 transpose_arrays_u8_8x8(dstvec, d);
2154 store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
2155 }
2156
2157 static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
2158 const uint8_t *left, int upsample_left,
2159 int dy) {
2160 uint8x8_t dstvec[4];
2161 uint8x8x2_t d[2];
2162
2163 dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
2164 z3_transpose_arrays_u8_8x4(dstvec, d);
2165 store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
2166 store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
2167 store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
2168 store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
2169 }
2170
2171 static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
2172 const uint8_t *left, int upsample_left,
2173 int dy) {
2174 uint8x8_t dstvec[8];
2175 uint8x8_t d[8];
2176
2177 dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
2178 transpose_arrays_u8_8x8(dstvec, d);
2179 store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
2180 }
2181
2182 static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
2183 const uint8_t *left, int upsample_left,
2184 int dy) {
2185 uint8x16_t dstvec[8];
2186 uint8x8_t d[16];
2187
2188 dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
2189 transpose_arrays_u8_16x8(dstvec, d);
2190 for (int i = 0; i < 16; i++) {
2191 vst1_u8(dst + i * stride, d[i]);
2192 }
2193 }
2194
2195 static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
2196 const uint8_t *left, int upsample_left,
2197 int dy) {
2198 uint8x8_t dstvec[16];
2199 uint8x16_t d[8];
2200
2201 dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
2202 transpose_arrays_u8_8x16(dstvec, d);
2203 for (int i = 0; i < 8; i++) {
2204 vst1q_u8(dst + i * stride, d[i]);
2205 }
2206 }
2207
2208 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2209 static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
2210 const uint8_t *left, int upsample_left,
2211 int dy) {
2212 uint8x16_t dstvec[4];
2213 uint8x16x2_t d[2];
2214
2215 dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
2216 z3_transpose_arrays_u8_16x4(dstvec, d);
2217 store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
2218 store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
2219 store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
2220 store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
2221 }
2222
2223 static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
2224 const uint8_t *left, int upsample_left,
2225 int dy) {
2226 uint8x8_t dstvec[16];
2227 uint8x16_t d[8];
2228
2229 dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
2230 transpose_arrays_u8_8x16(dstvec, d);
2231 for (int i = 0; i < 4; i++) {
2232 vst1q_u8(dst + i * stride, d[i]);
2233 }
2234 }
2235
2236 static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
2237 const uint8_t *left, int upsample_left,
2238 int dy) {
2239 (void)upsample_left;
2240 uint8x16x2_t dstvec[16];
2241 uint8x16_t d[32];
2242 uint8x16_t v_zero = vdupq_n_u8(0);
2243
2244 dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
2245 for (int i = 8; i < 16; i++) {
2246 dstvec[i].val[0] = v_zero;
2247 dstvec[i].val[1] = v_zero;
2248 }
2249 transpose_arrays_u8_32x16(dstvec, d);
2250 for (int i = 0; i < 32; i++) {
2251 vst1_u8(dst + i * stride, vget_low_u8(d[i]));
2252 }
2253 }
2254
2255 static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
2256 const uint8_t *left, int upsample_left,
2257 int dy) {
2258 uint8x8_t dstvec[32];
2259 uint8x16_t d[16];
2260
2261 dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
2262 transpose_arrays_u8_8x16(dstvec, d);
2263 transpose_arrays_u8_8x16(dstvec + 16, d + 8);
2264 for (int i = 0; i < 8; i++) {
2265 vst1q_u8(dst + i * stride, d[i]);
2266 vst1q_u8(dst + i * stride + 16, d[i + 8]);
2267 }
2268 }
2269 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2270
2271 static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
2272 const uint8_t *left, int upsample_left,
2273 int dy) {
2274 uint8x16_t dstvec[16];
2275 uint8x16_t d[16];
2276
2277 dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
2278 transpose_arrays_u8_16x16(dstvec, d);
2279 for (int i = 0; i < 16; i++) {
2280 vst1q_u8(dst + i * stride, d[i]);
2281 }
2282 }
2283
2284 static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
2285 const uint8_t *left, int upsample_left,
2286 int dy) {
2287 (void)upsample_left;
2288 uint8x16x2_t dstvec[32];
2289 uint8x16_t d[64];
2290
2291 dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
2292 transpose_arrays_u8_32x16(dstvec, d);
2293 transpose_arrays_u8_32x16(dstvec + 16, d + 32);
2294 for (int i = 0; i < 32; i++) {
2295 vst1q_u8(dst + i * stride, d[i]);
2296 vst1q_u8(dst + i * stride + 16, d[i + 32]);
2297 }
2298 }
2299
2300 static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
2301 const uint8_t *left, int upsample_left,
2302 int dy) {
2303 (void)upsample_left;
2304 DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
2305
2306 dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
2307 z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
2308 }
2309
2310 static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
2311 const uint8_t *left, int upsample_left,
2312 int dy) {
2313 (void)upsample_left;
2314 uint8x16x2_t dstvec[16];
2315 uint8x16_t d[32];
2316
2317 dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
2318 transpose_arrays_u8_32x16(dstvec, d);
2319 for (int i = 0; i < 16; i++) {
2320 vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
2321 vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
2322 }
2323 }
2324
2325 static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
2326 const uint8_t *left, int upsample_left,
2327 int dy) {
2328 uint8x16_t dstvec[32];
2329
2330 dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
2331 for (int i = 0; i < 32; i += 16) {
2332 uint8x16_t d[16];
2333 transpose_arrays_u8_16x16(dstvec + i, d);
2334 for (int j = 0; j < 16; j++) {
2335 vst1q_u8(dst + j * stride + i, d[j]);
2336 }
2337 }
2338 }
2339
2340 static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
2341 const uint8_t *left, int upsample_left,
2342 int dy) {
2343 (void)upsample_left;
2344 uint8_t dstT[64 * 32];
2345
2346 dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
2347 z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
2348 }
2349
2350 static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
2351 const uint8_t *left, int upsample_left,
2352 int dy) {
2353 (void)upsample_left;
2354 uint8_t dstT[32 * 64];
2355
2356 dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
2357 z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
2358 }
2359
2360 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2361 static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
2362 const uint8_t *left, int upsample_left,
2363 int dy) {
2364 (void)upsample_left;
2365 uint8_t dstT[64 * 16];
2366
2367 dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
2368 z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
2369 }
2370
2371 static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
2372 const uint8_t *left, int upsample_left,
2373 int dy) {
2374 uint8x16_t dstvec[64];
2375
2376 dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
2377 for (int i = 0; i < 64; i += 16) {
2378 uint8x16_t d[16];
2379 transpose_arrays_u8_16x16(dstvec + i, d);
2380 for (int j = 0; j < 16; ++j) {
2381 vst1q_u8(dst + j * stride + i, d[j]);
2382 }
2383 }
2384 }
2385 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2386
2387 typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
2388 const uint8_t *left, int upsample_left,
2389 int dy);
2390
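// Dispatch tables indexed by [get_msb(bw)][get_msb(bh)]. NULL entries
// correspond to block shapes that never reach this function (the
// realtime-only table additionally omits the 1:4 / 4:1 shapes compiled out
// above).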
2391 #if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
2392 static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
2393 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2394 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2395 { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, NULL,
2396 NULL, NULL },
2397 { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
2398 dr_prediction_z3_8x16_neon, NULL, NULL },
2399 { NULL, NULL, NULL, dr_prediction_z3_16x8_neon, dr_prediction_z3_16x16_neon,
2400 dr_prediction_z3_16x32_neon, NULL },
2401 { NULL, NULL, NULL, NULL, dr_prediction_z3_32x16_neon,
2402 dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
2403 { NULL, NULL, NULL, NULL, NULL, dr_prediction_z3_64x32_neon,
2404 dr_prediction_z3_64x64_neon },
2405 };
2406 #else
2407 static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
2408 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2409 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2410 { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
2411 dr_prediction_z3_4x16_neon, NULL, NULL },
2412 { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
2413 dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
2414 { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
2415 dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
2416 dr_prediction_z3_16x64_neon },
2417 { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
2418 dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
2419 { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
2420 dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
2421 };
2422 #endif // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
2423
2424 void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
2425 const uint8_t *above, const uint8_t *left,
2426 int upsample_left, int dx, int dy) {
2427 (void)above;
2428 (void)dx;
2429 assert(dx == 1);
2430 assert(dy > 0);
2431
2432 dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
2433 assert(f != NULL);
2434 f(dst, stride, left, upsample_left, dy);
2435 }
2436
2437 // -----------------------------------------------------------------------------
2438 // SMOOTH_PRED
2439
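// SMOOTH prediction blends each pixel from four references:
//   pred(x, y) = (w_y * top[x] + (256 - w_y) * bottom_left +
//                 w_x * left[y] + (256 - w_x) * top_right + 256) >> 9
// Each weighted pair fits in 16 bits (at most 255 * 256 = 0xFF00), so it is
// formed with widening u8 multiply-accumulates; a halving add of the two
// pairs followed by a rounding narrow of SMOOTH_WEIGHT_LOG2_SCALE (8) bits
// implements the final (+256) >> 9 rounding.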
2440 // 256 - v = vneg_s8(v)
2441 static inline uint8x8_t negate_s8(const uint8x8_t v) {
2442 return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
2443 }
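// The two's-complement negate of an unsigned 8-bit weight w yields
// (256 - w) mod 256. The smooth weights are all in [1, 255], so this is
// exactly the complementary weight 256 - w.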
2444
2445 static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
2446 const uint8_t *const top_row,
2447 const uint8_t *const left_column,
2448 const int height) {
2449 const uint8_t top_right = top_row[3];
2450 const uint8_t bottom_left = left_column[height - 1];
2451 const uint8_t *const weights_y = smooth_weights + height - 4;
2452
2453 uint8x8_t top_v = load_u8_4x1(top_row);
2454 const uint8x8_t top_right_v = vdup_n_u8(top_right);
2455 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2456 uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
2457 const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2458 const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2459
2460 assert(height > 0);
2461 int y = 0;
2462 do {
2463 const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2464 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2465 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2466 const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
2467 const uint16x8_t weighted_top_bl =
2468 vmlal_u8(weighted_bl, weights_y_v, top_v);
2469 const uint16x8_t weighted_left_tr =
2470 vmlal_u8(weighted_tr, weights_x_v, left_v);
2471 // Maximum value of each parameter: 0xFF00
2472 const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2473 const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2474
2475 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
2476 dst += stride;
2477 } while (++y != height);
2478 }
2479
2480 static inline uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
2481 const uint16x8_t weighted_left_tr) {
2482 // Maximum value of each parameter: 0xFF00
2483 const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
2484 return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
2485 }
2486
2487 static inline uint8x8_t calculate_weights_and_pred(
2488 const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
2489 const uint8x8_t bottom_left, const uint8x8_t weights_x,
2490 const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
2491 const uint16x8_t weighted_top = vmull_u8(weights_y, top);
2492 const uint16x8_t weighted_top_bl =
2493 vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
2494 const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
2495 return calculate_pred(weighted_top_bl, weighted_left_tr);
2496 }
2497
2498 static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
2499 const uint8_t *const top_row,
2500 const uint8_t *const left_column,
2501 const int height) {
2502 const uint8_t top_right = top_row[7];
2503 const uint8_t bottom_left = left_column[height - 1];
2504 const uint8_t *const weights_y = smooth_weights + height - 4;
2505
2506 const uint8x8_t top_v = vld1_u8(top_row);
2507 const uint8x8_t top_right_v = vdup_n_u8(top_right);
2508 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
2509 const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
2510 const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
2511 const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
2512
2513 assert(height > 0);
2514 int y = 0;
2515 do {
2516 const uint8x8_t left_v = vdup_n_u8(left_column[y]);
2517 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
2518 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
2519 const uint8x8_t result =
2520 calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
2521 weights_x_v, scaled_weights_y, weights_y_v);
2522
2523 vst1_u8(dst, result);
2524 dst += stride;
2525 } while (++y != height);
2526 }
2527
2528 #define SMOOTH_NXM(W, H) \
2529 void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2530 const uint8_t *above, \
2531 const uint8_t *left) { \
2532 smooth_##W##xh_neon(dst, y_stride, above, left, H); \
2533 }
2534
2535 SMOOTH_NXM(4, 4)
2536 SMOOTH_NXM(4, 8)
2537 SMOOTH_NXM(8, 4)
2538 SMOOTH_NXM(8, 8)
2539 SMOOTH_NXM(8, 16)
2540 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2541 SMOOTH_NXM(4, 16)
2542 SMOOTH_NXM(8, 32)
2543 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2544
2545 #undef SMOOTH_NXM
2546
2547 static inline uint8x16_t calculate_weights_and_predq(
2548 const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
2549 const uint8x8_t weights_y, const uint8x16_t weights_x,
2550 const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
2551 const uint16x8_t weighted_top_bl_low =
2552 vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2553 const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2554 const uint16x8_t weighted_left_tr_low =
2555 vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2556 const uint8x8_t result_low =
2557 calculate_pred(weighted_top_bl_low, weighted_left_tr_low);
2558
2559 const uint16x8_t weighted_top_bl_high =
2560 vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2561 const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2562 const uint16x8_t weighted_left_tr_high =
2563 vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2564 const uint8x8_t result_high =
2565 calculate_pred(weighted_top_bl_high, weighted_left_tr_high);
2566
2567 return vcombine_u8(result_low, result_high);
2568 }
2569
2570 // 256 - v = vneg_s8(v) (negation modulo 256 gives the complementary weight).
2571 static inline uint8x16_t negate_s8q(const uint8x16_t v) {
2572 return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
2573 }
2574
2575 // For width 16 and above.
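// The top row and the x weights are loaded in 16-byte chunks (one vector for
// W == 16, two for W == 32, four for W == 64); the complementary (256 - w_x)
// weights are precomputed outside the row loop.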
2576 #define SMOOTH_PREDICTOR(W) \
2577 static void smooth_##W##xh_neon( \
2578 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2579 const uint8_t *const left_column, const int height) { \
2580 const uint8_t top_right = top_row[(W)-1]; \
2581 const uint8_t bottom_left = left_column[height - 1]; \
2582 const uint8_t *const weights_y = smooth_weights + height - 4; \
2583 \
2584 uint8x16_t top_v[4]; \
2585 top_v[0] = vld1q_u8(top_row); \
2586 if ((W) > 16) { \
2587 top_v[1] = vld1q_u8(top_row + 16); \
2588 if ((W) == 64) { \
2589 top_v[2] = vld1q_u8(top_row + 32); \
2590 top_v[3] = vld1q_u8(top_row + 48); \
2591 } \
2592 } \
2593 \
2594 const uint8x8_t top_right_v = vdup_n_u8(top_right); \
2595 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
2596 \
2597 uint8x16_t weights_x_v[4]; \
2598 weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4); \
2599 if ((W) > 16) { \
2600 weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
2601 if ((W) == 64) { \
2602 weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
2603 weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
2604 } \
2605 } \
2606 \
2607 uint8x16_t scaled_weights_x[4]; \
2608 scaled_weights_x[0] = negate_s8q(weights_x_v[0]); \
2609 if ((W) > 16) { \
2610 scaled_weights_x[1] = negate_s8q(weights_x_v[1]); \
2611 if ((W) == 64) { \
2612 scaled_weights_x[2] = negate_s8q(weights_x_v[2]); \
2613 scaled_weights_x[3] = negate_s8q(weights_x_v[3]); \
2614 } \
2615 } \
2616 \
2617 for (int y = 0; y < height; ++y) { \
2618 const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
2619 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
2620 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
2621 const uint16x8_t weighted_bl = \
2622 vmull_u8(scaled_weights_y, bottom_left_v); \
2623 \
2624 vst1q_u8(dst, calculate_weights_and_predq( \
2625 top_v[0], left_v, top_right_v, weights_y_v, \
2626 weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
2627 \
2628 if ((W) > 16) { \
2629 vst1q_u8(dst + 16, \
2630 calculate_weights_and_predq( \
2631 top_v[1], left_v, top_right_v, weights_y_v, \
2632 weights_x_v[1], scaled_weights_x[1], weighted_bl)); \
2633 if ((W) == 64) { \
2634 vst1q_u8(dst + 32, \
2635 calculate_weights_and_predq( \
2636 top_v[2], left_v, top_right_v, weights_y_v, \
2637 weights_x_v[2], scaled_weights_x[2], weighted_bl)); \
2638 vst1q_u8(dst + 48, \
2639 calculate_weights_and_predq( \
2640 top_v[3], left_v, top_right_v, weights_y_v, \
2641 weights_x_v[3], scaled_weights_x[3], weighted_bl)); \
2642 } \
2643 } \
2644 \
2645 dst += stride; \
2646 } \
2647 }
2648
2649 SMOOTH_PREDICTOR(16)
2650 SMOOTH_PREDICTOR(32)
2651 SMOOTH_PREDICTOR(64)
2652
2653 #undef SMOOTH_PREDICTOR
2654
2655 #define SMOOTH_NXM_WIDE(W, H) \
2656 void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
2657 const uint8_t *above, \
2658 const uint8_t *left) { \
2659 smooth_##W##xh_neon(dst, y_stride, above, left, H); \
2660 }
2661
2662 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2663 SMOOTH_NXM_WIDE(16, 4)
2664 SMOOTH_NXM_WIDE(16, 64)
2665 SMOOTH_NXM_WIDE(32, 8)
2666 SMOOTH_NXM_WIDE(64, 16)
2667 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2668 SMOOTH_NXM_WIDE(16, 8)
2669 SMOOTH_NXM_WIDE(16, 16)
2670 SMOOTH_NXM_WIDE(16, 32)
2671 SMOOTH_NXM_WIDE(32, 16)
2672 SMOOTH_NXM_WIDE(32, 32)
2673 SMOOTH_NXM_WIDE(32, 64)
2674 SMOOTH_NXM_WIDE(64, 32)
2675 SMOOTH_NXM_WIDE(64, 64)
2676
2677 #undef SMOOTH_NXM_WIDE
2678
2679 // -----------------------------------------------------------------------------
2680 // SMOOTH_V_PRED
2681
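// SMOOTH_V interpolates each column between the top row and the bottom-left
// sample only:
//   pred(x, y) = (w_y * top[x] + (256 - w_y) * bottom_left + 128) >> 8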
2682 // For widths 4 and 8.
2683 #define SMOOTH_V_PREDICTOR(W) \
2684 static void smooth_v_##W##xh_neon( \
2685 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2686 const uint8_t *const left_column, const int height) { \
2687 const uint8_t bottom_left = left_column[height - 1]; \
2688 const uint8_t *const weights_y = smooth_weights + height - 4; \
2689 \
2690 uint8x8_t top_v; \
2691 if ((W) == 4) { \
2692 top_v = load_u8_4x1(top_row); \
2693 } else { /* width == 8 */ \
2694 top_v = vld1_u8(top_row); \
2695 } \
2696 \
2697 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
2698 \
2699 assert(height > 0); \
2700 int y = 0; \
2701 do { \
2702 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
2703 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
2704 \
2705 const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \
2706 const uint16x8_t weighted_top_bl = \
2707 vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \
2708 const uint8x8_t pred = \
2709 vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \
2710 \
2711 if ((W) == 4) { \
2712 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
2713 } else { /* width == 8 */ \
2714 vst1_u8(dst, pred); \
2715 } \
2716 dst += stride; \
2717 } while (++y != height); \
2718 }
2719
2720 SMOOTH_V_PREDICTOR(4)
2721 SMOOTH_V_PREDICTOR(8)
2722
2723 #undef SMOOTH_V_PREDICTOR
2724
2725 #define SMOOTH_V_NXM(W, H) \
2726 void aom_smooth_v_predictor_##W##x##H##_neon( \
2727 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2728 const uint8_t *left) { \
2729 smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
2730 }
2731
2732 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2733 SMOOTH_V_NXM(4, 16)
2734 SMOOTH_V_NXM(8, 32)
2735 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2736 SMOOTH_V_NXM(4, 4)
2737 SMOOTH_V_NXM(4, 8)
2738 SMOOTH_V_NXM(8, 4)
2739 SMOOTH_V_NXM(8, 8)
2740 SMOOTH_V_NXM(8, 16)
2741
2742 #undef SMOOTH_V_NXM
2743
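// Blends the low and high halves of a 16-lane top vector with the
// precomputed (256 - w_y) * bottom_left term, then narrows the results back
// to bytes.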
2744 static inline uint8x16_t calculate_vertical_weights_and_pred(
2745 const uint8x16_t top, const uint8x8_t weights_y,
2746 const uint16x8_t weighted_bl) {
2747 const uint16x8_t pred_low =
2748 vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
2749 const uint16x8_t pred_high =
2750 vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
2751 const uint8x8_t pred_scaled_low =
2752 vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
2753 const uint8x8_t pred_scaled_high =
2754 vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
2755 return vcombine_u8(pred_scaled_low, pred_scaled_high);
2756 }
2757
2758 // For width 16 and above.
2759 #define SMOOTH_V_PREDICTOR(W) \
2760 static void smooth_v_##W##xh_neon( \
2761 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2762 const uint8_t *const left_column, const int height) { \
2763 const uint8_t bottom_left = left_column[height - 1]; \
2764 const uint8_t *const weights_y = smooth_weights + height - 4; \
2765 \
2766 uint8x16_t top_v[4]; \
2767 top_v[0] = vld1q_u8(top_row); \
2768 if ((W) > 16) { \
2769 top_v[1] = vld1q_u8(top_row + 16); \
2770 if ((W) == 64) { \
2771 top_v[2] = vld1q_u8(top_row + 32); \
2772 top_v[3] = vld1q_u8(top_row + 48); \
2773 } \
2774 } \
2775 \
2776 const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
2777 \
2778 assert(height > 0); \
2779 int y = 0; \
2780 do { \
2781 const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
2782 const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
2783 const uint16x8_t weighted_bl = \
2784 vmull_u8(scaled_weights_y, bottom_left_v); \
2785 \
2786 const uint8x16_t pred_0 = calculate_vertical_weights_and_pred( \
2787 top_v[0], weights_y_v, weighted_bl); \
2788 vst1q_u8(dst, pred_0); \
2789 \
2790 if ((W) > 16) { \
2791 const uint8x16_t pred_1 = calculate_vertical_weights_and_pred( \
2792 top_v[1], weights_y_v, weighted_bl); \
2793 vst1q_u8(dst + 16, pred_1); \
2794 \
2795 if ((W) == 64) { \
2796 const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
2797 top_v[2], weights_y_v, weighted_bl); \
2798 vst1q_u8(dst + 32, pred_2); \
2799 \
2800 const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
2801 top_v[3], weights_y_v, weighted_bl); \
2802 vst1q_u8(dst + 48, pred_3); \
2803 } \
2804 } \
2805 \
2806 dst += stride; \
2807 } while (++y != height); \
2808 }
2809
2810 SMOOTH_V_PREDICTOR(16)
2811 SMOOTH_V_PREDICTOR(32)
2812 SMOOTH_V_PREDICTOR(64)
2813
2814 #undef SMOOTH_V_PREDICTOR
2815
2816 #define SMOOTH_V_NXM_WIDE(W, H) \
2817 void aom_smooth_v_predictor_##W##x##H##_neon( \
2818 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2819 const uint8_t *left) { \
2820 smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
2821 }
2822
2823 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2824 SMOOTH_V_NXM_WIDE(16, 4)
2825 SMOOTH_V_NXM_WIDE(32, 8)
2826 SMOOTH_V_NXM_WIDE(64, 16)
2827 SMOOTH_V_NXM_WIDE(16, 64)
2828 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2829 SMOOTH_V_NXM_WIDE(16, 8)
2830 SMOOTH_V_NXM_WIDE(16, 16)
2831 SMOOTH_V_NXM_WIDE(16, 32)
2832 SMOOTH_V_NXM_WIDE(32, 16)
2833 SMOOTH_V_NXM_WIDE(32, 32)
2834 SMOOTH_V_NXM_WIDE(32, 64)
2835 SMOOTH_V_NXM_WIDE(64, 32)
2836 SMOOTH_V_NXM_WIDE(64, 64)
2837
2838 #undef SMOOTH_V_NXM_WIDE
2839
2840 // -----------------------------------------------------------------------------
2841 // SMOOTH_H_PRED
2842
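// SMOOTH_H interpolates each row between the left column and the top-right
// sample only:
//   pred(x, y) = (w_x * left[y] + (256 - w_x) * top_right + 128) >> 8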
2843 // For widths 4 and 8.
2844 #define SMOOTH_H_PREDICTOR(W) \
2845 static void smooth_h_##W##xh_neon( \
2846 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2847 const uint8_t *const left_column, const int height) { \
2848 const uint8_t top_right = top_row[(W)-1]; \
2849 \
2850 const uint8x8_t top_right_v = vdup_n_u8(top_right); \
2851 /* Over-reads for 4xN but still within the array. */ \
2852 const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4); \
2853 const uint8x8_t scaled_weights_x = negate_s8(weights_x); \
2854 const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
2855 \
2856 assert(height > 0); \
2857 int y = 0; \
2858 do { \
2859 const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
2860 const uint16x8_t weighted_left_tr = \
2861 vmlal_u8(weighted_tr, weights_x, left_v); \
2862 const uint8x8_t pred = \
2863 vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE); \
2864 \
2865 if ((W) == 4) { \
2866 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
2867 } else { /* width == 8 */ \
2868 vst1_u8(dst, pred); \
2869 } \
2870 dst += stride; \
2871 } while (++y != height); \
2872 }
2873
2874 SMOOTH_H_PREDICTOR(4)
2875 SMOOTH_H_PREDICTOR(8)
2876
2877 #undef SMOOTH_H_PREDICTOR
2878
2879 #define SMOOTH_H_NXM(W, H) \
2880 void aom_smooth_h_predictor_##W##x##H##_neon( \
2881 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2882 const uint8_t *left) { \
2883 smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
2884 }
2885
2886 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2887 SMOOTH_H_NXM(4, 16)
2888 SMOOTH_H_NXM(8, 32)
2889 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2890 SMOOTH_H_NXM(4, 4)
2891 SMOOTH_H_NXM(4, 8)
2892 SMOOTH_H_NXM(8, 4)
2893 SMOOTH_H_NXM(8, 8)
2894 SMOOTH_H_NXM(8, 16)
2895
2896 #undef SMOOTH_H_NXM
2897
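// 16-wide horizontal blend: the w_x * left and (256 - w_x) * top_right terms
// are accumulated per 8-lane half and narrowed back to bytes.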
2898 static inline uint8x16_t calculate_horizontal_weights_and_pred(
2899 const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
2900 const uint8x16_t scaled_weights_x) {
2901 const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
2902 const uint16x8_t weighted_left_tr_low =
2903 vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
2904 const uint8x8_t pred_scaled_low =
2905 vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
2906
2907 const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
2908 const uint16x8_t weighted_left_tr_high =
2909 vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
2910 const uint8x8_t pred_scaled_high =
2911 vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
2912
2913 return vcombine_u8(pred_scaled_low, pred_scaled_high);
2914 }
2915
2916 // For width 16 and above.
2917 #define SMOOTH_H_PREDICTOR(W) \
2918 static void smooth_h_##W##xh_neon( \
2919 uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
2920 const uint8_t *const left_column, const int height) { \
2921 const uint8_t top_right = top_row[(W)-1]; \
2922 \
2923 const uint8x8_t top_right_v = vdup_n_u8(top_right); \
2924 \
2925 uint8x16_t weights_x[4]; \
2926 weights_x[0] = vld1q_u8(smooth_weights + (W)-4); \
2927 if ((W) > 16) { \
2928 weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
2929 if ((W) == 64) { \
2930 weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
2931 weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
2932 } \
2933 } \
2934 \
2935 uint8x16_t scaled_weights_x[4]; \
2936 scaled_weights_x[0] = negate_s8q(weights_x[0]); \
2937 if ((W) > 16) { \
2938 scaled_weights_x[1] = negate_s8q(weights_x[1]); \
2939 if ((W) == 64) { \
2940 scaled_weights_x[2] = negate_s8q(weights_x[2]); \
2941 scaled_weights_x[3] = negate_s8q(weights_x[3]); \
2942 } \
2943 } \
2944 \
2945 assert(height > 0); \
2946 int y = 0; \
2947 do { \
2948 const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
2949 \
2950 const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred( \
2951 left_v, top_right_v, weights_x[0], scaled_weights_x[0]); \
2952 vst1q_u8(dst, pred_0); \
2953 \
2954 if ((W) > 16) { \
2955 const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred( \
2956 left_v, top_right_v, weights_x[1], scaled_weights_x[1]); \
2957 vst1q_u8(dst + 16, pred_1); \
2958 \
2959 if ((W) == 64) { \
2960 const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
2961 left_v, top_right_v, weights_x[2], scaled_weights_x[2]); \
2962 vst1q_u8(dst + 32, pred_2); \
2963 \
2964 const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
2965 left_v, top_right_v, weights_x[3], scaled_weights_x[3]); \
2966 vst1q_u8(dst + 48, pred_3); \
2967 } \
2968 } \
2969 dst += stride; \
2970 } while (++y != height); \
2971 }
2972
2973 SMOOTH_H_PREDICTOR(16)
2974 SMOOTH_H_PREDICTOR(32)
2975 SMOOTH_H_PREDICTOR(64)
2976
2977 #undef SMOOTH_H_PREDICTOR
2978
2979 #define SMOOTH_H_NXM_WIDE(W, H) \
2980 void aom_smooth_h_predictor_##W##x##H##_neon( \
2981 uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
2982 const uint8_t *left) { \
2983 smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
2984 }
2985
2986 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2987 SMOOTH_H_NXM_WIDE(16, 4)
2988 SMOOTH_H_NXM_WIDE(16, 64)
2989 SMOOTH_H_NXM_WIDE(32, 8)
2990 SMOOTH_H_NXM_WIDE(64, 16)
2991 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2992 SMOOTH_H_NXM_WIDE(16, 8)
2993 SMOOTH_H_NXM_WIDE(16, 16)
2994 SMOOTH_H_NXM_WIDE(16, 32)
2995 SMOOTH_H_NXM_WIDE(32, 16)
2996 SMOOTH_H_NXM_WIDE(32, 32)
2997 SMOOTH_H_NXM_WIDE(32, 64)
2998 SMOOTH_H_NXM_WIDE(64, 32)
2999 SMOOTH_H_NXM_WIDE(64, 64)
3000
3001 #undef SMOOTH_H_NXM_WIDE
3002
3003 // -----------------------------------------------------------------------------
3004 // PAETH
3005
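// The Paeth predictor selects, per pixel, whichever of left, top or top_left
// is closest to base = top + left - top_left. A scalar sketch of the
// selection (variable names here are illustrative only):
//
//   left_dist = |top - top_left|;               // == |base - left|
//   top_dist = |left - top_left|;               // == |base - top|
//   top_left_dist = |top + left - 2 * top_left|;
//   if (left_dist <= top_dist && left_dist <= top_left_dist) pred = left;
//   else if (top_dist <= top_left_dist) pred = top;
//   else pred = top_left;
//
// The NEON kernels below compute these distances with unsigned absolute
// differences and apply the same priority order with byte-select masks.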
3006 static inline void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
3007 const uint8_t *const top_row,
3008 const uint8_t *const left_column,
3009 int width, int height) {
3010 const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
3011 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
3012 uint8x8_t top;
3013 if (width == 4) {
3014 top = load_u8_4x1(top_row);
3015 } else { // width == 8
3016 top = vld1_u8(top_row);
3017 }
3018
3019 assert(height > 0);
3020 int y = 0;
3021 do {
3022 const uint8x8_t left = vdup_n_u8(left_column[y]);
3023
3024 const uint8x8_t left_dist = vabd_u8(top, top_left);
3025 const uint8x8_t top_dist = vabd_u8(left, top_left);
3026 const uint16x8_t top_left_dist =
3027 vabdq_u16(vaddl_u8(top, left), top_left_x2);
3028
3029 const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
3030 const uint8x8_t left_le_top_left =
3031 vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
3032 const uint8x8_t top_le_top_left =
3033 vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
3034
3035 // if (left_dist <= top_dist && left_dist <= top_left_dist)
3036 const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
3037 // dest[x] = left_column[y];
3038 // Fill all the unused spaces with 'top'. They will be overwritten when
3039 // the positions for top_left are known.
3040 uint8x8_t result = vbsl_u8(left_mask, left, top);
3041 // else if (top_dist <= top_left_dist)
3042 // dest[x] = top_row[x];
3043 // Add these values to the mask. They were already set.
3044 const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
3045 // else
3046 // dest[x] = top_left;
3047 result = vbsl_u8(left_or_top_mask, result, top_left);
3048
3049 if (width == 4) {
3050 store_u8_4x1(dest, result);
3051 } else { // width == 8
3052 vst1_u8(dest, result);
3053 }
3054 dest += stride;
3055 } while (++y != height);
3056 }
3057
3058 #define PAETH_NXM(W, H) \
3059 void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
3060 const uint8_t *above, \
3061 const uint8_t *left) { \
3062 paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
3063 }
3064
3065 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3066 PAETH_NXM(4, 16)
3067 PAETH_NXM(8, 32)
3068 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3069 PAETH_NXM(4, 4)
3070 PAETH_NXM(4, 8)
3071 PAETH_NXM(8, 4)
3072 PAETH_NXM(8, 8)
3073 PAETH_NXM(8, 16)
3074
3075 // Calculate X distance <= TopLeft distance and pack the resulting mask into
3076 // uint8x16_t.
3077 static inline uint8x16_t x_le_top_left(const uint8x16_t x_dist,
3078 const uint16x8_t top_left_dist_low,
3079 const uint16x8_t top_left_dist_high) {
3080 const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
3081 vqmovn_u16(top_left_dist_high));
3082 return vcleq_u8(x_dist, top_left_dist);
3083 }
3084
3085 // Select, per lane, the closest of top, left and top_left using the
// precomputed comparison masks.
3086 static inline uint8x16_t select_paeth(const uint8x16_t top,
3087 const uint8x16_t left,
3088 const uint8x16_t top_left,
3089 const uint8x16_t left_le_top,
3090 const uint8x16_t left_le_top_left,
3091 const uint8x16_t top_le_top_left) {
3092 // if (left_dist <= top_dist && left_dist <= top_left_dist)
3093 const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
3094 // dest[x] = left_column[y];
3095 // Fill all the unused spaces with 'top'. They will be overwritten when
3096 // the positions for top_left are known.
3097 uint8x16_t result = vbslq_u8(left_mask, left, top);
3098 // else if (top_dist <= top_left_dist)
3099 // dest[x] = top_row[x];
3100 // Add these values to the mask. They were already set.
3101 const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
3102 // else
3103 // dest[x] = top_left;
3104 return vbslq_u8(left_or_top_mask, result, top_left);
3105 }
3106
3107 // Generate numbered and high/low versions of top_left_dist.
3108 #define TOP_LEFT_DIST(num) \
3109 const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
3110 vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
3111 const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
3112 vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
3113
3114 // Generate numbered versions of x_le_top_left() with x = left.
3115 #define LEFT_LE_TOP_LEFT(num) \
3116 const uint8x16_t left_le_top_left_##num = \
3117 x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
3118 top_left_##num##_dist_high)
3119
3120 // Generate numbered versions of x_le_top_left() with x = top.
3121 #define TOP_LE_TOP_LEFT(num) \
3122 const uint8x16_t top_le_top_left_##num = x_le_top_left( \
3123 top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
3124
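// Wide Paeth kernel: the top row is processed in 16-byte chunks. top_dist
// (|left - top_left|) depends only on the current row, so it is computed once
// per row and reused for every chunk.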
3125 static inline void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
3126 const uint8_t *const top_row,
3127 const uint8_t *const left_column,
3128 int width, int height) {
3129 const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
3130 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
3131 uint8x16_t top[4];
3132 top[0] = vld1q_u8(top_row);
3133 if (width > 16) {
3134 top[1] = vld1q_u8(top_row + 16);
3135 if (width == 64) {
3136 top[2] = vld1q_u8(top_row + 32);
3137 top[3] = vld1q_u8(top_row + 48);
3138 }
3139 }
3140
3141 assert(height > 0);
3142 int y = 0;
3143 do {
3144 const uint8x16_t left = vdupq_n_u8(left_column[y]);
3145
3146 const uint8x16_t top_dist = vabdq_u8(left, top_left);
3147
3148 const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
3149 TOP_LEFT_DIST(0);
3150 const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
3151 LEFT_LE_TOP_LEFT(0);
3152 TOP_LE_TOP_LEFT(0);
3153
3154 const uint8x16_t result_0 =
3155 select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
3156 top_le_top_left_0);
3157 vst1q_u8(dest, result_0);
3158
3159 if (width > 16) {
3160 const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
3161 TOP_LEFT_DIST(1);
3162 const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
3163 LEFT_LE_TOP_LEFT(1);
3164 TOP_LE_TOP_LEFT(1);
3165
3166 const uint8x16_t result_1 =
3167 select_paeth(top[1], left, top_left, left_1_le_top,
3168 left_le_top_left_1, top_le_top_left_1);
3169 vst1q_u8(dest + 16, result_1);
3170
3171 if (width == 64) {
3172 const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
3173 TOP_LEFT_DIST(2);
3174 const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
3175 LEFT_LE_TOP_LEFT(2);
3176 TOP_LE_TOP_LEFT(2);
3177
3178 const uint8x16_t result_2 =
3179 select_paeth(top[2], left, top_left, left_2_le_top,
3180 left_le_top_left_2, top_le_top_left_2);
3181 vst1q_u8(dest + 32, result_2);
3182
3183 const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
3184 TOP_LEFT_DIST(3);
3185 const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
3186 LEFT_LE_TOP_LEFT(3);
3187 TOP_LE_TOP_LEFT(3);
3188
3189 const uint8x16_t result_3 =
3190 select_paeth(top[3], left, top_left, left_3_le_top,
3191 left_le_top_left_3, top_le_top_left_3);
3192 vst1q_u8(dest + 48, result_3);
3193 }
3194 }
3195
3196 dest += stride;
3197 } while (++y != height);
3198 }
3199
3200 #define PAETH_NXM_WIDE(W, H) \
3201 void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
3202 const uint8_t *above, \
3203 const uint8_t *left) { \
3204 paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
3205 }
3206
3207 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3208 PAETH_NXM_WIDE(16, 4)
3209 PAETH_NXM_WIDE(16, 64)
3210 PAETH_NXM_WIDE(32, 8)
3211 PAETH_NXM_WIDE(64, 16)
3212 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3213 PAETH_NXM_WIDE(16, 8)
3214 PAETH_NXM_WIDE(16, 16)
3215 PAETH_NXM_WIDE(16, 32)
3216 PAETH_NXM_WIDE(32, 16)
3217 PAETH_NXM_WIDE(32, 32)
3218 PAETH_NXM_WIDE(32, 64)
3219 PAETH_NXM_WIDE(64, 32)
3220 PAETH_NXM_WIDE(64, 64)
3221