/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
13*77c1e3ccSAndroid Build Coastguard Worker
14*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
16*77c1e3ccSAndroid Build Coastguard Worker
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/dist_wtd_avg_neon.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/sum_neon.h"
21*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between a 128-pixel-wide, 'h'-row block at
// 'src_ptr' and the block of the same size at 'ref_ptr'. 'src_stride' and
// 'ref_stride' are the distances, in bytes, between the starts of consecutive
// rows. 'h' must be at least 1 because the row loop tests its counter only
// after the body has run.
static inline unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         int h) {
  // One 16-bit accumulator per 16-byte vector of the row. Keeping 8 separate
  // accumulators prevents overflow for large values of 'h' (each lane grows by
  // at most 2 * 255 per row) and gives independent UADALP dependency chains
  // for CPUs that have either 2 or 4 Neon pipes.
  enum { kVecsPerRow = 128 / 16 };
  uint16x8_t acc[kVecsPerRow];
  for (int k = 0; k < kVecsPerRow; k++) {
    acc[k] = vdupq_n_u16(0);
  }

  int rows_left = h;
  do {
    // The trip count is a compile-time constant, so the compiler is free to
    // unroll this loop completely.
    for (int k = 0; k < kVecsPerRow; k++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * k);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * k);
      // Per-byte |s - r|, pairwise added and accumulated into 16-bit lanes.
      acc[k] = vpadalq_u8(acc[k], vabdq_u8(s, r));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--rows_left != 0);

  // Fold the accumulators together while widening to 32 bits, so the combined
  // total cannot wrap a 16-bit lane.
  uint32x4_t total = vpaddlq_u16(acc[0]);
  for (int k = 1; k < kVecsPerRow; k++) {
    total = vpadalq_u16(total, acc[k]);
  }

  return horizontal_add_u32x4(total);
}
93*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between a 64-pixel-wide, 'h'-row block at
// 'src_ptr' and the block of the same size at 'ref_ptr'. The strides are the
// byte distances between consecutive rows. 'h' must be at least 1 because the
// row loop tests its counter only after the body has run.
static inline unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  // One 16-bit accumulator per 16-byte vector of the row; each lane grows by
  // at most 2 * 255 per row.
  enum { kVecsPerRow = 64 / 16 };
  uint16x8_t acc[kVecsPerRow];
  for (int k = 0; k < kVecsPerRow; k++) {
    acc[k] = vdupq_n_u16(0);
  }

  int rows_left = h;
  do {
    // Constant trip count: the compiler is free to unroll this completely.
    for (int k = 0; k < kVecsPerRow; k++) {
      const uint8x16_t s = vld1q_u8(src_ptr + 16 * k);
      const uint8x16_t r = vld1q_u8(ref_ptr + 16 * k);
      // Per-byte |s - r|, pairwise added and accumulated into 16-bit lanes.
      acc[k] = vpadalq_u8(acc[k], vabdq_u8(s, r));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--rows_left != 0);

  // Widen to 32 bits while merging the accumulators so the total cannot wrap.
  uint32x4_t total = vpaddlq_u16(acc[0]);
  for (int k = 1; k < kVecsPerRow; k++) {
    total = vpadalq_u16(total, acc[k]);
  }

  return horizontal_add_u32x4(total);
}
136*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between a 32-pixel-wide, 'h'-row block at
// 'src_ptr' and the block of the same size at 'ref_ptr'. The strides are the
// byte distances between consecutive rows. 'h' must be at least 1.
static inline unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  // Separate accumulators for the low and high 16 bytes of each row.
  uint16x8_t acc_lo = vdupq_n_u16(0);
  uint16x8_t acc_hi = vdupq_n_u16(0);

  int rows_left = h;
  do {
    const uint8x16_t abs_diff_lo =
        vabdq_u8(vld1q_u8(src_ptr), vld1q_u8(ref_ptr));
    acc_lo = vpadalq_u8(acc_lo, abs_diff_lo);

    const uint8x16_t abs_diff_hi =
        vabdq_u8(vld1q_u8(src_ptr + 16), vld1q_u8(ref_ptr + 16));
    acc_hi = vpadalq_u8(acc_hi, abs_diff_hi);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--rows_left != 0);

  // The two accumulators are merged with a plain 16-bit add. Each lane of the
  // merged vector has gained at most 4 * 255 per row, which stays within 16
  // bits for h <= 64, the largest height used for this width in this file.
  const uint16x8_t acc = vaddq_u16(acc_lo, acc_hi);
  return horizontal_add_u16x8(acc);
}
160*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between a 16-pixel-wide, 'h'-row block at
// 'src_ptr' and the block of the same size at 'ref_ptr'. The strides are the
// byte distances between consecutive rows. 'h' must be at least 1.
static inline unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        int h) {
  uint16x8_t acc = vdupq_n_u16(0);

  int rows_left = h;
  do {
    const uint8x16_t abs_diff = vabdq_u8(vld1q_u8(src_ptr), vld1q_u8(ref_ptr));
    // Pairwise add adjacent byte differences into the 16-bit lanes.
    acc = vpadalq_u8(acc, abs_diff);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--rows_left != 0);

  return horizontal_add_u16x8(acc);
}
180*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between an 8-pixel-wide, 'h'-row block at
// 'src_ptr' and the block of the same size at 'ref_ptr'. The strides are the
// byte distances between consecutive rows. 'h' must be at least 1.
static inline unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t acc = vdupq_n_u16(0);

  int rows_left = h;
  do {
    const uint8x8_t src_row = vld1_u8(src_ptr);
    const uint8x8_t ref_row = vld1_u8(ref_ptr);

    // Widening absolute-difference-accumulate: one 16-bit lane per pixel.
    acc = vabal_u8(acc, src_row, ref_row);

    src_ptr += src_stride;
    ref_ptr += ref_stride;
  } while (--rows_left != 0);

  return horizontal_add_u16x8(acc);
}
199*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between a 4-pixel-wide, 'h'-row block at
// 'src_ptr' and the block of the same size at 'ref_ptr'. Rows are consumed in
// pairs, so the loop runs h / 2 times; 'h' must therefore be at least 2, and
// an odd trailing row would not be visited.
static inline unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       int h) {
  uint16x8_t acc = vdupq_n_u16(0);

  int row_pairs_left = h / 2;
  do {
    // load_unaligned_u8() comes from mem_neon.h; presumably it packs the
    // 4-byte rows at 'ptr' and 'ptr + stride' into one 8-byte vector, which
    // matches the two-row advance below.
    const uint8x8_t src_rows = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t ref_rows = load_unaligned_u8(ref_ptr, ref_stride);

    // Widening absolute-difference-accumulate: one 16-bit lane per pixel.
    acc = vabal_u8(acc, src_rows, ref_rows);

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
  } while (--row_pairs_left != 0);

  return horizontal_add_u16x8(acc);
}
218*77c1e3ccSAndroid Build Coastguard Worker
// Defines the public entry point aom_sad<width>x<height>_neon() for one block
// size. It forwards to the static helper for that width, passing the block
// height as a constant.
#define SAD_WXH_NEON(width, height)                                         \
  unsigned int aom_sad##width##x##height##_neon(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride) {                                                     \
    return sad##width##xh_neon(src, src_stride, ref, ref_stride, (height)); \
  }

// 4-wide blocks.
SAD_WXH_NEON(4, 4)
SAD_WXH_NEON(4, 8)

// 8-wide blocks.
SAD_WXH_NEON(8, 4)
SAD_WXH_NEON(8, 8)
SAD_WXH_NEON(8, 16)

// 16-wide blocks.
SAD_WXH_NEON(16, 8)
SAD_WXH_NEON(16, 16)
SAD_WXH_NEON(16, 32)

// 32-wide blocks.
SAD_WXH_NEON(32, 16)
SAD_WXH_NEON(32, 32)
SAD_WXH_NEON(32, 64)

// 64-wide blocks.
SAD_WXH_NEON(64, 32)
SAD_WXH_NEON(64, 64)
SAD_WXH_NEON(64, 128)

// 128-wide blocks.
SAD_WXH_NEON(128, 64)
SAD_WXH_NEON(128, 128)

// 1:4 and 4:1 block sizes, compiled only when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_NEON(4, 16)
SAD_WXH_NEON(8, 32)
SAD_WXH_NEON(16, 4)
SAD_WXH_NEON(16, 64)
SAD_WXH_NEON(32, 8)
SAD_WXH_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_NEON
257*77c1e3ccSAndroid Build Coastguard Worker
// Defines the public entry point aom_sad_skip_<width>x<height>_neon() for one
// block size. Both strides are doubled and the row count is halved, so the
// width-specific helper only visits every other row; the resulting sum is then
// multiplied by 2.
#define SAD_SKIP_WXH_NEON(width, height)                                   \
  unsigned int aom_sad_skip_##width##x##height##_neon(                     \
      const uint8_t *src, int src_stride, const uint8_t *ref,              \
      int ref_stride) {                                                    \
    return 2 * sad##width##xh_neon(src, 2 * src_stride, ref,               \
                                   2 * ref_stride, (height) / 2);          \
  }

// 4-wide blocks.
SAD_SKIP_WXH_NEON(4, 4)
SAD_SKIP_WXH_NEON(4, 8)

// 8-wide blocks.
SAD_SKIP_WXH_NEON(8, 4)
SAD_SKIP_WXH_NEON(8, 8)
SAD_SKIP_WXH_NEON(8, 16)

// 16-wide blocks.
SAD_SKIP_WXH_NEON(16, 8)
SAD_SKIP_WXH_NEON(16, 16)
SAD_SKIP_WXH_NEON(16, 32)

// 32-wide blocks.
SAD_SKIP_WXH_NEON(32, 16)
SAD_SKIP_WXH_NEON(32, 32)
SAD_SKIP_WXH_NEON(32, 64)

// 64-wide blocks.
SAD_SKIP_WXH_NEON(64, 32)
SAD_SKIP_WXH_NEON(64, 64)
SAD_SKIP_WXH_NEON(64, 128)

// 128-wide blocks.
SAD_SKIP_WXH_NEON(128, 64)
SAD_SKIP_WXH_NEON(128, 128)

// 1:4 and 4:1 block sizes, compiled only when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_NEON(4, 16)
SAD_SKIP_WXH_NEON(8, 32)
SAD_SKIP_WXH_NEON(16, 4)
SAD_SKIP_WXH_NEON(16, 64)
SAD_SKIP_WXH_NEON(32, 8)
SAD_SKIP_WXH_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_NEON
298*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between a 128-pixel-wide, 'h'-row source block
// and the rounded average of the reference block and 'second_pred'.
// 'src_stride' and 'ref_stride' are the byte distances between consecutive
// rows; 'second_pred' is read as a contiguous buffer with 128 bytes per row.
// 'h' must be at least 1 because the row loop tests its counter only after
// the body has run.
static inline unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
                                             int src_stride,
                                             const uint8_t *ref_ptr,
                                             int ref_stride, int h,
                                             const uint8_t *second_pred) {
  // One 16-bit accumulator per 16-byte vector of the row. Keeping 8 separate
  // accumulators prevents overflow for large values of 'h' (each lane grows by
  // at most 2 * 255 per row) and gives independent UADALP dependency chains
  // for CPUs that have either 2 or 4 Neon pipes.
  enum { kBlockWidth = 128, kVecsPerRow = kBlockWidth / 16 };
  uint16x8_t acc[kVecsPerRow];
  for (int k = 0; k < kVecsPerRow; k++) {
    acc[k] = vdupq_n_u16(0);
  }

  int rows_left = h;
  do {
    // The trip count is a compile-time constant, so the compiler is free to
    // unroll this loop completely.
    for (int k = 0; k < kVecsPerRow; k++) {
      const int offset = 16 * k;
      const uint8x16_t s = vld1q_u8(src_ptr + offset);
      const uint8x16_t r = vld1q_u8(ref_ptr + offset);
      const uint8x16_t p = vld1q_u8(second_pred + offset);
      // Rounding halving add: (r + p + 1) >> 1 per byte.
      const uint8x16_t avg = vrhaddq_u8(r, p);
      // Per-byte |s - avg|, pairwise added and accumulated into 16-bit lanes.
      acc[k] = vpadalq_u8(acc[k], vabdq_u8(s, avg));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += kBlockWidth;
  } while (--rows_left != 0);

  // Fold the accumulators together while widening to 32 bits, so the combined
  // total cannot wrap a 16-bit lane.
  uint32x4_t total = vpaddlq_u16(acc[0]);
  for (int k = 1; k < kVecsPerRow; k++) {
    total = vpadalq_u16(total, acc[k]);
  }

  return horizontal_add_u32x4(total);
}
391*77c1e3ccSAndroid Build Coastguard Worker
// Sum of absolute differences between a 64-pixel-wide, 'h'-row source block
// and the rounded average of the reference block and 'second_pred'.
// 'src_stride' and 'ref_stride' are the byte distances between consecutive
// rows; 'second_pred' is read as a contiguous buffer with 64 bytes per row.
// 'h' must be at least 1 because the row loop tests its counter only after
// the body has run.
static inline unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  // One 16-bit accumulator per 16-byte vector of the row; each lane grows by
  // at most 2 * 255 per row.
  enum { kBlockWidth = 64, kVecsPerRow = kBlockWidth / 16 };
  uint16x8_t acc[kVecsPerRow];
  for (int k = 0; k < kVecsPerRow; k++) {
    acc[k] = vdupq_n_u16(0);
  }

  int rows_left = h;
  do {
    // Constant trip count: the compiler is free to unroll this completely.
    for (int k = 0; k < kVecsPerRow; k++) {
      const int offset = 16 * k;
      const uint8x16_t s = vld1q_u8(src_ptr + offset);
      const uint8x16_t r = vld1q_u8(ref_ptr + offset);
      const uint8x16_t p = vld1q_u8(second_pred + offset);
      // Rounding halving add: (r + p + 1) >> 1 per byte.
      const uint8x16_t avg = vrhaddq_u8(r, p);
      // Per-byte |s - avg|, pairwise added and accumulated into 16-bit lanes.
      acc[k] = vpadalq_u8(acc[k], vabdq_u8(s, avg));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += kBlockWidth;
  } while (--rows_left != 0);

  // Widen to 32 bits while merging the accumulators so the total cannot wrap.
  uint32x4_t total = vpaddlq_u16(acc[0]);
  for (int k = 1; k < kVecsPerRow; k++) {
    total = vpadalq_u16(total, acc[k]);
  }

  return horizontal_add_u32x4(total);
}
445*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 32-pixel-wide block against the rounded average of 'ref_ptr' and
// 'second_pred', accumulated over 'h' rows.
//
// 'src_ptr' and 'ref_ptr' step by their own strides after every row, while
// 'second_pred' is read as a packed buffer holding 32 bytes per row. The row
// loop is a do/while and therefore always executes once: 'h' must be >= 1.
static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  // One 16-bit accumulator per 16-byte half of the row. Each lane gains at
  // most 2 * 255 per row, so for the heights produced by SAD_WXH_AVG_NEON for
  // this width (up to 64) the final vaddq_u16 of both accumulators cannot
  // wrap.
  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };

  int rows_left = h;
  do {
    for (int k = 0; k < 2; k++) {
      const int x = 16 * k;
      const uint8x16_t src_vec = vld1q_u8(src_ptr + x);
      const uint8x16_t ref_vec = vld1q_u8(ref_ptr + x);
      const uint8x16_t pred_vec = vld1q_u8(second_pred + x);

      // Rounding halving add of ref and second_pred, absolute difference
      // against the source, then pairwise widening accumulate.
      sum[k] =
          vpadalq_u8(sum[k], vabdq_u8(src_vec, vrhaddq_u8(ref_vec, pred_vec)));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}
476*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 16-pixel-wide block against the rounded average of 'ref_ptr' and
// 'second_pred', accumulated over 'h' rows.
//
// 'src_ptr' and 'ref_ptr' step by their own strides after every row, while
// 'second_pred' is read as a packed buffer holding 16 bytes per row. The row
// loop is a do/while and therefore always executes once: 'h' must be >= 1.
static inline unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  uint16x8_t acc = vdupq_n_u16(0);

  int rows_left = h;
  do {
    const uint8x16_t src_row = vld1q_u8(src_ptr);
    const uint8x16_t ref_row = vld1q_u8(ref_ptr);
    const uint8x16_t pred_row = vld1q_u8(second_pred);

    // Rounding halving add of ref and second_pred, absolute difference
    // against the source, then pairwise widening accumulate into 16-bit lanes.
    acc = vpadalq_u8(acc, vabdq_u8(src_row, vrhaddq_u8(ref_row, pred_row)));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u16x8(acc);
}
501*77c1e3ccSAndroid Build Coastguard Worker
// SAD of an 8-pixel-wide block against the rounded average of 'ref_ptr' and
// 'second_pred', accumulated over 'h' rows.
//
// 'src_ptr' and 'ref_ptr' step by their own strides after every row, while
// 'second_pred' is read as a packed buffer holding 8 bytes per row. The row
// loop is a do/while and therefore always executes once: 'h' must be >= 1.
static inline unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int h,
                                           const uint8_t *second_pred) {
  uint16x8_t acc = vdupq_n_u16(0);

  int rows_left = h;
  do {
    const uint8x8_t src_row = vld1_u8(src_ptr);
    const uint8x8_t ref_row = vld1_u8(ref_ptr);
    const uint8x8_t pred_row = vld1_u8(second_pred);

    // vabal_u8 widens |src - avg| to 16 bits and adds it to the accumulator
    // in a single step, one lane per pixel.
    acc = vabal_u8(acc, src_row, vrhadd_u8(ref_row, pred_row));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 8;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u16x8(acc);
}
525*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 4-pixel-wide block against the rounded average of 'ref_ptr' and
// 'second_pred', processing two rows per iteration.
//
// The iteration count is h / 2 (integer division) and the loop is a do/while,
// so 'h' must be even and at least 2. 'second_pred' is consumed 8 bytes per
// iteration, while 'src_ptr' and 'ref_ptr' advance by two strides.
static inline unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int h,
                                           const uint8_t *second_pred) {
  uint16x8_t acc = vdupq_n_u16(0);

  int pairs_left = h / 2;
  do {
    // load_unaligned_u8 (mem_neon.h) takes the stride and returns 8 bytes;
    // presumably two consecutive 4-pixel rows - confirm in the header.
    const uint8x8_t src_rows = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t ref_rows = load_unaligned_u8(ref_ptr, ref_stride);
    const uint8x8_t pred_rows = vld1_u8(second_pred);

    // Widening absolute-difference accumulate against the rounded average.
    acc = vabal_u8(acc, src_rows, vrhadd_u8(ref_rows, pred_rows));

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
    second_pred += 8;
    pairs_left--;
  } while (pairs_left != 0);

  return horizontal_add_u16x8(acc);
}
549*77c1e3ccSAndroid Build Coastguard Worker
// Defines the public entry point aom_sad<width>x<height>_avg_neon for one
// block size. The generated function forwards to the width-specific helper
// sad<width>xh_avg_neon, passing the height as a compile-time constant.
#define SAD_WXH_AVG_NEON(width, height)                                  \
  unsigned int aom_sad##width##x##height##_avg_neon(                     \
      const uint8_t *src, int src_stride, const uint8_t *ref,            \
      int ref_stride, const uint8_t *second_pred) {                      \
    return sad##width##xh_avg_neon(src, src_stride, ref, ref_stride,     \
                                   (height), second_pred);               \
  }

// 4-wide blocks.
SAD_WXH_AVG_NEON(4, 4)
SAD_WXH_AVG_NEON(4, 8)

// 8-wide blocks.
SAD_WXH_AVG_NEON(8, 4)
SAD_WXH_AVG_NEON(8, 8)
SAD_WXH_AVG_NEON(8, 16)

// 16-wide blocks.
SAD_WXH_AVG_NEON(16, 8)
SAD_WXH_AVG_NEON(16, 16)
SAD_WXH_AVG_NEON(16, 32)

// 32-wide blocks.
SAD_WXH_AVG_NEON(32, 16)
SAD_WXH_AVG_NEON(32, 32)
SAD_WXH_AVG_NEON(32, 64)

// 64-wide blocks.
SAD_WXH_AVG_NEON(64, 32)
SAD_WXH_AVG_NEON(64, 64)
SAD_WXH_AVG_NEON(64, 128)

// 128-wide blocks.
SAD_WXH_AVG_NEON(128, 64)
SAD_WXH_AVG_NEON(128, 128)

// Block sizes that are compiled out of realtime-only builds.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_AVG_NEON(4, 16)
SAD_WXH_AVG_NEON(8, 32)
SAD_WXH_AVG_NEON(16, 4)
SAD_WXH_AVG_NEON(16, 64)
SAD_WXH_AVG_NEON(32, 8)
SAD_WXH_AVG_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_AVG_NEON
590*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 128-pixel-wide block against a distance-weighted blend of
// 'ref_ptr' and 'second_pred', accumulated over 'h' rows.
//
// The blend is produced by dist_wtd_avg_u8x16 (dist_wtd_avg_neon.h) from the
// fwd_offset / bck_offset fields of 'jcp_param'; see that header for how the
// weights pair with the operands. 'second_pred' is read as a packed buffer
// holding 128 bytes per row. The row loop is a do/while and therefore always
// executes once: 'h' must be >= 1.
static inline unsigned int dist_wtd_sad128xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Broadcast both offsets to every byte lane once, outside the row loop.
  const uint8x16_t fwd_vec = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck_vec = vdupq_n_u8(jcp_param->bck_offset);

  // Eight accumulators, one per 16-byte chunk of the row: this prevents
  // 16-bit overflow for large values of 'h' and also allows optimal UADALP
  // instruction throughput on CPUs that have either 2 or 4 Neon pipes.
  uint16x8_t sum[8];
  for (int k = 0; k < 8; k++) {
    sum[k] = vdupq_n_u16(0);
  }

  int rows_left = h;
  do {
    for (int k = 0; k < 8; k++) {
      const int x = 16 * k;
      const uint8x16_t src_vec = vld1q_u8(src_ptr + x);
      const uint8x16_t ref_vec = vld1q_u8(ref_ptr + x);
      const uint8x16_t pred_vec = vld1q_u8(second_pred + x);
      const uint8x16_t blend =
          dist_wtd_avg_u8x16(pred_vec, ref_vec, bck_vec, fwd_vec);

      // Absolute difference against the source, pairwise-widened into the
      // chunk's 16-bit accumulator.
      sum[k] = vpadalq_u8(sum[k], vabdq_u8(src_vec, blend));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 128;
    rows_left--;
  } while (rows_left != 0);

  // Fold the 16-bit accumulators into 32-bit lanes before the final
  // horizontal reduction.
  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
  for (int k = 1; k < 8; k++) {
    sum_u32 = vpadalq_u16(sum_u32, sum[k]);
  }

  return horizontal_add_u32x4(sum_u32);
}
677*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 64-pixel-wide block against a distance-weighted blend of
// 'ref_ptr' and 'second_pred', accumulated over 'h' rows.
//
// The blend is produced by dist_wtd_avg_u8x16 (dist_wtd_avg_neon.h) from the
// fwd_offset / bck_offset fields of 'jcp_param'; see that header for how the
// weights pair with the operands. 'second_pred' is read as a packed buffer
// holding 64 bytes per row. The row loop is a do/while and therefore always
// executes once: 'h' must be >= 1.
static inline unsigned int dist_wtd_sad64xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Broadcast both offsets to every byte lane once, outside the row loop.
  const uint8x16_t fwd_vec = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck_vec = vdupq_n_u8(jcp_param->bck_offset);

  // One 16-bit accumulator per 16-byte chunk of the row.
  uint16x8_t sum[4];
  for (int k = 0; k < 4; k++) {
    sum[k] = vdupq_n_u16(0);
  }

  int rows_left = h;
  do {
    for (int k = 0; k < 4; k++) {
      const int x = 16 * k;
      const uint8x16_t src_vec = vld1q_u8(src_ptr + x);
      const uint8x16_t ref_vec = vld1q_u8(ref_ptr + x);
      const uint8x16_t pred_vec = vld1q_u8(second_pred + x);
      const uint8x16_t blend =
          dist_wtd_avg_u8x16(pred_vec, ref_vec, bck_vec, fwd_vec);

      // Absolute difference against the source, pairwise-widened into the
      // chunk's 16-bit accumulator.
      sum[k] = vpadalq_u8(sum[k], vabdq_u8(src_vec, blend));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 64;
    rows_left--;
  } while (rows_left != 0);

  // Fold the 16-bit accumulators into 32-bit lanes before the final
  // horizontal reduction.
  uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
  for (int k = 1; k < 4; k++) {
    sum_u32 = vpadalq_u16(sum_u32, sum[k]);
  }

  return horizontal_add_u32x4(sum_u32);
}
728*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 32-pixel-wide block against a distance-weighted blend of
// 'ref_ptr' and 'second_pred', accumulated over 'h' rows.
//
// The blend is produced by dist_wtd_avg_u8x16 (dist_wtd_avg_neon.h) from the
// fwd_offset / bck_offset fields of 'jcp_param'; see that header for how the
// weights pair with the operands. 'second_pred' is read as a packed buffer
// holding 32 bytes per row. The row loop is a do/while and therefore always
// executes once: 'h' must be >= 1.
static inline unsigned int dist_wtd_sad32xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Broadcast both offsets to every byte lane once, outside the row loop.
  const uint8x16_t fwd_vec = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck_vec = vdupq_n_u8(jcp_param->bck_offset);

  // One 16-bit accumulator per 16-byte half of the row. Each lane gains at
  // most 2 * 255 per row, so for the heights produced by
  // DIST_WTD_SAD_WXH_AVG_NEON for this width (up to 64) the final vaddq_u16 of
  // both accumulators cannot wrap.
  uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };

  int rows_left = h;
  do {
    for (int k = 0; k < 2; k++) {
      const int x = 16 * k;
      const uint8x16_t src_vec = vld1q_u8(src_ptr + x);
      const uint8x16_t ref_vec = vld1q_u8(ref_ptr + x);
      const uint8x16_t pred_vec = vld1q_u8(second_pred + x);
      const uint8x16_t blend =
          dist_wtd_avg_u8x16(pred_vec, ref_vec, bck_vec, fwd_vec);

      sum[k] = vpadalq_u8(sum[k], vabdq_u8(src_vec, blend));
    }

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 32;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
}
759*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 16-pixel-wide block against a distance-weighted blend of
// 'ref_ptr' and 'second_pred', accumulated over 'h' rows.
//
// The blend is produced by dist_wtd_avg_u8x16 (dist_wtd_avg_neon.h) from the
// fwd_offset / bck_offset fields of 'jcp_param'; see that header for how the
// weights pair with the operands. 'second_pred' is read as a packed buffer
// holding 16 bytes per row. The row loop is a do/while and therefore always
// executes once: 'h' must be >= 1.
static inline unsigned int dist_wtd_sad16xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Broadcast both offsets to every byte lane once, outside the row loop.
  const uint8x16_t fwd_vec = vdupq_n_u8(jcp_param->fwd_offset);
  const uint8x16_t bck_vec = vdupq_n_u8(jcp_param->bck_offset);
  uint16x8_t acc = vdupq_n_u16(0);

  int rows_left = h;
  do {
    const uint8x16_t src_row = vld1q_u8(src_ptr);
    const uint8x16_t ref_row = vld1q_u8(ref_ptr);
    const uint8x16_t pred_row = vld1q_u8(second_pred);

    const uint8x16_t blend =
        dist_wtd_avg_u8x16(pred_row, ref_row, bck_vec, fwd_vec);
    // Absolute difference against the source, pairwise-widened into the
    // 16-bit accumulator.
    acc = vpadalq_u8(acc, vabdq_u8(src_row, blend));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 16;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u16x8(acc);
}
784*77c1e3ccSAndroid Build Coastguard Worker
// SAD of an 8-pixel-wide block against a distance-weighted blend of
// 'ref_ptr' and 'second_pred', accumulated over 'h' rows.
//
// The blend is produced by dist_wtd_avg_u8x8 (dist_wtd_avg_neon.h) from the
// fwd_offset / bck_offset fields of 'jcp_param'; see that header for how the
// weights pair with the operands. 'second_pred' is read as a packed buffer
// holding 8 bytes per row. The row loop is a do/while and therefore always
// executes once: 'h' must be >= 1.
static inline unsigned int dist_wtd_sad8xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Broadcast both offsets to every byte lane once, outside the row loop.
  const uint8x8_t fwd_vec = vdup_n_u8(jcp_param->fwd_offset);
  const uint8x8_t bck_vec = vdup_n_u8(jcp_param->bck_offset);
  uint16x8_t acc = vdupq_n_u16(0);

  int rows_left = h;
  do {
    const uint8x8_t src_row = vld1_u8(src_ptr);
    const uint8x8_t ref_row = vld1_u8(ref_ptr);
    const uint8x8_t pred_row = vld1_u8(second_pred);

    // vabal_u8 widens |src - blend| to 16 bits and accumulates it in one step.
    acc = vabal_u8(acc, src_row,
                   dist_wtd_avg_u8x8(pred_row, ref_row, bck_vec, fwd_vec));

    src_ptr += src_stride;
    ref_ptr += ref_stride;
    second_pred += 8;
    rows_left--;
  } while (rows_left != 0);

  return horizontal_add_u16x8(acc);
}
808*77c1e3ccSAndroid Build Coastguard Worker
// SAD of a 4-pixel-wide block against a distance-weighted blend of 'ref_ptr'
// and 'second_pred', processing two rows per iteration.
//
// The iteration count is h / 2 (integer division) and the loop is a do/while,
// so 'h' must be even and at least 2. 'second_pred' is consumed 8 bytes per
// iteration, while 'src_ptr' and 'ref_ptr' advance by two strides. The blend
// comes from dist_wtd_avg_u8x8 (dist_wtd_avg_neon.h) using the fwd_offset /
// bck_offset fields of 'jcp_param'.
static inline unsigned int dist_wtd_sad4xh_avg_neon(
    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
    int ref_stride, int h, const uint8_t *second_pred,
    const DIST_WTD_COMP_PARAMS *jcp_param) {
  // Broadcast both offsets to every byte lane once, outside the row loop.
  const uint8x8_t fwd_vec = vdup_n_u8(jcp_param->fwd_offset);
  const uint8x8_t bck_vec = vdup_n_u8(jcp_param->bck_offset);
  uint16x8_t acc = vdupq_n_u16(0);

  int pairs_left = h / 2;
  do {
    // load_unaligned_u8 (mem_neon.h) takes the stride and returns 8 bytes;
    // presumably two consecutive 4-pixel rows - confirm in the header.
    const uint8x8_t src_rows = load_unaligned_u8(src_ptr, src_stride);
    const uint8x8_t ref_rows = load_unaligned_u8(ref_ptr, ref_stride);
    const uint8x8_t pred_rows = vld1_u8(second_pred);

    // vabal_u8 widens |src - blend| to 16 bits and accumulates it in one step.
    acc = vabal_u8(acc, src_rows,
                   dist_wtd_avg_u8x8(pred_rows, ref_rows, bck_vec, fwd_vec));

    src_ptr += 2 * src_stride;
    ref_ptr += 2 * ref_stride;
    second_pred += 8;
    pairs_left--;
  } while (pairs_left != 0);

  return horizontal_add_u16x8(acc);
}
833*77c1e3ccSAndroid Build Coastguard Worker
834*77c1e3ccSAndroid Build Coastguard Worker #define DIST_WTD_SAD_WXH_AVG_NEON(w, h) \
835*77c1e3ccSAndroid Build Coastguard Worker unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon( \
836*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
837*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
838*77c1e3ccSAndroid Build Coastguard Worker return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
839*77c1e3ccSAndroid Build Coastguard Worker second_pred, jcp_param); \
840*77c1e3ccSAndroid Build Coastguard Worker }
841*77c1e3ccSAndroid Build Coastguard Worker
842*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
843*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
844*77c1e3ccSAndroid Build Coastguard Worker
845*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
846*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
847*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
848*77c1e3ccSAndroid Build Coastguard Worker
849*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
850*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
851*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
852*77c1e3ccSAndroid Build Coastguard Worker
853*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
854*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
855*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
856*77c1e3ccSAndroid Build Coastguard Worker
857*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
858*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
859*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
860*77c1e3ccSAndroid Build Coastguard Worker
861*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
862*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
863*77c1e3ccSAndroid Build Coastguard Worker
864*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY
865*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
866*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
867*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
868*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
869*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
870*77c1e3ccSAndroid Build Coastguard Worker DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
871*77c1e3ccSAndroid Build Coastguard Worker #endif // !CONFIG_REALTIME_ONLY
872*77c1e3ccSAndroid Build Coastguard Worker
873*77c1e3ccSAndroid Build Coastguard Worker #undef DIST_WTD_SAD_WXH_AVG_NEON
874