xref: /aosp_15_r20/external/libaom/aom_dsp/arm/sadxd_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker 
// Folds the absolute differences of two 16-byte vectors into a running sum.
//
// vabdq_u8 yields the 16 per-byte absolute differences; vpadalq_u8 then adds
// each adjacent pair of them onto the matching 16-bit lane of *sad_sum. One
// call therefore raises any lane of *sad_sum by at most 2 * 255 = 510.
static inline void sad16_neon(uint8x16_t src, uint8x16_t ref,
                              uint16x8_t *const sad_sum) {
  *sad_sum = vpadalq_u8(*sad_sum, vabdq_u8(src, ref));
}
26*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of a w x h source block against three reference
// blocks, for widths that are processed 32 columns at a time.
//
//   src, src_stride  source block and its row pitch in bytes
//   ref, ref_stride  three reference block pointers sharing one row pitch
//   res              receives the three sums, res[k] for ref[k]
//   w                block width; the column loop advances in steps of 32
//   h                block height in rows
//   h_overflow       number of rows accumulated in 16-bit lanes before the
//                    partial sums are widened into the 32-bit totals
//
// Rows are handled in batches of at most h_overflow rows. Within a batch the
// differences are gathered in 16-bit accumulators; at the end of each batch
// those are pairwise-added into 32-bit accumulators and restarted from zero.
static inline void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *const ref[3],
                                        int ref_stride, uint32_t res[3], int w,
                                        int h, int h_overflow) {
  uint32x4_t total[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
  // Row index at which the current batch ends.
  int batch_end = (h < h_overflow) ? h : h_overflow;

  int ref_pos = 0;
  int row = 0;
  do {
    // Fresh 16-bit accumulators for this batch: one pair per reference, for
    // the low and high 16 columns of every 32-column step.
    uint16x8_t acc_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
    uint16x8_t acc_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };

    do {
      int col = 0;
      do {
        const uint8x16_t s_lo = vld1q_u8(src + col);
        const uint8x16_t s_hi = vld1q_u8(src + col + 16);

        for (int k = 0; k < 3; ++k) {
          const uint8_t *const r = ref[k] + ref_pos + col;
          sad16_neon(s_lo, vld1q_u8(r), &acc_lo[k]);
          sad16_neon(s_hi, vld1q_u8(r + 16), &acc_hi[k]);
        }

        col += 32;
      } while (col < w);

      src += src_stride;
      ref_pos += ref_stride;
      ++row;
    } while (row < batch_end);

    // Widen the batch's 16-bit partial sums into the 32-bit totals.
    for (int k = 0; k < 3; ++k) {
      total[k] = vpadalq_u16(vpadalq_u16(total[k], acc_lo[k]), acc_hi[k]);
    }

    batch_end += h_overflow;
  } while (row < h);

  for (int k = 0; k < 3; ++k) {
    res[k] = horizontal_add_u32x4(total[k]);
  }
}
74*77c1e3ccSAndroid Build Coastguard Worker 
// 128-wide, three-reference SAD: forwards to the generic wide kernel.
//
// A 128-column row takes four 32-column steps, each adding at most 510 to a
// 16-bit lane, i.e. at most 2040 per row. 32 rows give at most 65280, which
// still fits in 16 bits, so the partial sums are widened every 32 rows.
static inline void sad128xhx3d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[3], int ref_stride,
                                    uint32_t res[3], int h) {
  enum { kBlockWidth = 128, kRowsPerBatch = 32 };
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, kBlockWidth, h,
                       kRowsPerBatch);
}
80*77c1e3ccSAndroid Build Coastguard Worker 
// 64-wide, three-reference SAD: forwards to the generic wide kernel.
//
// A 64-column row takes two 32-column steps, each adding at most 510 to a
// 16-bit lane, i.e. at most 1020 per row. 64 rows give at most 65280, which
// still fits in 16 bits, so the partial sums are widened every 64 rows.
static inline void sad64xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  enum { kBlockWidth = 64, kRowsPerBatch = 64 };
  sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, kBlockWidth, h,
                       kRowsPerBatch);
}
86*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of a 32 x h source block against three
// reference blocks; res[k] receives the sum for ref[k].
//
// Each row is read as two 16-byte halves. The halves are accumulated in
// separate 16-bit accumulators for the whole block and only combined by
// horizontal_long_add_u16x8 at the end. Every lane grows by at most 510 per
// row, so the 16-bit lanes hold for heights up to 128 rows.
static inline void sad32xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  uint16x8_t acc_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
  uint16x8_t acc_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t s_lo = vld1q_u8(src);
    const uint8x16_t s_hi = vld1q_u8(src + 16);

    for (int k = 0; k < 3; ++k) {
      const uint8_t *const r = ref[k] + ref_pos;
      sad16_neon(s_lo, vld1q_u8(r), &acc_lo[k]);
      sad16_neon(s_hi, vld1q_u8(r + 16), &acc_hi[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    --rows_left;
  } while (rows_left != 0);

  for (int k = 0; k < 3; ++k) {
    res[k] = horizontal_long_add_u16x8(acc_lo[k], acc_hi[k]);
  }
}
114*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of a 16 x h source block against three
// reference blocks; res[k] receives the sum for ref[k].
//
// One 16-byte load covers a full row. Each 16-bit accumulator lane grows by
// at most 510 per row and is reduced by horizontal_add_u16x8 once all rows
// have been visited.
static inline void sad16xhx3d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[3], int ref_stride,
                                   uint32_t res[3], int h) {
  uint16x8_t acc[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_row = vld1q_u8(src);

    for (int k = 0; k < 3; ++k) {
      sad16_neon(src_row, vld1q_u8(ref[k] + ref_pos), &acc[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    --rows_left;
  } while (rows_left != 0);

  for (int k = 0; k < 3; ++k) {
    res[k] = horizontal_add_u16x8(acc[k]);
  }
}
136*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of an 8 x h source block against three
// reference blocks; res[k] receives the sum for ref[k].
//
// The first row is peeled out of the loop: vabdl_u8 produces the widened
// absolute differences directly, which initialises the 16-bit accumulators
// without a separate zeroing step. The remaining h - 1 rows are accumulated
// with vabal_u8 (absolute difference, widen, add).
//
// Precondition: h >= 2. The do/while loop below always runs at least once
// and counts down from h - 1, so a height of 1 would never terminate
// correctly.
static inline void sad8xhx3d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[3], int ref_stride,
                                  uint32_t res[3], int h) {
  assert(h >= 2);
  uint16x8_t sum[3];

  // Row 0: initialise the accumulators.
  uint8x8_t s = vld1_u8(src);
  sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
  sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
  sum[2] = vabdl_u8(s, vld1_u8(ref[2]));

  src += src_stride;
  int ref_offset = ref_stride;
  int i = h - 1;
  // Rows 1 .. h-1.
  do {
    s = vld1_u8(src);
    sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
    sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
    sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));

    src += src_stride;
    ref_offset += ref_stride;
  } while (--i != 0);

  res[0] = horizontal_add_u16x8(sum[0]);
  res[1] = horizontal_add_u16x8(sum[1]);
  res[2] = horizontal_add_u16x8(sum[2]);
}
164*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of a 4 x h source block against three
// reference blocks; res[k] receives the sum for ref[k].
//
// Rows are consumed two at a time: every load_unaligned_u8(ptr, stride) call
// fills one 8-byte vector (presumably 4 bytes from each of two consecutive
// rows -- the helper is declared outside this file), and the pointers then
// advance by two strides. The first row pair is peeled out of the loop so
// that vabdl_u8 can initialise the 16-bit accumulators; the remaining
// h / 2 - 1 pairs are accumulated with vabal_u8.
//
// Preconditions: h is even and h >= 4. The do/while loop below always runs
// at least once and counts down from h / 2 - 1, so a height of 2 would never
// terminate correctly.
static inline void sad4xhx3d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[3], int ref_stride,
                                  uint32_t res[3], int h) {
  assert(h % 2 == 0);
  assert(h >= 4);
  uint16x8_t sum[3];

  // Rows 0 and 1: initialise the accumulators.
  uint8x8_t s = load_unaligned_u8(src, src_stride);
  uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
  uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
  uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);

  sum[0] = vabdl_u8(s, r0);
  sum[1] = vabdl_u8(s, r1);
  sum[2] = vabdl_u8(s, r2);

  src += 2 * src_stride;
  int ref_offset = 2 * ref_stride;
  int i = (h / 2) - 1;
  // Remaining row pairs.
  do {
    s = load_unaligned_u8(src, src_stride);
    r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
    r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
    r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);

    sum[0] = vabal_u8(sum[0], s, r0);
    sum[1] = vabal_u8(sum[1], s, r1);
    sum[2] = vabal_u8(sum[2], s, r2);

    src += 2 * src_stride;
    ref_offset += 2 * ref_stride;
  } while (--i != 0);

  res[0] = horizontal_add_u16x8(sum[0]);
  res[1] = horizontal_add_u16x8(sum[1]);
  res[2] = horizontal_add_u16x8(sum[2]);
}
201*77c1e3ccSAndroid Build Coastguard Worker 
202*77c1e3ccSAndroid Build Coastguard Worker #define SAD_WXH_3D_NEON(w, h)                                                  \
203*77c1e3ccSAndroid Build Coastguard Worker   void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride,          \
204*77c1e3ccSAndroid Build Coastguard Worker                                   const uint8_t *const ref[4], int ref_stride, \
205*77c1e3ccSAndroid Build Coastguard Worker                                   uint32_t res[4]) {                           \
206*77c1e3ccSAndroid Build Coastguard Worker     sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h));            \
207*77c1e3ccSAndroid Build Coastguard Worker   }
208*77c1e3ccSAndroid Build Coastguard Worker 
209*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(4, 4)
210*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(4, 8)
211*77c1e3ccSAndroid Build Coastguard Worker 
212*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(8, 4)
213*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(8, 8)
214*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(8, 16)
215*77c1e3ccSAndroid Build Coastguard Worker 
216*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(16, 8)
217*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(16, 16)
218*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(16, 32)
219*77c1e3ccSAndroid Build Coastguard Worker 
220*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(32, 16)
221*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(32, 32)
222*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(32, 64)
223*77c1e3ccSAndroid Build Coastguard Worker 
224*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(64, 32)
225*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(64, 64)
226*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(64, 128)
227*77c1e3ccSAndroid Build Coastguard Worker 
228*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(128, 64)
229*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(128, 128)
230*77c1e3ccSAndroid Build Coastguard Worker 
231*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY
232*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(4, 16)
233*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(8, 32)
234*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(16, 4)
235*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(16, 64)
236*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(32, 8)
237*77c1e3ccSAndroid Build Coastguard Worker SAD_WXH_3D_NEON(64, 16)
238*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY
239*77c1e3ccSAndroid Build Coastguard Worker 
240*77c1e3ccSAndroid Build Coastguard Worker #undef SAD_WXH_3D_NEON
241*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of a w x h source block against four reference
// blocks, for widths that are processed 32 columns at a time.
//
//   src, src_stride  source block and its row pitch in bytes
//   ref, ref_stride  four reference block pointers sharing one row pitch
//   res              receives the four sums via a single 128-bit store
//   w                block width; the column loop advances in steps of 32
//   h                block height in rows
//   h_overflow       number of rows accumulated in 16-bit lanes before the
//                    partial sums are widened into the 32-bit totals
//
// Rows are handled in batches of at most h_overflow rows. Within a batch the
// differences are gathered in 16-bit accumulators; at the end of each batch
// those are pairwise-added into 32-bit accumulators and restarted from zero.
static inline void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *const ref[4],
                                        int ref_stride, uint32_t res[4], int w,
                                        int h, int h_overflow) {
  uint32x4_t total[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                          vdupq_n_u32(0) };
  // Row index at which the current batch ends.
  int batch_end = (h < h_overflow) ? h : h_overflow;

  int ref_pos = 0;
  int row = 0;
  do {
    // Fresh 16-bit accumulators for this batch: one pair per reference, for
    // the low and high 16 columns of every 32-column step.
    uint16x8_t acc_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                             vdupq_n_u16(0) };
    uint16x8_t acc_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                             vdupq_n_u16(0) };

    do {
      int col = 0;
      do {
        const uint8x16_t s_lo = vld1q_u8(src + col);
        const uint8x16_t s_hi = vld1q_u8(src + col + 16);

        for (int k = 0; k < 4; ++k) {
          const uint8_t *const r = ref[k] + ref_pos + col;
          sad16_neon(s_lo, vld1q_u8(r), &acc_lo[k]);
          sad16_neon(s_hi, vld1q_u8(r + 16), &acc_hi[k]);
        }

        col += 32;
      } while (col < w);

      src += src_stride;
      ref_pos += ref_stride;
      ++row;
    } while (row < batch_end);

    // Widen the batch's 16-bit partial sums into the 32-bit totals.
    for (int k = 0; k < 4; ++k) {
      total[k] = vpadalq_u16(vpadalq_u16(total[k], acc_lo[k]), acc_hi[k]);
    }

    batch_end += h_overflow;
  } while (row < h);

  vst1q_u32(res, horizontal_add_4d_u32x4(total));
}
294*77c1e3ccSAndroid Build Coastguard Worker 
// 128-wide, four-reference SAD: forwards to the generic wide kernel.
//
// A 128-column row takes four 32-column steps, each adding at most 510 to a
// 16-bit lane, i.e. at most 2040 per row. 32 rows give at most 65280, which
// still fits in 16 bits, so the partial sums are widened every 32 rows.
static inline void sad128xhx4d_neon(const uint8_t *src, int src_stride,
                                    const uint8_t *const ref[4], int ref_stride,
                                    uint32_t res[4], int h) {
  enum { kBlockWidth = 128, kRowsPerBatch = 32 };
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, kBlockWidth, h,
                       kRowsPerBatch);
}
300*77c1e3ccSAndroid Build Coastguard Worker 
// 64-wide, four-reference SAD: forwards to the generic wide kernel.
//
// A 64-column row takes two 32-column steps, each adding at most 510 to a
// 16-bit lane, i.e. at most 1020 per row. 64 rows give at most 65280, which
// still fits in 16 bits, so the partial sums are widened every 64 rows.
static inline void sad64xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  enum { kBlockWidth = 64, kRowsPerBatch = 64 };
  sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, kBlockWidth, h,
                       kRowsPerBatch);
}
306*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of a 32 x h source block against four
// reference blocks; the four sums are written to res with one 128-bit store.
//
// Each row is read as two 16-byte halves. The halves are accumulated in
// separate 16-bit accumulators for the whole block and only combined by
// horizontal_long_add_4d_u16x8 at the end. Every lane grows by at most 510
// per row, so the 16-bit lanes hold for heights up to 128 rows.
static inline void sad32xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  uint16x8_t acc_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                           vdupq_n_u16(0) };
  uint16x8_t acc_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                           vdupq_n_u16(0) };

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t s_lo = vld1q_u8(src);
    const uint8x16_t s_hi = vld1q_u8(src + 16);

    for (int k = 0; k < 4; ++k) {
      const uint8_t *const r = ref[k] + ref_pos;
      sad16_neon(s_lo, vld1q_u8(r), &acc_lo[k]);
      sad16_neon(s_hi, vld1q_u8(r + 16), &acc_hi[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    --rows_left;
  } while (rows_left != 0);

  vst1q_u32(res, horizontal_long_add_4d_u16x8(acc_lo, acc_hi));
}
336*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of a 16 x h source block against four
// reference blocks; the four sums are written to res with one 128-bit store.
//
// One 16-byte load covers a full row. The per-reference 16-bit accumulators
// (each lane growing by at most 510 per row) are pairwise-widened to 32 bits
// after the last row and then reduced by horizontal_add_4d_u32x4.
static inline void sad16xhx4d_neon(const uint8_t *src, int src_stride,
                                   const uint8_t *const ref[4], int ref_stride,
                                   uint32_t res[4], int h) {
  uint16x8_t acc[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                        vdupq_n_u16(0) };

  int ref_pos = 0;
  int rows_left = h;
  do {
    const uint8x16_t src_row = vld1q_u8(src);

    for (int k = 0; k < 4; ++k) {
      sad16_neon(src_row, vld1q_u8(ref[k] + ref_pos), &acc[k]);
    }

    src += src_stride;
    ref_pos += ref_stride;
    --rows_left;
  } while (rows_left != 0);

  uint32x4_t widened[4];
  for (int k = 0; k < 4; ++k) {
    widened[k] = vpaddlq_u16(acc[k]);
  }

  vst1q_u32(res, horizontal_add_4d_u32x4(widened));
}
364*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences of an 8 x h source block against four
// reference blocks; the four sums are written to res with one 128-bit store.
//
// The first row is peeled out of the loop: vabdl_u8 produces the widened
// absolute differences directly, which initialises the 16-bit accumulators
// without a separate zeroing step. The remaining h - 1 rows are accumulated
// with vabal_u8 (absolute difference, widen, add).
//
// Precondition: h >= 2. The do/while loop below always runs at least once
// and counts down from h - 1, so a height of 1 would never terminate
// correctly.
static inline void sad8xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  assert(h >= 2);
  uint16x8_t sum[4];

  // Row 0: initialise the accumulators.
  uint8x8_t s = vld1_u8(src);
  sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
  sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
  sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
  sum[3] = vabdl_u8(s, vld1_u8(ref[3]));

  src += src_stride;
  int ref_offset = ref_stride;
  int i = h - 1;
  // Rows 1 .. h-1.
  do {
    s = vld1_u8(src);
    sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
    sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
    sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
    sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));

    src += src_stride;
    ref_offset += ref_stride;
  } while (--i != 0);

  vst1q_u32(res, horizontal_add_4d_u16x8(sum));
}
392*77c1e3ccSAndroid Build Coastguard Worker 
// Sum of absolute differences between one 4-pixel-wide, h-row source block and
// four reference blocks, handling two rows per vector operation.
//
//   src, src_stride: first source row and the byte distance between rows.
//   ref, ref_stride: four reference block pointers sharing one row stride.
//   res:             receives the four results, res[k] belonging to ref[k].
//   h:               number of rows; h / 2 row pairs are processed, so h must
//                    be at least 2 (with h / 2 == 1 the loop below is skipped).
//
// NOTE(review): load_unaligned_u8(ptr, stride) is defined outside this view;
// it presumably packs the 4-byte rows at ptr and ptr + stride into one 8-byte
// vector, which matches the two-row stepping used here - confirm.
static inline void sad4xhx4d_neon(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4], int h) {
  uint16x8_t acc[4];

  // First row pair: the widening absolute difference initialises the
  // accumulators.
  uint8x8_t s = load_unaligned_u8(src, src_stride);
  for (int k = 0; k < 4; ++k) {
    acc[k] = vabdl_u8(s, load_unaligned_u8(ref[k], ref_stride));
  }

  // Every step consumes two rows of the source and of each reference.
  const int src_step = 2 * src_stride;
  const int ref_step = 2 * ref_stride;

  src += src_step;
  int pair_offset = ref_step;

  // Remaining h / 2 - 1 row pairs. The counter is tested before the body, so
  // nothing runs when only a single pair exists.
  for (int pairs_left = h / 2 - 1; pairs_left != 0; --pairs_left) {
    s = load_unaligned_u8(src, src_stride);
    for (int k = 0; k < 4; ++k) {
      const uint8x8_t r = load_unaligned_u8(ref[k] + pair_offset, ref_stride);
      acc[k] = vabal_u8(acc[k], s, r);
    }

    src += src_step;
    pair_offset += ref_step;
  }

  // NOTE(review): horizontal_add_4d_u16x8 is defined outside this view; it is
  // expected to reduce each accumulator to a single 32-bit total - confirm.
  vst1q_u32(res, horizontal_add_4d_u16x8(acc));
}
430*77c1e3ccSAndroid Build Coastguard Worker 
// Defines the public entry point aom_sad<w>x<h>x4d_neon for one block size.
// The generated function forwards its arguments unchanged to the width-specific
// kernel sad<w>xhx4d_neon, passing the block height h as the row count.
#define SAD_WXH_4D_NEON(w, h)                                                  \
  void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride,          \
                                  const uint8_t *const ref[4], int ref_stride, \
                                  uint32_t res[4]) {                           \
    sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h));            \
  }

// Block sizes that are always built, grouped by width.
SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)

SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)

SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)

SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)

SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)
SAD_WXH_4D_NEON(64, 128)

SAD_WXH_4D_NEON(128, 64)
SAD_WXH_4D_NEON(128, 128)

// Block sizes compiled only when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
SAD_WXH_4D_NEON(4, 16)
SAD_WXH_4D_NEON(8, 32)
SAD_WXH_4D_NEON(16, 4)
SAD_WXH_4D_NEON(16, 64)
SAD_WXH_4D_NEON(32, 8)
SAD_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_WXH_4D_NEON
470*77c1e3ccSAndroid Build Coastguard Worker 
// Defines the public entry point aom_sad_skip_<w>x<h>x4d_neon for one block
// size. The generated function calls the width-specific kernel
// sad<w>xhx4d_neon with both strides doubled and half the row count, so only
// every other row (starting with the first) is visited. Each of the four
// results is then doubled to compensate for the skipped rows.
#define SAD_SKIP_WXH_4D_NEON(w, h)                                          \
  void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
                                        const uint8_t *const ref[4],        \
                                        int ref_stride, uint32_t res[4]) {  \
    sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res,       \
                       (h) / 2);                                            \
    for (int k = 0; k < 4; ++k) {                                           \
      res[k] <<= 1;                                                         \
    }                                                                       \
  }

// Block sizes that are always built, grouped by width.
SAD_SKIP_WXH_4D_NEON(4, 4)
SAD_SKIP_WXH_4D_NEON(4, 8)

SAD_SKIP_WXH_4D_NEON(8, 4)
SAD_SKIP_WXH_4D_NEON(8, 8)
SAD_SKIP_WXH_4D_NEON(8, 16)

SAD_SKIP_WXH_4D_NEON(16, 8)
SAD_SKIP_WXH_4D_NEON(16, 16)
SAD_SKIP_WXH_4D_NEON(16, 32)

SAD_SKIP_WXH_4D_NEON(32, 16)
SAD_SKIP_WXH_4D_NEON(32, 32)
SAD_SKIP_WXH_4D_NEON(32, 64)

SAD_SKIP_WXH_4D_NEON(64, 32)
SAD_SKIP_WXH_4D_NEON(64, 64)
SAD_SKIP_WXH_4D_NEON(64, 128)

SAD_SKIP_WXH_4D_NEON(128, 64)
SAD_SKIP_WXH_4D_NEON(128, 128)

// Block sizes compiled only when CONFIG_REALTIME_ONLY is 0.
#if !CONFIG_REALTIME_ONLY
SAD_SKIP_WXH_4D_NEON(4, 16)
SAD_SKIP_WXH_4D_NEON(8, 32)
SAD_SKIP_WXH_4D_NEON(16, 4)
SAD_SKIP_WXH_4D_NEON(16, 64)
SAD_SKIP_WXH_4D_NEON(32, 8)
SAD_SKIP_WXH_4D_NEON(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#undef SAD_SKIP_WXH_4D_NEON
515