xref: /aosp_15_r20/external/libaom/aom_dsp/arm/variance_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
13*77c1e3ccSAndroid Build Coastguard Worker 
14*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/sum_neon.h"
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
18*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_dsp_rtcd.h"
20*77c1e3ccSAndroid Build Coastguard Worker 
// Computes, for a 4-pixel-wide block of height 'h', the sum of (src - ref)
// into '*sum' and the sum of squared differences into '*sse'.
//
// Each iteration advances both pointers by two rows and decrements the row
// counter by two, exiting only when it reaches exactly zero, so 'h' must be a
// positive even number.
static inline void variance_4xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride, int h,
                                     uint32_t *sse, int *sum) {
  // A signed 16-bit lane can absorb about 32767 / 255 ~= 128 differences
  // before it may overflow. The 8-lane accumulator takes two 4-wide rows per
  // iteration, which is why up to 256 rows are allowed here.
  assert(h <= 256);

  int16x8_t diff_acc = vdupq_n_s16(0);
  int32x4_t sq_acc = vdupq_n_s32(0);

  int rows_left = h;
  do {
    // NOTE(review): load_unaligned_u8() is defined outside this file; it is
    // presumably packing 4 bytes from each of two consecutive rows - confirm.
    const uint8x8_t src_px = load_unaligned_u8(src, src_stride);
    const uint8x8_t ref_px = load_unaligned_u8(ref, ref_stride);
    src += 2 * src_stride;
    ref += 2 * ref_stride;

    // Widening subtract wraps modulo 2^16; reinterpreting the lanes as signed
    // recovers the true difference in [-255, 255].
    const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(src_px, ref_px));
    const int16x4_t d_lo = vget_low_s16(d);
    const int16x4_t d_hi = vget_high_s16(d);

    diff_acc = vaddq_s16(diff_acc, d);
    sq_acc = vmlal_s16(sq_acc, d_lo, d_lo);
    sq_acc = vmlal_s16(sq_acc, d_hi, d_hi);

    rows_left -= 2;
  } while (rows_left != 0);

  *sum = horizontal_add_s16x8(diff_acc);
  *sse = (uint32_t)horizontal_add_s32x4(sq_acc);
}
50*77c1e3ccSAndroid Build Coastguard Worker 
// Computes, for an 8-pixel-wide block of height 'h', the sum of (src - ref)
// into '*sum' and the sum of squared differences into '*sse'.
//
// One row is consumed per iteration and the loop body runs before the counter
// is tested, so 'h' must be positive.
static inline void variance_8xh_neon(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride, int h,
                                     uint32_t *sse, int *sum) {
  // Every row adds one difference of magnitude <= 255 to each signed 16-bit
  // lane, so about 32767 / 255 ~= 128 rows fit before a lane may overflow.
  assert(h <= 128);

  int16x8_t diff_acc = vdupq_n_s16(0);
  // Two independent squared-difference accumulators: one fed by the low four
  // lanes of each row, one by the high four lanes.
  int32x4_t sq_acc_lo = vdupq_n_s32(0);
  int32x4_t sq_acc_hi = vdupq_n_s32(0);

  int rows_left = h;
  do {
    const uint8x8_t src_row = vld1_u8(src);
    const uint8x8_t ref_row = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;

    // Widening subtract wraps modulo 2^16; the signed reinterpretation yields
    // the true difference in [-255, 255].
    const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(src_row, ref_row));
    const int16x4_t d_lo = vget_low_s16(d);
    const int16x4_t d_hi = vget_high_s16(d);

    diff_acc = vaddq_s16(diff_acc, d);
    sq_acc_lo = vmlal_s16(sq_acc_lo, d_lo, d_lo);
    sq_acc_hi = vmlal_s16(sq_acc_hi, d_hi, d_hi);

    rows_left--;
  } while (rows_left != 0);

  *sum = horizontal_add_s16x8(diff_acc);
  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sq_acc_lo, sq_acc_hi));
}
80*77c1e3ccSAndroid Build Coastguard Worker 
// Computes, for a 16-pixel-wide block of height 'h', the sum of (src - ref)
// into '*sum' and the sum of squared differences into '*sse'.
//
// One row is consumed per iteration and the loop body runs before the counter
// is tested, so 'h' must be positive.
static inline void variance_16xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride, int h,
                                      uint32_t *sse, int *sum) {
  // The left and right halves of a row go to separate 16-bit accumulators, so
  // each lane still receives one difference (magnitude <= 255) per row:
  // about 32767 / 255 ~= 128 rows fit before a lane may overflow.
  assert(h <= 128);

  int16x8_t diff_acc_l = vdupq_n_s16(0);
  int16x8_t diff_acc_r = vdupq_n_s16(0);
  int32x4_t sq_acc0 = vdupq_n_s32(0);
  int32x4_t sq_acc1 = vdupq_n_s32(0);

  int rows_left = h;
  do {
    const uint8x16_t src_row = vld1q_u8(src);
    const uint8x16_t ref_row = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;

    // Signed 16-bit differences for pixels 0..7 and 8..15 of the row.
    const int16x8_t d_l = vreinterpretq_s16_u16(
        vsubl_u8(vget_low_u8(src_row), vget_low_u8(ref_row)));
    const int16x8_t d_r = vreinterpretq_s16_u16(
        vsubl_u8(vget_high_u8(src_row), vget_high_u8(ref_row)));

    diff_acc_l = vaddq_s16(diff_acc_l, d_l);
    diff_acc_r = vaddq_s16(diff_acc_r, d_r);

    const int16x4_t d_l_lo = vget_low_s16(d_l);
    const int16x4_t d_l_hi = vget_high_s16(d_l);
    const int16x4_t d_r_lo = vget_low_s16(d_r);
    const int16x4_t d_r_hi = vget_high_s16(d_r);

    sq_acc0 = vmlal_s16(sq_acc0, d_l_lo, d_l_lo);
    sq_acc1 = vmlal_s16(sq_acc1, d_l_hi, d_l_hi);
    sq_acc0 = vmlal_s16(sq_acc0, d_r_lo, d_r_lo);
    sq_acc1 = vmlal_s16(sq_acc1, d_r_hi, d_r_hi);

    rows_left--;
  } while (rows_left != 0);

  *sum = horizontal_add_s16x8(vaddq_s16(diff_acc_l, diff_acc_r));
  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sq_acc0, sq_acc1));
}
120*77c1e3ccSAndroid Build Coastguard Worker 
// Computes, for a 'w' x 'h' block, the sum of (src - ref) into '*sum' and the
// sum of squared differences into '*sse'.
//
// Rows are read in 16-byte chunks until at least 'w' bytes are covered, so 'w'
// is expected to be a multiple of 16. The row loop runs before its counter is
// tested, so 'h' must be positive.
//
// 'h_limit' is the number of 'w'-width rows that can be accumulated in the
// 16-bit sum accumulators before they may overflow. Rows are processed in
// batches of at most 'h_limit'; after each batch the 16-bit partial sums are
// folded into a 32-bit accumulator and reset.
static inline void variance_large_neon(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       int w, int h, int h_limit, uint32_t *sse,
                                       int *sum) {
  int32x4_t sum_s32 = vdupq_n_s32(0);
  int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };

  // Index of the row at which the current batch ends (exclusive).
  int h_tmp = h > h_limit ? h_limit : h;

  int i = 0;
  do {
    // Fresh 16-bit accumulators for every batch.
    int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
    do {
      int j = 0;
      do {
        uint8x16_t s = vld1q_u8(src + j);
        uint8x16_t r = vld1q_u8(ref + j);

        // Widening subtract wraps modulo 2^16; the signed reinterpretation
        // yields the true differences in [-255, 255].
        int16x8_t diff_l =
            vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
        int16x8_t diff_h =
            vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));

        sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
        sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);

        sse_s32[0] =
            vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
        sse_s32[1] =
            vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
        sse_s32[0] =
            vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
        sse_s32[1] =
            vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));

        j += 16;
      } while (j < w);

      src += src_stride;
      ref += ref_stride;
      i++;
    } while (i < h_tmp);

    // Pairwise-add the 16-bit partial sums into the 32-bit accumulator.
    sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
    sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);

    // Advance to the next batch, but never past the last row: without the
    // clamp, a height that is not a multiple of 'h_limit' would make the final
    // batch read rows beyond the end of the block.
    h_tmp += h_limit;
    if (h_tmp > h) h_tmp = h;
  } while (i < h);

  *sum = horizontal_add_s32x4(sum_s32);
  *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
176*77c1e3ccSAndroid Build Coastguard Worker 
// Sum and sum-of-squares of (src - ref) for a 32-pixel-wide block of height
// 'h'. Delegates to variance_large_neon(), folding the 16-bit partial sums
// into 32 bits after every 64 rows.
static inline void variance_32xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride, int h,
                                      uint32_t *sse, int *sum) {
  const int block_width = 32;
  const int rows_per_batch = 64;
  variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
                      rows_per_batch, sse, sum);
}
182*77c1e3ccSAndroid Build Coastguard Worker 
// Sum and sum-of-squares of (src - ref) for a 64-pixel-wide block of height
// 'h'. Delegates to variance_large_neon(), folding the 16-bit partial sums
// into 32 bits after every 32 rows.
static inline void variance_64xh_neon(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride, int h,
                                      uint32_t *sse, int *sum) {
  const int block_width = 64;
  const int rows_per_batch = 32;
  variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
                      rows_per_batch, sse, sum);
}
188*77c1e3ccSAndroid Build Coastguard Worker 
// Sum and sum-of-squares of (src - ref) for a 128-pixel-wide block of height
// 'h'. Delegates to variance_large_neon(), folding the 16-bit partial sums
// into 32 bits after every 16 rows.
static inline void variance_128xh_neon(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       int h, uint32_t *sse, int *sum) {
  const int block_width = 128;
  const int rows_per_batch = 16;
  variance_large_neon(src, src_stride, ref, ref_stride, block_width, h,
                      rows_per_batch, sse, sum);
}
194*77c1e3ccSAndroid Build Coastguard Worker 
// Defines aom_variance<w>x<h>_neon(): stores the sum of squared differences
// between 'src' and 'ref' in '*sse' and returns
//   *sse - ((sum * sum) >> shift)
// where 'sum' is the sum of differences. In every instantiation below 'shift'
// equals log2(w * h), so the subtracted term is sum^2 / (w * h). The square is
// formed in 64 bits before shifting.
#define VARIANCE_WXH_NEON(w, h, shift)                                        \
  unsigned int aom_variance##w##x##h##_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      unsigned int *sse) {                                                    \
    int sum;                                                                  \
    variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum);    \
    const int64_t sum_sq = (int64_t)sum * sum;                                \
    const uint32_t mean_term = (uint32_t)(sum_sq >> shift);                   \
    return *sse - mean_term;                                                  \
  }

VARIANCE_WXH_NEON(4, 4, 4)
VARIANCE_WXH_NEON(4, 8, 5)

VARIANCE_WXH_NEON(8, 4, 5)
VARIANCE_WXH_NEON(8, 8, 6)
VARIANCE_WXH_NEON(8, 16, 7)

VARIANCE_WXH_NEON(16, 8, 7)
VARIANCE_WXH_NEON(16, 16, 8)
VARIANCE_WXH_NEON(16, 32, 9)

VARIANCE_WXH_NEON(32, 16, 9)
VARIANCE_WXH_NEON(32, 32, 10)
VARIANCE_WXH_NEON(32, 64, 11)

VARIANCE_WXH_NEON(64, 32, 11)
VARIANCE_WXH_NEON(64, 64, 12)
VARIANCE_WXH_NEON(64, 128, 13)

VARIANCE_WXH_NEON(128, 64, 13)
VARIANCE_WXH_NEON(128, 128, 14)

// Additional block sizes that are compiled out of realtime-only builds.
#if !CONFIG_REALTIME_ONLY
VARIANCE_WXH_NEON(4, 16, 6)
VARIANCE_WXH_NEON(8, 32, 8)
VARIANCE_WXH_NEON(16, 4, 6)
VARIANCE_WXH_NEON(16, 64, 10)
VARIANCE_WXH_NEON(32, 8, 8)
VARIANCE_WXH_NEON(64, 16, 10)
#endif

#undef VARIANCE_WXH_NEON
236*77c1e3ccSAndroid Build Coastguard Worker 
// TODO(yunqingwang): Perform variance of two/four 8x8 blocks similar to that of
// AVX2. Also, implement the NEON for variance computation present in this
// function.
//
// Processes four 8x8 blocks laid out side by side (block k starts k * 8 pixels
// to the right of 'src' / 'ref'). For each block k it writes the sum of
// squared differences to sse8x8[k], the sum of differences to sum8x8[k] and
// sse - ((sum * sum) >> 6) to var8x8[k]. The four SSE values and the four sums
// are added to '*tot_sse' and '*tot_sum', which are not reset here.
void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       uint32_t *sse8x8, int *sum8x8,
                                       unsigned int *tot_sse, int *tot_sum,
                                       uint32_t *var8x8) {
  // Per-block sum and SSE.
  for (int blk = 0; blk < 4; ++blk) {
    const int x_off = blk * 8;
    variance_8xh_neon(src + x_off, src_stride, ref + x_off, ref_stride, 8,
                      &sse8x8[blk], &sum8x8[blk]);
  }

  // Fold the per-block results into the caller's running totals.
  *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
  *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];

  // Per-block variance term; >> 6 divides the squared sum by the 64 pixels.
  for (int blk = 0; blk < 4; ++blk) {
    const int64_t sum_sq = (int64_t)sum8x8[blk] * sum8x8[blk];
    var8x8[blk] = sse8x8[blk] - (uint32_t)(sum_sq >> 6);
  }
}
257*77c1e3ccSAndroid Build Coastguard Worker 
// Processes two 16x16 blocks laid out side by side (block k starts k * 16
// pixels to the right of 'src' / 'ref'). For each block k it writes the sum of
// squared differences to sse16x16[k] and sse - ((sum * sum) >> 8) to
// var16x16[k]. The two SSE values and the two sums of differences are added to
// '*tot_sse' and '*tot_sum', which are not reset here.
void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         uint32_t *sse16x16,
                                         unsigned int *tot_sse, int *tot_sum,
                                         uint32_t *var16x16) {
  // The per-block sums are only needed locally.
  int blk_sum[2] = { 0 };

  // Per-block sum and SSE.
  for (int blk = 0; blk < 2; ++blk) {
    const int x_off = blk * 16;
    variance_16xh_neon(src + x_off, src_stride, ref + x_off, ref_stride, 16,
                       &sse16x16[blk], &blk_sum[blk]);
  }

  // Fold the per-block results into the caller's running totals.
  *tot_sse += sse16x16[0] + sse16x16[1];
  *tot_sum += blk_sum[0] + blk_sum[1];

  // Per-block variance term; >> 8 divides the squared sum by the 256 pixels.
  for (int blk = 0; blk < 2; ++blk) {
    const int64_t sum_sq = (int64_t)blk_sum[blk] * blk_sum[blk];
    var16x16[blk] = sse16x16[blk] - (uint32_t)(sum_sq >> 8);
  }
}
277*77c1e3ccSAndroid Build Coastguard Worker 
// Returns the sum of squared differences between an 8-pixel-wide 'src' block
// and 'ref' block of height 'h', and also stores it in '*sse'.
//
// Two rows are consumed per iteration and the loop exits only when the row
// counter reaches exactly zero, so 'h' must be a positive even number.
static inline unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       unsigned int *sse, int h) {
  // One accumulator per row of the pair handled in each iteration.
  int32x4_t acc_row0 = vdupq_n_s32(0);
  int32x4_t acc_row1 = vdupq_n_s32(0);

  int rows_left = h;
  do {
    const uint8x8_t s0 = vld1_u8(src);
    const uint8x8_t s1 = vld1_u8(src + src_stride);
    const uint8x8_t r0 = vld1_u8(ref);
    const uint8x8_t r1 = vld1_u8(ref + ref_stride);
    src += 2 * src_stride;
    ref += 2 * ref_stride;

    // Widening subtract wraps modulo 2^16; the signed reinterpretation yields
    // the true differences in [-255, 255].
    const int16x8_t d0 = vreinterpretq_s16_u16(vsubl_u8(s0, r0));
    const int16x8_t d1 = vreinterpretq_s16_u16(vsubl_u8(s1, r1));

    const int16x4_t d0_lo = vget_low_s16(d0);
    const int16x4_t d1_lo = vget_low_s16(d1);
    acc_row0 = vmlal_s16(acc_row0, d0_lo, d0_lo);
    acc_row1 = vmlal_s16(acc_row1, d1_lo, d1_lo);

    const int16x4_t d0_hi = vget_high_s16(d0);
    const int16x4_t d1_hi = vget_high_s16(d1);
    acc_row0 = vmlal_s16(acc_row0, d0_hi, d0_hi);
    acc_row1 = vmlal_s16(acc_row1, d1_hi, d1_hi);

    rows_left -= 2;
  } while (rows_left != 0);

  // Squares are non-negative, so the lanes are reduced as unsigned values.
  const int32x4_t acc = vaddq_s32(acc_row0, acc_row1);
  const unsigned int total = horizontal_add_u32x4(vreinterpretq_u32_s32(acc));

  *sse = total;
  return total;
}
318*77c1e3ccSAndroid Build Coastguard Worker 
// Returns the sum of squared differences between a 16-pixel-wide 'src' block
// and 'ref' block of height 'h', and also stores it in '*sse'.
//
// Two rows are consumed per iteration and the loop exits only when the row
// counter reaches exactly zero, so 'h' must be a positive even number.
static inline unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse, int h) {
  // One accumulator per 8-pixel half of each of the two rows handled in an
  // iteration: row 0 left/right, then row 1 left/right.
  int32x4_t acc_r0_l = vdupq_n_s32(0);
  int32x4_t acc_r0_r = vdupq_n_s32(0);
  int32x4_t acc_r1_l = vdupq_n_s32(0);
  int32x4_t acc_r1_r = vdupq_n_s32(0);

  int rows_left = h;
  do {
    const uint8x16_t s0 = vld1q_u8(src);
    const uint8x16_t s1 = vld1q_u8(src + src_stride);
    const uint8x16_t r0 = vld1q_u8(ref);
    const uint8x16_t r1 = vld1q_u8(ref + ref_stride);
    src += 2 * src_stride;
    ref += 2 * ref_stride;

    // Widening subtract wraps modulo 2^16; the signed reinterpretation yields
    // the true differences in [-255, 255].
    const int16x8_t d_r0_l =
        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s0), vget_low_u8(r0)));
    const int16x8_t d_r0_r =
        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s0), vget_high_u8(r0)));
    const int16x8_t d_r1_l =
        vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s1), vget_low_u8(r1)));
    const int16x8_t d_r1_r =
        vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s1), vget_high_u8(r1)));

    // Low four lanes of every difference vector.
    const int16x4_t r0_l_lo = vget_low_s16(d_r0_l);
    const int16x4_t r0_r_lo = vget_low_s16(d_r0_r);
    acc_r0_l = vmlal_s16(acc_r0_l, r0_l_lo, r0_l_lo);
    acc_r0_r = vmlal_s16(acc_r0_r, r0_r_lo, r0_r_lo);

    const int16x4_t r1_l_lo = vget_low_s16(d_r1_l);
    const int16x4_t r1_r_lo = vget_low_s16(d_r1_r);
    acc_r1_l = vmlal_s16(acc_r1_l, r1_l_lo, r1_l_lo);
    acc_r1_r = vmlal_s16(acc_r1_r, r1_r_lo, r1_r_lo);

    // High four lanes of every difference vector.
    const int16x4_t r0_l_hi = vget_high_s16(d_r0_l);
    const int16x4_t r0_r_hi = vget_high_s16(d_r0_r);
    acc_r0_l = vmlal_s16(acc_r0_l, r0_l_hi, r0_l_hi);
    acc_r0_r = vmlal_s16(acc_r0_r, r0_r_hi, r0_r_hi);

    const int16x4_t r1_l_hi = vget_high_s16(d_r1_l);
    const int16x4_t r1_r_hi = vget_high_s16(d_r1_r);
    acc_r1_l = vmlal_s16(acc_r1_l, r1_l_hi, r1_l_hi);
    acc_r1_r = vmlal_s16(acc_r1_r, r1_r_hi, r1_r_hi);

    rows_left -= 2;
  } while (rows_left != 0);

  // Combine the four accumulators pairwise, then reduce across lanes. Squares
  // are non-negative, so the lanes are reduced as unsigned values.
  const int32x4_t acc_row0 = vaddq_s32(acc_r0_l, acc_r0_r);
  const int32x4_t acc_row1 = vaddq_s32(acc_r1_l, acc_r1_r);
  const int32x4_t acc = vaddq_s32(acc_row0, acc_row1);
  const unsigned int total = horizontal_add_u32x4(vreinterpretq_u32_s32(acc));

  *sse = total;
  return total;
}
374*77c1e3ccSAndroid Build Coastguard Worker 
// Defines aom_mse<w>x<h>_neon() as a fixed-height call to mse<w>xh_neon():
// the sum of squared differences between 'src' and 'ref' is both stored in
// '*sse' and returned.
#define MSE_WXH_NEON(w, h)                                                  \
  unsigned int aom_mse##w##x##h##_neon(                                     \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride, unsigned int *sse) {                                  \
    const unsigned int total =                                              \
        mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h);          \
    return total;                                                           \
  }

MSE_WXH_NEON(8, 8)
MSE_WXH_NEON(8, 16)

MSE_WXH_NEON(16, 8)
MSE_WXH_NEON(16, 16)

#undef MSE_WXH_NEON
389*77c1e3ccSAndroid Build Coastguard Worker 
mse_accumulate_u16_u8_8x2(uint64x2_t sum,uint16x8_t s0,uint16x8_t s1,uint8x8_t d0,uint8x8_t d1)390*77c1e3ccSAndroid Build Coastguard Worker static inline uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum,
391*77c1e3ccSAndroid Build Coastguard Worker                                                    uint16x8_t s0, uint16x8_t s1,
392*77c1e3ccSAndroid Build Coastguard Worker                                                    uint8x8_t d0, uint8x8_t d1) {
393*77c1e3ccSAndroid Build Coastguard Worker   int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0));
394*77c1e3ccSAndroid Build Coastguard Worker   int16x8_t e1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1));
395*77c1e3ccSAndroid Build Coastguard Worker 
396*77c1e3ccSAndroid Build Coastguard Worker   int32x4_t mse = vmull_s16(vget_low_s16(e0), vget_low_s16(e0));
397*77c1e3ccSAndroid Build Coastguard Worker   mse = vmlal_s16(mse, vget_high_s16(e0), vget_high_s16(e0));
398*77c1e3ccSAndroid Build Coastguard Worker   mse = vmlal_s16(mse, vget_low_s16(e1), vget_low_s16(e1));
399*77c1e3ccSAndroid Build Coastguard Worker   mse = vmlal_s16(mse, vget_high_s16(e1), vget_high_s16(e1));
400*77c1e3ccSAndroid Build Coastguard Worker 
401*77c1e3ccSAndroid Build Coastguard Worker   return vpadalq_u32(sum, vreinterpretq_u32_s32(mse));
402*77c1e3ccSAndroid Build Coastguard Worker }
403*77c1e3ccSAndroid Build Coastguard Worker 
mse_wxh_16bit(uint8_t * dst,int dstride,const uint16_t * src,int sstride,int w,int h)404*77c1e3ccSAndroid Build Coastguard Worker static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src,
405*77c1e3ccSAndroid Build Coastguard Worker                                 int sstride, int w, int h) {
406*77c1e3ccSAndroid Build Coastguard Worker   assert((w == 8 || w == 4) && (h == 8 || h == 4));
407*77c1e3ccSAndroid Build Coastguard Worker 
408*77c1e3ccSAndroid Build Coastguard Worker   uint64x2_t sum = vdupq_n_u64(0);
409*77c1e3ccSAndroid Build Coastguard Worker 
410*77c1e3ccSAndroid Build Coastguard Worker   if (w == 8) {
411*77c1e3ccSAndroid Build Coastguard Worker     do {
412*77c1e3ccSAndroid Build Coastguard Worker       uint8x8_t d0 = vld1_u8(dst + 0 * dstride);
413*77c1e3ccSAndroid Build Coastguard Worker       uint8x8_t d1 = vld1_u8(dst + 1 * dstride);
414*77c1e3ccSAndroid Build Coastguard Worker       uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
415*77c1e3ccSAndroid Build Coastguard Worker       uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
416*77c1e3ccSAndroid Build Coastguard Worker 
417*77c1e3ccSAndroid Build Coastguard Worker       sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
418*77c1e3ccSAndroid Build Coastguard Worker 
419*77c1e3ccSAndroid Build Coastguard Worker       dst += 2 * dstride;
420*77c1e3ccSAndroid Build Coastguard Worker       src += 2 * sstride;
421*77c1e3ccSAndroid Build Coastguard Worker       h -= 2;
422*77c1e3ccSAndroid Build Coastguard Worker     } while (h != 0);
423*77c1e3ccSAndroid Build Coastguard Worker   } else {
424*77c1e3ccSAndroid Build Coastguard Worker     do {
425*77c1e3ccSAndroid Build Coastguard Worker       uint8x8_t d0 = load_unaligned_u8_4x2(dst + 0 * dstride, dstride);
426*77c1e3ccSAndroid Build Coastguard Worker       uint8x8_t d1 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride);
427*77c1e3ccSAndroid Build Coastguard Worker       uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
428*77c1e3ccSAndroid Build Coastguard Worker       uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
429*77c1e3ccSAndroid Build Coastguard Worker 
430*77c1e3ccSAndroid Build Coastguard Worker       sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
431*77c1e3ccSAndroid Build Coastguard Worker 
432*77c1e3ccSAndroid Build Coastguard Worker       dst += 4 * dstride;
433*77c1e3ccSAndroid Build Coastguard Worker       src += 4 * sstride;
434*77c1e3ccSAndroid Build Coastguard Worker       h -= 4;
435*77c1e3ccSAndroid Build Coastguard Worker     } while (h != 0);
436*77c1e3ccSAndroid Build Coastguard Worker   }
437*77c1e3ccSAndroid Build Coastguard Worker 
438*77c1e3ccSAndroid Build Coastguard Worker   return sum;
439*77c1e3ccSAndroid Build Coastguard Worker }
440*77c1e3ccSAndroid Build Coastguard Worker 
441*77c1e3ccSAndroid Build Coastguard Worker // Computes mse for a given block size. This function gets called for specific
442*77c1e3ccSAndroid Build Coastguard Worker // block sizes, which are 8x8, 8x4, 4x8 and 4x4.
aom_mse_wxh_16bit_neon(uint8_t * dst,int dstride,uint16_t * src,int sstride,int w,int h)443*77c1e3ccSAndroid Build Coastguard Worker uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
444*77c1e3ccSAndroid Build Coastguard Worker                                 int sstride, int w, int h) {
445*77c1e3ccSAndroid Build Coastguard Worker   return horizontal_add_u64x2(mse_wxh_16bit(dst, dstride, src, sstride, w, h));
446*77c1e3ccSAndroid Build Coastguard Worker }
447*77c1e3ccSAndroid Build Coastguard Worker 
448*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY
aom_get_mb_ss_neon(const int16_t * a)449*77c1e3ccSAndroid Build Coastguard Worker uint32_t aom_get_mb_ss_neon(const int16_t *a) {
450*77c1e3ccSAndroid Build Coastguard Worker   int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
451*77c1e3ccSAndroid Build Coastguard Worker 
452*77c1e3ccSAndroid Build Coastguard Worker   for (int i = 0; i < 256; i = i + 8) {
453*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t a_s16 = vld1q_s16(a + i);
454*77c1e3ccSAndroid Build Coastguard Worker 
455*77c1e3ccSAndroid Build Coastguard Worker     sse[0] = vmlal_s16(sse[0], vget_low_s16(a_s16), vget_low_s16(a_s16));
456*77c1e3ccSAndroid Build Coastguard Worker     sse[1] = vmlal_s16(sse[1], vget_high_s16(a_s16), vget_high_s16(a_s16));
457*77c1e3ccSAndroid Build Coastguard Worker   }
458*77c1e3ccSAndroid Build Coastguard Worker 
459*77c1e3ccSAndroid Build Coastguard Worker   return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1]));
460*77c1e3ccSAndroid Build Coastguard Worker }
461*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY
462*77c1e3ccSAndroid Build Coastguard Worker 
aom_mse_16xh_16bit_neon(uint8_t * dst,int dstride,uint16_t * src,int w,int h)463*77c1e3ccSAndroid Build Coastguard Worker uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
464*77c1e3ccSAndroid Build Coastguard Worker                                  int w, int h) {
465*77c1e3ccSAndroid Build Coastguard Worker   uint64x2_t sum = vdupq_n_u64(0);
466*77c1e3ccSAndroid Build Coastguard Worker 
467*77c1e3ccSAndroid Build Coastguard Worker   int num_blks = 16 / w;
468*77c1e3ccSAndroid Build Coastguard Worker   do {
469*77c1e3ccSAndroid Build Coastguard Worker     sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
470*77c1e3ccSAndroid Build Coastguard Worker     dst += w;
471*77c1e3ccSAndroid Build Coastguard Worker     src += w * h;
472*77c1e3ccSAndroid Build Coastguard Worker   } while (--num_blks != 0);
473*77c1e3ccSAndroid Build Coastguard Worker 
474*77c1e3ccSAndroid Build Coastguard Worker   return horizontal_add_u64x2(sum);
475*77c1e3ccSAndroid Build Coastguard Worker }
476