/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
#define AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_

#include <arm_neon.h>

#include "av1/common/restoration.h"

#define WIN_7 ((WIENER_WIN - 1) * 2)
#define WIN_CHROMA ((WIENER_WIN_CHROMA - 1) * 2)

// Aligned sizes for Wiener filters.
#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)

// Compute 8 values of M (cross correlation) for a single source pixel and
// accumulate.
static inline void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
                                   int16x8_t dgd_avg) {
  int32x4_t lo = vld1q_s32(M_s32 + 0);
  int32x4_t hi = vld1q_s32(M_s32 + 4);

  lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);

  vst1q_s32(M_s32 + 0, lo);
  vst1q_s32(M_s32 + 4, hi);
}

// Compute 8 values of M (cross correlation) for two source pixels and
// accumulate.
static inline void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
                                    int16x4_t src_avg1, int16x8_t dgd_avg0,
                                    int16x8_t dgd_avg1) {
  int32x4_t lo = vld1q_s32(M_s32 + 0);
  int32x4_t hi = vld1q_s32(M_s32 + 4);

  lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0);
  lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1);

  vst1q_s32(M_s32 + 0, lo);
  vst1q_s32(M_s32 + 4, hi);
}

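// Accumulate 4 rows at a time of the auto-covariance matrix H (row stride
// `width`, `height` rows) for a single source pixel. Only the upper triangle
// (j >= i) is computed; the lower triangle is mirrored afterwards (see
// diagonal_copy_stats_neon below).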
static inline void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
                                   int width, int height) {
  for (int i = 0; i < height; i += 4) {
    int16x4_t di = vld1_s16(dgd_avg + i);

    for (int j = i; j < width; j += 4) {
      int16x4_t dj = vld1_s16(dgd_avg + j);
      int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j);
      int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j);
      int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j);
      int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j);

      h0 = vmlal_lane_s16(h0, dj, di, 0);
      h1 = vmlal_lane_s16(h1, dj, di, 1);
      h2 = vmlal_lane_s16(h2, dj, di, 2);
      h3 = vmlal_lane_s16(h3, dj, di, 3);

      vst1q_s32(H_s32 + 0 * width + j, h0);
      vst1q_s32(H_s32 + 1 * width + j, h1);
      vst1q_s32(H_s32 + 2 * width + j, h2);
      vst1q_s32(H_s32 + 3 * width + j, h3);
    }
    H_s32 += 4 * width;
  }
}

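// As update_H_1pixel, but accumulating the contributions of two source pixels
// per call for the 5x5 (chroma) window; H has a row stride of
// WIENER_WIN2_REDUCED_ALIGN2 and 24 rows are updated here.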
static inline void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
                                        const int16_t *dgd_avg1) {
  for (int i = 0; i < 24; i += 4) {
    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
    int16x4_t di1 = vld1_s16(dgd_avg1 + i);

    for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
      int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);

      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
      h3 = vmlal_lane_s16(h3, dj1, di1, 3);

      vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0);
      vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1);
      vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2);
      vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3);
    }
    H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2;
  }
}

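// As update_H_5x5_2pixels, but for the 7x7 window (48 rows, row stride
// WIENER_WIN2_ALIGN2). The 4x4 block on the diagonal is handled first, then
// the remainder of each block row.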
static inline void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
                                        const int16_t *dgd_avg1) {
  for (int i = 0; i < 48; i += 4) {
    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
    int16x4_t di1 = vld1_s16(dgd_avg1 + i);

    int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i);

    h0 = vmlal_lane_s16(h0, di0, di0, 0);
    h0 = vmlal_lane_s16(h0, di1, di1, 0);
    h1 = vmlal_lane_s16(h1, di0, di0, 1);
    h1 = vmlal_lane_s16(h1, di1, di1, 1);
    h2 = vmlal_lane_s16(h2, di0, di0, 2);
    h2 = vmlal_lane_s16(h2, di1, di1, 2);
    h3 = vmlal_lane_s16(h3, di0, di0, 3);
    h3 = vmlal_lane_s16(h3, di1, di1, 3);

    vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0);
    vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1);
    vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2);
    vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3);

    for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) {
      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
      h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j);
      h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j);
      h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j);
      h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j);

      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
      h3 = vmlal_lane_s16(h3, dj1, di1, 3);

      vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
      vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
      vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
      vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
    }
    H_s32 += 4 * WIENER_WIN2_ALIGN2;
  }
}

// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data.
static inline void accumulate_and_clear(int64_t *dst, int32_t *src,
                                        int length) {
  do {
    int32x4_t s32 = vld1q_s32(src);
    vst1q_s32(src, vdupq_n_s32(0));
    src += 4;

    int64x2_t d_lo = vld1q_s64(dst + 0);
    int64x2_t d_hi = vld1q_s64(dst + 2);

    d_lo = vaddw_s32(d_lo, vget_low_s32(s32));
    d_hi = vaddw_s32(d_hi, vget_high_s32(s32));

    vst1q_s64(dst + 0, d_lo);
    vst1q_s64(dst + 2, d_hi);

    dst += 4;
    length -= 4;
  } while (length > 0);
}

// clang-format off
// Constant pool to act as a mask to zero n top elements in an int16x8_t vector.
// The index we load from depends on n.
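// For example, assuming 0 <= n <= 8, loading at an offset of (8 + n) gives a
// mask that zeroes the top n elements when ANDed with a vector:
//   int16x8_t mask = vld1q_s16(&mask_16bit[8 + n]);
//   int16x8_t masked = vandq_s16(vec, mask);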
static const int16_t mask_16bit[32] = {
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
       0,      0,      0,      0,      0,      0,      0,      0,
       0,      0,      0,      0,      0,      0,      0,      0,
};
// clang-format on

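// Multiply src and dgd elementwise, sum the widened products pairwise and
// accumulate the result into *sum.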
static inline void madd_neon_pairwise(int32x4_t *sum, const int16x8_t src,
                                      const int16x8_t dgd) {
  const int32x4_t sd =
      horizontal_add_2d_s32(vmull_s16(vget_low_s16(src), vget_low_s16(dgd)),
                            vmull_s16(vget_high_s16(src), vget_high_s16(dgd)));
  *sum = vaddq_s32(*sum, sd);
}

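// Widening multiply-accumulate: *sum += src * dgd across all 8 lanes.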
static inline void madd_neon(int32x4_t *sum, const int16x8_t src,
                             const int16x8_t dgd) {
  *sum = vmlal_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}

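// Widening multiply-subtract: *sum -= src * dgd across all 8 lanes.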
static inline void msub_neon(int32x4_t *sum, const int16x8_t src,
                             const int16x8_t dgd) {
  *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlsl_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}

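// Update a running delta: subtract the products of the leading row pair
// (src0, dgd0) and add the products of the trailing row pair (src1, dgd1).
// Low halves accumulate into *sum0, high halves into *sum1.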
static inline void compute_delta_step3(int32x4_t *sum0, int32x4_t *sum1,
                                       const int16x8_t src0,
                                       const int16x8_t src1,
                                       const int16x8_t dgd0,
                                       const int16x8_t dgd1) {
  *sum0 = vmlsl_s16(*sum0, vget_low_s16(src0), vget_low_s16(dgd0));
  *sum0 = vmlal_s16(*sum0, vget_low_s16(src1), vget_low_s16(dgd1));
  *sum1 = vmlsl_s16(*sum1, vget_high_s16(src0), vget_high_s16(dgd0));
  *sum1 = vmlal_s16(*sum1, vget_high_s16(src1), vget_high_s16(dgd1));
}

static inline int32x4_t hadd_four_32_neon(const int32x4_t src0,
                                          const int32x4_t src1,
                                          const int32x4_t src2,
                                          const int32x4_t src3) {
  int32x4_t src[4] = { src0, src1, src2, src3 };
  return horizontal_add_4d_s32x4(src);
}

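// dst[0..3] = src[0..3] + delta, widening the 32-bit deltas to 64 bits.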
static inline void update_4_stats_neon(const int64_t *const src,
                                       const int32x4_t delta,
                                       int64_t *const dst) {
  const int64x2_t s1 = vld1q_s64(src);
  const int64x2_t s2 = vld1q_s64(src + 2);

  const int64x2_t d1 = vaddw_s32(s1, vget_low_s32(delta));
  const int64x2_t d2 = vaddw_s32(s2, vget_high_s32(delta));

  vst1q_s64(dst, d1);
  vst1q_s64(dst + 2, d2);
}

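// Shift both vectors in org down by one element and append a new value
// (loaded from src and src + width respectively) as the top lane.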
static inline void load_more_16_neon(const int16_t *const src,
                                     const int32_t width,
                                     const int16x8_t org[2], int16x8_t dst[2]) {
  int16x8_t s0 = vld1q_dup_s16(src);
  int16x8_t s1 = vld1q_dup_s16(src + width);
  dst[0] = vextq_s16(org[0], s0, 1);
  dst[1] = vextq_s16(org[1], s1, 1);
}

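// Accumulate M (cross correlation with src) and H (auto covariance with dgd)
// statistics for the five rows of a 5x5 window column, 16 pixels at a time.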
static inline void stats_top_win5_neon(const int16x8_t src[2],
                                       const int16x8_t dgd[2],
                                       const int16_t *const d,
                                       const int32_t d_stride, int32x4_t *sum_m,
                                       int32x4_t *sum_h) {
  int16x8_t dgds[WIENER_WIN_CHROMA * 2];

  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8]);
  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9]);

  madd_neon(&sum_m[0], src[0], dgds[0]);
  madd_neon(&sum_m[0], src[1], dgds[1]);
  madd_neon(&sum_m[1], src[0], dgds[2]);
  madd_neon(&sum_m[1], src[1], dgds[3]);
  madd_neon(&sum_m[2], src[0], dgds[4]);
  madd_neon(&sum_m[2], src[1], dgds[5]);
  madd_neon(&sum_m[3], src[0], dgds[6]);
  madd_neon(&sum_m[3], src[1], dgds[7]);
  madd_neon(&sum_m[4], src[0], dgds[8]);
  madd_neon(&sum_m[4], src[1], dgds[9]);

  madd_neon(&sum_h[0], dgd[0], dgds[0]);
  madd_neon(&sum_h[0], dgd[1], dgds[1]);
  madd_neon(&sum_h[1], dgd[0], dgds[2]);
  madd_neon(&sum_h[1], dgd[1], dgds[3]);
  madd_neon(&sum_h[2], dgd[0], dgds[4]);
  madd_neon(&sum_h[2], dgd[1], dgds[5]);
  madd_neon(&sum_h[3], dgd[0], dgds[6]);
  madd_neon(&sum_h[3], dgd[1], dgds[7]);
  madd_neon(&sum_h[4], dgd[0], dgds[8]);
  madd_neon(&sum_h[4], dgd[1], dgds[9]);
}

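// Accumulate H statistics for the four remaining rows (starting one row down)
// of a 5x5 window column, 16 pixels at a time.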
static inline void stats_left_win5_neon(const int16x8_t src[2],
                                        const int16_t *d,
                                        const int32_t d_stride,
                                        int32x4_t *sum) {
  int16x8_t dgds[WIN_CHROMA];

  load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6]);
  load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7]);

  madd_neon(&sum[0], src[0], dgds[0]);
  madd_neon(&sum[0], src[1], dgds[1]);
  madd_neon(&sum[1], src[0], dgds[2]);
  madd_neon(&sum[1], src[1], dgds[3]);
  madd_neon(&sum[2], src[0], dgds[4]);
  madd_neon(&sum[2], src[1], dgds[5]);
  madd_neon(&sum[3], src[0], dgds[6]);
  madd_neon(&sum[3], src[1], dgds[7]);
}

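// Update the 4x4 grid of deltas for the square part of the 5x5 window:
// subtract the products of the leading rows (d_is, d_js) and add the products
// of the trailing rows (d_ie, d_je).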
static inline void derive_square_win5_neon(
    const int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
    const int16x8_t *d_je,
    int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
  msub_neon(&deltas[0][0], d_is[0], d_js[0]);
  msub_neon(&deltas[0][0], d_is[1], d_js[1]);
  msub_neon(&deltas[0][1], d_is[0], d_js[2]);
  msub_neon(&deltas[0][1], d_is[1], d_js[3]);
  msub_neon(&deltas[0][2], d_is[0], d_js[4]);
  msub_neon(&deltas[0][2], d_is[1], d_js[5]);
  msub_neon(&deltas[0][3], d_is[0], d_js[6]);
  msub_neon(&deltas[0][3], d_is[1], d_js[7]);

  msub_neon(&deltas[1][0], d_is[2], d_js[0]);
  msub_neon(&deltas[1][0], d_is[3], d_js[1]);
  msub_neon(&deltas[1][1], d_is[2], d_js[2]);
  msub_neon(&deltas[1][1], d_is[3], d_js[3]);
  msub_neon(&deltas[1][2], d_is[2], d_js[4]);
  msub_neon(&deltas[1][2], d_is[3], d_js[5]);
  msub_neon(&deltas[1][3], d_is[2], d_js[6]);
  msub_neon(&deltas[1][3], d_is[3], d_js[7]);

  msub_neon(&deltas[2][0], d_is[4], d_js[0]);
  msub_neon(&deltas[2][0], d_is[5], d_js[1]);
  msub_neon(&deltas[2][1], d_is[4], d_js[2]);
  msub_neon(&deltas[2][1], d_is[5], d_js[3]);
  msub_neon(&deltas[2][2], d_is[4], d_js[4]);
  msub_neon(&deltas[2][2], d_is[5], d_js[5]);
  msub_neon(&deltas[2][3], d_is[4], d_js[6]);
  msub_neon(&deltas[2][3], d_is[5], d_js[7]);

  msub_neon(&deltas[3][0], d_is[6], d_js[0]);
  msub_neon(&deltas[3][0], d_is[7], d_js[1]);
  msub_neon(&deltas[3][1], d_is[6], d_js[2]);
  msub_neon(&deltas[3][1], d_is[7], d_js[3]);
  msub_neon(&deltas[3][2], d_is[6], d_js[4]);
  msub_neon(&deltas[3][2], d_is[7], d_js[5]);
  msub_neon(&deltas[3][3], d_is[6], d_js[6]);
  msub_neon(&deltas[3][3], d_is[7], d_js[7]);

  madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
  madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
  madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
  madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
  madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
  madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
  madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
  madd_neon(&deltas[0][3], d_ie[1], d_je[7]);

  madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
  madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
  madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
  madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
  madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
  madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
  madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
  madd_neon(&deltas[1][3], d_ie[3], d_je[7]);

  madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
  madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
  madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
  madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
  madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
  madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
  madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
  madd_neon(&deltas[2][3], d_ie[5], d_je[7]);

  madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
  madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
  madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
  madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
  madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
  madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
  madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
  madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
}

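// Load the four leading rows and the four trailing rows (offset by `height`)
// of window columns i and j, 16 pixels per row, for derive_square_win5_neon.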
static inline void load_square_win5_neon(const int16_t *const di,
                                         const int16_t *const dj,
                                         const int32_t d_stride,
                                         const int32_t height, int16x8_t *d_is,
                                         int16x8_t *d_ie, int16x8_t *d_js,
                                         int16x8_t *d_je) {
  load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);
  load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);

  load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
               &d_ie[4], &d_ie[6]);
  load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7]);
  load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6]);
  load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7]);
}

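// As update_4_stats_neon, plus a scalar update of the fifth element.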
static inline void update_5_stats_neon(const int64_t *const src,
                                       const int32x4_t delta,
                                       const int64_t delta4,
                                       int64_t *const dst) {
  update_4_stats_neon(src + 0, delta, dst + 0);
  dst[4] = src[4] + delta4;
}

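// As compute_delta_step3, but with the two rows packed into a single vector:
// subtract the products of the low halves and add the products of the high
// halves.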
static inline void compute_delta_step3_two_lines(int32x4_t *sum,
                                                 const int16x8_t src,
                                                 const int16x8_t dgd) {
  *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}

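// Compute the "step 3" deltas for the 5x5 window, two rows per iteration,
// with each register holding a pair of 4-pixel rows.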
static inline void step3_win5_neon(const int16_t *d, const int32_t d_stride,
                                   const int32_t width, const int32_t height,
                                   int16x8_t *ds, int32x4_t *deltas) {
  int32_t y = height;
  do {
    ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width);
    ds[5] = load_unaligned_s16_4x2(d + 1 * d_stride, width);

    compute_delta_step3_two_lines(&deltas[0], ds[0], ds[0]);
    compute_delta_step3_two_lines(&deltas[1], ds[0], ds[1]);
    compute_delta_step3_two_lines(&deltas[2], ds[0], ds[2]);
    compute_delta_step3_two_lines(&deltas[3], ds[0], ds[3]);
    compute_delta_step3_two_lines(&deltas[4], ds[0], ds[4]);
    compute_delta_step3_two_lines(&deltas[0], ds[1], ds[1]);
    compute_delta_step3_two_lines(&deltas[1], ds[1], ds[2]);
    compute_delta_step3_two_lines(&deltas[2], ds[1], ds[3]);
    compute_delta_step3_two_lines(&deltas[3], ds[1], ds[4]);
    compute_delta_step3_two_lines(&deltas[4], ds[1], ds[5]);

    ds[0] = ds[2];
    ds[1] = ds[3];
    ds[2] = ds[4];
    ds[3] = ds[5];

    d += 2 * d_stride;
    y -= 2;
  } while (y);
}

static inline void step3_win5_oneline_neon(const int16_t **const d,
                                           const int32_t d_stride,
                                           const int32_t width,
                                           const int32_t height, int16x8_t *ds,
                                           int32x4_t *deltas) {
  int32_t y = height;
  do {
    ds[8] = vld1q_s16(*d);
    ds[9] = vld1q_s16(*d + width);

    compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
    compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
    compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
    compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);

    ds[0] = ds[2];
    ds[1] = ds[3];
    ds[2] = ds[4];
    ds[3] = ds[5];
    ds[4] = ds[6];
    ds[5] = ds[7];
    ds[6] = ds[8];
    ds[7] = ds[9];

    *d += d_stride;
  } while (--y);
}

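// Update the 10 deltas of the triangular (i == j) part of the 5x5 window:
// subtract the products of the leading rows and add the products of the
// trailing rows.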
static inline void derive_triangle_win5_neon(const int16x8_t *d_is,
                                             const int16x8_t *d_ie,
                                             int32x4_t *deltas) {
  msub_neon(&deltas[0], d_is[0], d_is[0]);
  msub_neon(&deltas[0], d_is[1], d_is[1]);
  msub_neon(&deltas[1], d_is[0], d_is[2]);
  msub_neon(&deltas[1], d_is[1], d_is[3]);
  msub_neon(&deltas[2], d_is[0], d_is[4]);
  msub_neon(&deltas[2], d_is[1], d_is[5]);
  msub_neon(&deltas[3], d_is[0], d_is[6]);
  msub_neon(&deltas[3], d_is[1], d_is[7]);
  msub_neon(&deltas[4], d_is[2], d_is[2]);
  msub_neon(&deltas[4], d_is[3], d_is[3]);
  msub_neon(&deltas[5], d_is[2], d_is[4]);
  msub_neon(&deltas[5], d_is[3], d_is[5]);
  msub_neon(&deltas[6], d_is[2], d_is[6]);
  msub_neon(&deltas[6], d_is[3], d_is[7]);
  msub_neon(&deltas[7], d_is[4], d_is[4]);
  msub_neon(&deltas[7], d_is[5], d_is[5]);
  msub_neon(&deltas[8], d_is[4], d_is[6]);
  msub_neon(&deltas[8], d_is[5], d_is[7]);
  msub_neon(&deltas[9], d_is[6], d_is[6]);
  msub_neon(&deltas[9], d_is[7], d_is[7]);

  madd_neon(&deltas[0], d_ie[0], d_ie[0]);
  madd_neon(&deltas[0], d_ie[1], d_ie[1]);
  madd_neon(&deltas[1], d_ie[0], d_ie[2]);
  madd_neon(&deltas[1], d_ie[1], d_ie[3]);
  madd_neon(&deltas[2], d_ie[0], d_ie[4]);
  madd_neon(&deltas[2], d_ie[1], d_ie[5]);
  madd_neon(&deltas[3], d_ie[0], d_ie[6]);
  madd_neon(&deltas[3], d_ie[1], d_ie[7]);
  madd_neon(&deltas[4], d_ie[2], d_ie[2]);
  madd_neon(&deltas[4], d_ie[3], d_ie[3]);
  madd_neon(&deltas[5], d_ie[2], d_ie[4]);
  madd_neon(&deltas[5], d_ie[3], d_ie[5]);
  madd_neon(&deltas[6], d_ie[2], d_ie[6]);
  madd_neon(&deltas[6], d_ie[3], d_ie[7]);
  madd_neon(&deltas[7], d_ie[4], d_ie[4]);
  madd_neon(&deltas[7], d_ie[5], d_ie[5]);
  madd_neon(&deltas[8], d_ie[4], d_ie[6]);
  madd_neon(&deltas[8], d_ie[5], d_ie[7]);
  madd_neon(&deltas[9], d_ie[6], d_ie[6]);
  madd_neon(&deltas[9], d_ie[7], d_ie[7]);
}

static inline void load_triangle_win5_neon(const int16_t *const di,
                                           const int32_t d_stride,
                                           const int32_t height,
                                           int16x8_t *d_is, int16x8_t *d_ie) {
  load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);

  load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
               &d_ie[4], &d_ie[6]);
  load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7]);
}

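// deltas[0..4] -= A[0] * B[0..4] and deltas[5..8] -= A[1..4] * B[0], widening
// each product to 32 bits.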
static inline void sub_deltas_step4(int16x8_t *A, int16x8_t *B,
                                    int32x4_t *deltas) {
  deltas[0] = vmlsl_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0]));
  deltas[0] = vmlsl_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0]));
  deltas[1] = vmlsl_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1]));
  deltas[1] = vmlsl_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1]));
  deltas[2] = vmlsl_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2]));
  deltas[2] = vmlsl_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2]));
  deltas[3] = vmlsl_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3]));
  deltas[3] = vmlsl_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3]));
  deltas[4] = vmlsl_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4]));
  deltas[4] = vmlsl_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4]));
  deltas[5] = vmlsl_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0]));
  deltas[5] = vmlsl_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0]));
  deltas[6] = vmlsl_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0]));
  deltas[6] = vmlsl_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0]));
  deltas[7] = vmlsl_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0]));
  deltas[7] = vmlsl_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0]));
  deltas[8] = vmlsl_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0]));
  deltas[8] = vmlsl_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0]));
}

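// As sub_deltas_step4, but accumulating instead of subtracting.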
static inline void add_deltas_step4(int16x8_t *A, int16x8_t *B,
                                    int32x4_t *deltas) {
  deltas[0] = vmlal_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0]));
  deltas[0] = vmlal_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0]));
  deltas[1] = vmlal_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1]));
  deltas[1] = vmlal_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1]));
  deltas[2] = vmlal_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2]));
  deltas[2] = vmlal_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2]));
  deltas[3] = vmlal_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3]));
  deltas[3] = vmlal_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3]));
  deltas[4] = vmlal_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4]));
  deltas[4] = vmlal_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4]));
  deltas[5] = vmlal_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0]));
  deltas[5] = vmlal_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0]));
  deltas[6] = vmlal_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0]));
  deltas[6] = vmlal_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0]));
  deltas[7] = vmlal_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0]));
  deltas[7] = vmlal_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0]));
  deltas[8] = vmlal_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0]));
  deltas[8] = vmlal_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0]));
}

static inline void stats_top_win7_neon(const int16x8_t src[2],
                                       const int16x8_t dgd[2],
                                       const int16_t *const d,
                                       const int32_t d_stride, int32x4_t *sum_m,
                                       int32x4_t *sum_h) {
  int16x8_t dgds[WIENER_WIN * 2];

  load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8], &dgds[10], &dgds[12]);
  load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9], &dgds[11], &dgds[13]);

  madd_neon(&sum_m[0], src[0], dgds[0]);
  madd_neon(&sum_m[0], src[1], dgds[1]);
  madd_neon(&sum_m[1], src[0], dgds[2]);
  madd_neon(&sum_m[1], src[1], dgds[3]);
  madd_neon(&sum_m[2], src[0], dgds[4]);
  madd_neon(&sum_m[2], src[1], dgds[5]);
  madd_neon(&sum_m[3], src[0], dgds[6]);
  madd_neon(&sum_m[3], src[1], dgds[7]);
  madd_neon(&sum_m[4], src[0], dgds[8]);
  madd_neon(&sum_m[4], src[1], dgds[9]);
  madd_neon(&sum_m[5], src[0], dgds[10]);
  madd_neon(&sum_m[5], src[1], dgds[11]);
  madd_neon(&sum_m[6], src[0], dgds[12]);
  madd_neon(&sum_m[6], src[1], dgds[13]);

  madd_neon(&sum_h[0], dgd[0], dgds[0]);
  madd_neon(&sum_h[0], dgd[1], dgds[1]);
  madd_neon(&sum_h[1], dgd[0], dgds[2]);
  madd_neon(&sum_h[1], dgd[1], dgds[3]);
  madd_neon(&sum_h[2], dgd[0], dgds[4]);
  madd_neon(&sum_h[2], dgd[1], dgds[5]);
  madd_neon(&sum_h[3], dgd[0], dgds[6]);
  madd_neon(&sum_h[3], dgd[1], dgds[7]);
  madd_neon(&sum_h[4], dgd[0], dgds[8]);
  madd_neon(&sum_h[4], dgd[1], dgds[9]);
  madd_neon(&sum_h[5], dgd[0], dgds[10]);
  madd_neon(&sum_h[5], dgd[1], dgds[11]);
  madd_neon(&sum_h[6], dgd[0], dgds[12]);
  madd_neon(&sum_h[6], dgd[1], dgds[13]);
}

static inline void derive_square_win7_neon(const int16x8_t *d_is,
                                           const int16x8_t *d_ie,
                                           const int16x8_t *d_js,
                                           const int16x8_t *d_je,
                                           int32x4_t deltas[][WIN_7]) {
  msub_neon(&deltas[0][0], d_is[0], d_js[0]);
  msub_neon(&deltas[0][0], d_is[1], d_js[1]);
  msub_neon(&deltas[0][1], d_is[0], d_js[2]);
  msub_neon(&deltas[0][1], d_is[1], d_js[3]);
  msub_neon(&deltas[0][2], d_is[0], d_js[4]);
  msub_neon(&deltas[0][2], d_is[1], d_js[5]);
  msub_neon(&deltas[0][3], d_is[0], d_js[6]);
  msub_neon(&deltas[0][3], d_is[1], d_js[7]);
  msub_neon(&deltas[0][4], d_is[0], d_js[8]);
  msub_neon(&deltas[0][4], d_is[1], d_js[9]);
  msub_neon(&deltas[0][5], d_is[0], d_js[10]);
  msub_neon(&deltas[0][5], d_is[1], d_js[11]);

  msub_neon(&deltas[1][0], d_is[2], d_js[0]);
  msub_neon(&deltas[1][0], d_is[3], d_js[1]);
  msub_neon(&deltas[1][1], d_is[2], d_js[2]);
  msub_neon(&deltas[1][1], d_is[3], d_js[3]);
  msub_neon(&deltas[1][2], d_is[2], d_js[4]);
  msub_neon(&deltas[1][2], d_is[3], d_js[5]);
  msub_neon(&deltas[1][3], d_is[2], d_js[6]);
  msub_neon(&deltas[1][3], d_is[3], d_js[7]);
  msub_neon(&deltas[1][4], d_is[2], d_js[8]);
  msub_neon(&deltas[1][4], d_is[3], d_js[9]);
  msub_neon(&deltas[1][5], d_is[2], d_js[10]);
  msub_neon(&deltas[1][5], d_is[3], d_js[11]);

  msub_neon(&deltas[2][0], d_is[4], d_js[0]);
  msub_neon(&deltas[2][0], d_is[5], d_js[1]);
  msub_neon(&deltas[2][1], d_is[4], d_js[2]);
  msub_neon(&deltas[2][1], d_is[5], d_js[3]);
  msub_neon(&deltas[2][2], d_is[4], d_js[4]);
  msub_neon(&deltas[2][2], d_is[5], d_js[5]);
  msub_neon(&deltas[2][3], d_is[4], d_js[6]);
  msub_neon(&deltas[2][3], d_is[5], d_js[7]);
  msub_neon(&deltas[2][4], d_is[4], d_js[8]);
  msub_neon(&deltas[2][4], d_is[5], d_js[9]);
  msub_neon(&deltas[2][5], d_is[4], d_js[10]);
  msub_neon(&deltas[2][5], d_is[5], d_js[11]);

  msub_neon(&deltas[3][0], d_is[6], d_js[0]);
  msub_neon(&deltas[3][0], d_is[7], d_js[1]);
  msub_neon(&deltas[3][1], d_is[6], d_js[2]);
  msub_neon(&deltas[3][1], d_is[7], d_js[3]);
  msub_neon(&deltas[3][2], d_is[6], d_js[4]);
  msub_neon(&deltas[3][2], d_is[7], d_js[5]);
  msub_neon(&deltas[3][3], d_is[6], d_js[6]);
  msub_neon(&deltas[3][3], d_is[7], d_js[7]);
  msub_neon(&deltas[3][4], d_is[6], d_js[8]);
  msub_neon(&deltas[3][4], d_is[7], d_js[9]);
  msub_neon(&deltas[3][5], d_is[6], d_js[10]);
  msub_neon(&deltas[3][5], d_is[7], d_js[11]);

  msub_neon(&deltas[4][0], d_is[8], d_js[0]);
  msub_neon(&deltas[4][0], d_is[9], d_js[1]);
  msub_neon(&deltas[4][1], d_is[8], d_js[2]);
  msub_neon(&deltas[4][1], d_is[9], d_js[3]);
  msub_neon(&deltas[4][2], d_is[8], d_js[4]);
  msub_neon(&deltas[4][2], d_is[9], d_js[5]);
  msub_neon(&deltas[4][3], d_is[8], d_js[6]);
  msub_neon(&deltas[4][3], d_is[9], d_js[7]);
  msub_neon(&deltas[4][4], d_is[8], d_js[8]);
  msub_neon(&deltas[4][4], d_is[9], d_js[9]);
  msub_neon(&deltas[4][5], d_is[8], d_js[10]);
  msub_neon(&deltas[4][5], d_is[9], d_js[11]);

  msub_neon(&deltas[5][0], d_is[10], d_js[0]);
  msub_neon(&deltas[5][0], d_is[11], d_js[1]);
  msub_neon(&deltas[5][1], d_is[10], d_js[2]);
  msub_neon(&deltas[5][1], d_is[11], d_js[3]);
  msub_neon(&deltas[5][2], d_is[10], d_js[4]);
  msub_neon(&deltas[5][2], d_is[11], d_js[5]);
  msub_neon(&deltas[5][3], d_is[10], d_js[6]);
  msub_neon(&deltas[5][3], d_is[11], d_js[7]);
  msub_neon(&deltas[5][4], d_is[10], d_js[8]);
  msub_neon(&deltas[5][4], d_is[11], d_js[9]);
  msub_neon(&deltas[5][5], d_is[10], d_js[10]);
  msub_neon(&deltas[5][5], d_is[11], d_js[11]);

  madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
  madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
  madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
  madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
  madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
  madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
  madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
  madd_neon(&deltas[0][3], d_ie[1], d_je[7]);
  madd_neon(&deltas[0][4], d_ie[0], d_je[8]);
  madd_neon(&deltas[0][4], d_ie[1], d_je[9]);
  madd_neon(&deltas[0][5], d_ie[0], d_je[10]);
  madd_neon(&deltas[0][5], d_ie[1], d_je[11]);

  madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
  madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
  madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
  madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
  madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
  madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
  madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
  madd_neon(&deltas[1][3], d_ie[3], d_je[7]);
  madd_neon(&deltas[1][4], d_ie[2], d_je[8]);
  madd_neon(&deltas[1][4], d_ie[3], d_je[9]);
  madd_neon(&deltas[1][5], d_ie[2], d_je[10]);
  madd_neon(&deltas[1][5], d_ie[3], d_je[11]);

  madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
  madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
  madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
  madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
  madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
  madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
  madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
  madd_neon(&deltas[2][3], d_ie[5], d_je[7]);
  madd_neon(&deltas[2][4], d_ie[4], d_je[8]);
  madd_neon(&deltas[2][4], d_ie[5], d_je[9]);
  madd_neon(&deltas[2][5], d_ie[4], d_je[10]);
  madd_neon(&deltas[2][5], d_ie[5], d_je[11]);

  madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
  madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
  madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
  madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
  madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
  madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
  madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
  madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
  madd_neon(&deltas[3][4], d_ie[6], d_je[8]);
  madd_neon(&deltas[3][4], d_ie[7], d_je[9]);
  madd_neon(&deltas[3][5], d_ie[6], d_je[10]);
  madd_neon(&deltas[3][5], d_ie[7], d_je[11]);

  madd_neon(&deltas[4][0], d_ie[8], d_je[0]);
  madd_neon(&deltas[4][0], d_ie[9], d_je[1]);
  madd_neon(&deltas[4][1], d_ie[8], d_je[2]);
  madd_neon(&deltas[4][1], d_ie[9], d_je[3]);
  madd_neon(&deltas[4][2], d_ie[8], d_je[4]);
  madd_neon(&deltas[4][2], d_ie[9], d_je[5]);
  madd_neon(&deltas[4][3], d_ie[8], d_je[6]);
  madd_neon(&deltas[4][3], d_ie[9], d_je[7]);
  madd_neon(&deltas[4][4], d_ie[8], d_je[8]);
  madd_neon(&deltas[4][4], d_ie[9], d_je[9]);
  madd_neon(&deltas[4][5], d_ie[8], d_je[10]);
  madd_neon(&deltas[4][5], d_ie[9], d_je[11]);

  madd_neon(&deltas[5][0], d_ie[10], d_je[0]);
  madd_neon(&deltas[5][0], d_ie[11], d_je[1]);
  madd_neon(&deltas[5][1], d_ie[10], d_je[2]);
  madd_neon(&deltas[5][1], d_ie[11], d_je[3]);
  madd_neon(&deltas[5][2], d_ie[10], d_je[4]);
  madd_neon(&deltas[5][2], d_ie[11], d_je[5]);
  madd_neon(&deltas[5][3], d_ie[10], d_je[6]);
  madd_neon(&deltas[5][3], d_ie[11], d_je[7]);
  madd_neon(&deltas[5][4], d_ie[10], d_je[8]);
  madd_neon(&deltas[5][4], d_ie[11], d_je[9]);
  madd_neon(&deltas[5][5], d_ie[10], d_je[10]);
  madd_neon(&deltas[5][5], d_ie[11], d_je[11]);
}

static inline void update_8_stats_neon(const int64_t *const src,
                                       const int32x4_t delta0,
                                       const int32x4_t delta1,
                                       int64_t *const dst) {
  update_4_stats_neon(src + 0, delta0, dst + 0);
  update_4_stats_neon(src + 4, delta1, dst + 4);
}

static inline void load_square_win7_neon(const int16_t *const di,
                                         const int16_t *const dj,
                                         const int32_t d_stride,
                                         const int32_t height, int16x8_t *d_is,
                                         int16x8_t *d_ie, int16x8_t *d_js,
                                         int16x8_t *d_je) {
  load_s16_8x6(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6],
               &d_is[8], &d_is[10]);
  load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7],
               &d_is[9], &d_is[11]);
  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
               &d_js[8], &d_js[10]);
  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
               &d_js[9], &d_js[11]);

  load_s16_8x6(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
               &d_ie[4], &d_ie[6], &d_ie[8], &d_ie[10]);
  load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]);
  load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6], &d_je[8], &d_je[10]);
  load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7], &d_je[9], &d_je[11]);
}

static inline void load_triangle_win7_neon(const int16_t *const di,
                                           const int32_t d_stride,
                                           const int32_t height,
                                           int16x8_t *d_is, int16x8_t *d_ie) {
  load_s16_8x6(di, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6], &d_is[8],
               &d_is[10]);
  load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7],
               &d_is[9], &d_is[11]);

  load_s16_8x6(di + height * d_stride, d_stride, &d_ie[0], &d_ie[2], &d_ie[4],
               &d_ie[6], &d_ie[8], &d_ie[10]);
  load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]);
}

static inline void stats_left_win7_neon(const int16x8_t src[2],
                                        const int16_t *d,
                                        const int32_t d_stride,
                                        int32x4_t *sum) {
  int16x8_t dgds[WIN_7];

  load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6], &dgds[8], &dgds[10]);
  load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7], &dgds[9], &dgds[11]);

  madd_neon(&sum[0], src[0], dgds[0]);
  madd_neon(&sum[0], src[1], dgds[1]);
  madd_neon(&sum[1], src[0], dgds[2]);
  madd_neon(&sum[1], src[1], dgds[3]);
  madd_neon(&sum[2], src[0], dgds[4]);
  madd_neon(&sum[2], src[1], dgds[5]);
  madd_neon(&sum[3], src[0], dgds[6]);
  madd_neon(&sum[3], src[1], dgds[7]);
  madd_neon(&sum[4], src[0], dgds[8]);
  madd_neon(&sum[4], src[1], dgds[9]);
  madd_neon(&sum[5], src[0], dgds[10]);
  madd_neon(&sum[5], src[1], dgds[11]);
}

static inline void step3_win7_neon(const int16_t *d, const int32_t d_stride,
                                   const int32_t width, const int32_t height,
                                   int16x8_t *ds, int32x4_t *deltas) {
  int32_t y = height;
  do {
    ds[12] = vld1q_s16(d);
    ds[13] = vld1q_s16(d + width);

    compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
    compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
    compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
    compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);
    compute_delta_step3(&deltas[9], &deltas[13], ds[0], ds[1], ds[10], ds[11]);
    compute_delta_step3(&deltas[10], &deltas[14], ds[0], ds[1], ds[12], ds[13]);

    ds[0] = ds[2];
    ds[1] = ds[3];
    ds[2] = ds[4];
    ds[3] = ds[5];
    ds[4] = ds[6];
    ds[5] = ds[7];
    ds[6] = ds[8];
    ds[7] = ds[9];
    ds[8] = ds[10];
    ds[9] = ds[11];
    ds[10] = ds[12];
    ds[11] = ds[13];

    d += d_stride;
  } while (--y);
}

static inline void derive_triangle_win7_neon(const int16x8_t *d_is,
                                             const int16x8_t *d_ie,
                                             int32x4_t *deltas) {
  msub_neon(&deltas[0], d_is[0], d_is[0]);
  msub_neon(&deltas[0], d_is[1], d_is[1]);
  msub_neon(&deltas[1], d_is[0], d_is[2]);
  msub_neon(&deltas[1], d_is[1], d_is[3]);
  msub_neon(&deltas[2], d_is[0], d_is[4]);
  msub_neon(&deltas[2], d_is[1], d_is[5]);
  msub_neon(&deltas[3], d_is[0], d_is[6]);
  msub_neon(&deltas[3], d_is[1], d_is[7]);
  msub_neon(&deltas[4], d_is[0], d_is[8]);
  msub_neon(&deltas[4], d_is[1], d_is[9]);
  msub_neon(&deltas[5], d_is[0], d_is[10]);
  msub_neon(&deltas[5], d_is[1], d_is[11]);

  msub_neon(&deltas[6], d_is[2], d_is[2]);
  msub_neon(&deltas[6], d_is[3], d_is[3]);
  msub_neon(&deltas[7], d_is[2], d_is[4]);
  msub_neon(&deltas[7], d_is[3], d_is[5]);
  msub_neon(&deltas[8], d_is[2], d_is[6]);
  msub_neon(&deltas[8], d_is[3], d_is[7]);
  msub_neon(&deltas[9], d_is[2], d_is[8]);
  msub_neon(&deltas[9], d_is[3], d_is[9]);
  msub_neon(&deltas[10], d_is[2], d_is[10]);
  msub_neon(&deltas[10], d_is[3], d_is[11]);

  msub_neon(&deltas[11], d_is[4], d_is[4]);
  msub_neon(&deltas[11], d_is[5], d_is[5]);
  msub_neon(&deltas[12], d_is[4], d_is[6]);
  msub_neon(&deltas[12], d_is[5], d_is[7]);
  msub_neon(&deltas[13], d_is[4], d_is[8]);
  msub_neon(&deltas[13], d_is[5], d_is[9]);
  msub_neon(&deltas[14], d_is[4], d_is[10]);
  msub_neon(&deltas[14], d_is[5], d_is[11]);

  msub_neon(&deltas[15], d_is[6], d_is[6]);
  msub_neon(&deltas[15], d_is[7], d_is[7]);
  msub_neon(&deltas[16], d_is[6], d_is[8]);
  msub_neon(&deltas[16], d_is[7], d_is[9]);
  msub_neon(&deltas[17], d_is[6], d_is[10]);
  msub_neon(&deltas[17], d_is[7], d_is[11]);

  msub_neon(&deltas[18], d_is[8], d_is[8]);
  msub_neon(&deltas[18], d_is[9], d_is[9]);
  msub_neon(&deltas[19], d_is[8], d_is[10]);
  msub_neon(&deltas[19], d_is[9], d_is[11]);

  msub_neon(&deltas[20], d_is[10], d_is[10]);
  msub_neon(&deltas[20], d_is[11], d_is[11]);

  madd_neon(&deltas[0], d_ie[0], d_ie[0]);
  madd_neon(&deltas[0], d_ie[1], d_ie[1]);
  madd_neon(&deltas[1], d_ie[0], d_ie[2]);
  madd_neon(&deltas[1], d_ie[1], d_ie[3]);
  madd_neon(&deltas[2], d_ie[0], d_ie[4]);
  madd_neon(&deltas[2], d_ie[1], d_ie[5]);
  madd_neon(&deltas[3], d_ie[0], d_ie[6]);
  madd_neon(&deltas[3], d_ie[1], d_ie[7]);
  madd_neon(&deltas[4], d_ie[0], d_ie[8]);
  madd_neon(&deltas[4], d_ie[1], d_ie[9]);
  madd_neon(&deltas[5], d_ie[0], d_ie[10]);
  madd_neon(&deltas[5], d_ie[1], d_ie[11]);

  madd_neon(&deltas[6], d_ie[2], d_ie[2]);
  madd_neon(&deltas[6], d_ie[3], d_ie[3]);
  madd_neon(&deltas[7], d_ie[2], d_ie[4]);
  madd_neon(&deltas[7], d_ie[3], d_ie[5]);
  madd_neon(&deltas[8], d_ie[2], d_ie[6]);
  madd_neon(&deltas[8], d_ie[3], d_ie[7]);
  madd_neon(&deltas[9], d_ie[2], d_ie[8]);
  madd_neon(&deltas[9], d_ie[3], d_ie[9]);
  madd_neon(&deltas[10], d_ie[2], d_ie[10]);
  madd_neon(&deltas[10], d_ie[3], d_ie[11]);

  madd_neon(&deltas[11], d_ie[4], d_ie[4]);
  madd_neon(&deltas[11], d_ie[5], d_ie[5]);
  madd_neon(&deltas[12], d_ie[4], d_ie[6]);
  madd_neon(&deltas[12], d_ie[5], d_ie[7]);
  madd_neon(&deltas[13], d_ie[4], d_ie[8]);
  madd_neon(&deltas[13], d_ie[5], d_ie[9]);
  madd_neon(&deltas[14], d_ie[4], d_ie[10]);
  madd_neon(&deltas[14], d_ie[5], d_ie[11]);

  madd_neon(&deltas[15], d_ie[6], d_ie[6]);
  madd_neon(&deltas[15], d_ie[7], d_ie[7]);
  madd_neon(&deltas[16], d_ie[6], d_ie[8]);
  madd_neon(&deltas[16], d_ie[7], d_ie[9]);
  madd_neon(&deltas[17], d_ie[6], d_ie[10]);
  madd_neon(&deltas[17], d_ie[7], d_ie[11]);

  madd_neon(&deltas[18], d_ie[8], d_ie[8]);
  madd_neon(&deltas[18], d_ie[9], d_ie[9]);
  madd_neon(&deltas[19], d_ie[8], d_ie[10]);
  madd_neon(&deltas[19], d_ie[9], d_ie[11]);

  madd_neon(&deltas[20], d_ie[10], d_ie[10]);
  madd_neon(&deltas[20], d_ie[11], d_ie[11]);
}

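// H is symmetric: copy the upper triangle to the lower triangle, transposing
// 4x4 blocks of 64-bit elements as they are moved.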
static inline void diagonal_copy_stats_neon(const int32_t wiener_win2,
                                            int64_t *const H) {
  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
    int64x2_t in[8], out[8];

    in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 1);
    in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 3);
    in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1);
    in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 3);
    in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 1);
    in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 3);
    in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 1);
    in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 3);

    transpose_arrays_s64_4x4(in, out);

    vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(out[0]));
    vst1q_s64(H + (i + 2) * wiener_win2 + i, out[2]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i, out[4]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i, out[6]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);

    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
      in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + j);
      in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + j + 2);
      in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + j);
      in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + j + 2);
      in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + j);
      in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + j + 2);
      in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + j);
      in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + j + 2);

      transpose_arrays_s64_4x4(in, out);

      vst1q_s64(H + (j + 0) * wiener_win2 + i, out[0]);
      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i, out[2]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i, out[4]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i, out[6]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
    }
  }
}

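// Divide each 64-bit lane by 4, rounding towards zero: shift the absolute
// value right by 2 and re-apply the sign. (The Armv7 path folds the sign in
// with an XOR/subtract pair since 64-bit absolute value and negate are
// AArch64-only.)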
static inline int64x2_t div4_neon(const int64x2_t src) {
#if AOM_ARCH_AARCH64
  uint64x2_t sign = vcltzq_s64(src);
  int64x2_t abs = vabsq_s64(src);
  // divide by 4
  abs = vshrq_n_s64(abs, 2);
  // re-apply sign
  return vbslq_s64(sign, vnegq_s64(abs), abs);
#else
  int64x2_t sign = vshrq_n_s64(src, 63);
  int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign);
  // divide by 4
  abs = vshrq_n_s64(abs, 2);
  // re-apply sign
  return vsubq_s64(veorq_s64(abs, sign), sign);
#endif  // AOM_ARCH_AARCH64
}

static inline void div4_4x4_neon(const int32_t wiener_win2, int64_t *const H,
                                 int64x2_t out[8]) {
  out[0] = vld1q_s64(H + 0 * wiener_win2 + 0);
  out[1] = vld1q_s64(H + 0 * wiener_win2 + 2);
  out[2] = vld1q_s64(H + 1 * wiener_win2 + 0);
  out[3] = vld1q_s64(H + 1 * wiener_win2 + 2);
  out[4] = vld1q_s64(H + 2 * wiener_win2 + 0);
  out[5] = vld1q_s64(H + 2 * wiener_win2 + 2);
  out[6] = vld1q_s64(H + 3 * wiener_win2 + 0);
  out[7] = vld1q_s64(H + 3 * wiener_win2 + 2);

  out[0] = div4_neon(out[0]);
  out[1] = div4_neon(out[1]);
  out[2] = div4_neon(out[2]);
  out[3] = div4_neon(out[3]);
  out[4] = div4_neon(out[4]);
  out[5] = div4_neon(out[5]);
  out[6] = div4_neon(out[6]);
  out[7] = div4_neon(out[7]);

  vst1q_s64(H + 0 * wiener_win2 + 0, out[0]);
  vst1q_s64(H + 0 * wiener_win2 + 2, out[1]);
  vst1q_s64(H + 1 * wiener_win2 + 0, out[2]);
  vst1q_s64(H + 1 * wiener_win2 + 2, out[3]);
  vst1q_s64(H + 2 * wiener_win2 + 0, out[4]);
  vst1q_s64(H + 2 * wiener_win2 + 2, out[5]);
  vst1q_s64(H + 3 * wiener_win2 + 0, out[6]);
  vst1q_s64(H + 3 * wiener_win2 + 2, out[7]);
}

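// As div4_neon, but dividing by 16 (arithmetic shift by 4 of the absolute
// value).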
static inline int64x2_t div16_neon(const int64x2_t src) {
#if AOM_ARCH_AARCH64
  uint64x2_t sign = vcltzq_s64(src);
  int64x2_t abs = vabsq_s64(src);
  // divide by 16
  abs = vshrq_n_s64(abs, 4);
  // re-apply sign
  return vbslq_s64(sign, vnegq_s64(abs), abs);
#else
  int64x2_t sign = vshrq_n_s64(src, 63);
  int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign);
  // divide by 16
  abs = vshrq_n_s64(abs, 4);
  // re-apply sign
  return vsubq_s64(veorq_s64(abs, sign), sign);
#endif  // AOM_ARCH_AARCH64
}

static inline void div16_4x4_neon(const int32_t wiener_win2, int64_t *const H,
                                  int64x2_t out[8]) {
  out[0] = vld1q_s64(H + 0 * wiener_win2 + 0);
  out[1] = vld1q_s64(H + 0 * wiener_win2 + 2);
  out[2] = vld1q_s64(H + 1 * wiener_win2 + 0);
  out[3] = vld1q_s64(H + 1 * wiener_win2 + 2);
  out[4] = vld1q_s64(H + 2 * wiener_win2 + 0);
  out[5] = vld1q_s64(H + 2 * wiener_win2 + 2);
  out[6] = vld1q_s64(H + 3 * wiener_win2 + 0);
  out[7] = vld1q_s64(H + 3 * wiener_win2 + 2);

  out[0] = div16_neon(out[0]);
  out[1] = div16_neon(out[1]);
  out[2] = div16_neon(out[2]);
  out[3] = div16_neon(out[3]);
  out[4] = div16_neon(out[4]);
  out[5] = div16_neon(out[5]);
  out[6] = div16_neon(out[6]);
  out[7] = div16_neon(out[7]);

  vst1q_s64(H + 0 * wiener_win2 + 0, out[0]);
  vst1q_s64(H + 0 * wiener_win2 + 2, out[1]);
  vst1q_s64(H + 1 * wiener_win2 + 0, out[2]);
  vst1q_s64(H + 1 * wiener_win2 + 2, out[3]);
  vst1q_s64(H + 2 * wiener_win2 + 0, out[4]);
  vst1q_s64(H + 2 * wiener_win2 + 2, out[5]);
  vst1q_s64(H + 3 * wiener_win2 + 0, out[6]);
  vst1q_s64(H + 3 * wiener_win2 + 2, out[7]);
}

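// As diagonal_copy_stats_neon, but additionally divide the off-diagonal
// elements of H by 4 (in place for the upper triangle) as they are copied;
// div16_diagonal_copy_stats_neon below does the same with a divisor of 16.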
static inline void div4_diagonal_copy_stats_neon(const int32_t wiener_win2,
                                                 int64_t *const H) {
  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
    int64x2_t in[8], out[8];

    div4_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
    transpose_arrays_s64_4x4(in, out);

    vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
    vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);

    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
      div4_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
      transpose_arrays_s64_4x4(in, out);

      vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
    }
  }
}

static inline void div16_diagonal_copy_stats_neon(const int32_t wiener_win2,
                                                  int64_t *const H) {
  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
    int64x2_t in[8], out[8];

    div16_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
    transpose_arrays_s64_4x4(in, out);

    vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
    vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);

    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
      div16_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
      transpose_arrays_s64_4x4(in, out);

      vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
    }
  }
}

#endif  // AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_