/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
#define AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_

#include <arm_neon.h>

// Assumed locations of the load/sum/transpose helpers used below
// (load_s16_8x4(), horizontal_add_4d_s32x4(), transpose_arrays_s64_4x4()).
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/restoration.h"

#define WIN_7 ((WIENER_WIN - 1) * 2)
#define WIN_CHROMA ((WIENER_WIN_CHROMA - 1) * 2)

// Aligned sizes for Wiener filters.
#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)

// Compute 8 values of M (cross correlation) for a single source pixel and
// accumulate.
static inline void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
                                   int16x8_t dgd_avg) {
  int32x4_t lo = vld1q_s32(M_s32 + 0);
  int32x4_t hi = vld1q_s32(M_s32 + 4);

  lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);

  vst1q_s32(M_s32 + 0, lo);
  vst1q_s32(M_s32 + 4, hi);
}

// Compute 8 values of M (cross correlation) for two source pixels and
// accumulate.
static inline void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
                                    int16x4_t src_avg1, int16x8_t dgd_avg0,
                                    int16x8_t dgd_avg1) {
  int32x4_t lo = vld1q_s32(M_s32 + 0);
  int32x4_t hi = vld1q_s32(M_s32 + 4);

  lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0);
  lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1);

  vst1q_s32(M_s32 + 0, lo);
  vst1q_s32(M_s32 + 4, hi);
}

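// Accumulate the auto-covariance products dgd_avg[i] * dgd_avg[j] (j >= i,
// rounded to 4x4 blocks) into the upper triangle of H for a single source
// pixel; width and height are the aligned dimensions of the H block.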
static inline void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
                                   int width, int height) {
  for (int i = 0; i < height; i += 4) {
    int16x4_t di = vld1_s16(dgd_avg + i);

    for (int j = i; j < width; j += 4) {
      int16x4_t dj = vld1_s16(dgd_avg + j);
      int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j);
      int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j);
      int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j);
      int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j);

      h0 = vmlal_lane_s16(h0, dj, di, 0);
      h1 = vmlal_lane_s16(h1, dj, di, 1);
      h2 = vmlal_lane_s16(h2, dj, di, 2);
      h3 = vmlal_lane_s16(h3, dj, di, 3);

      vst1q_s32(H_s32 + 0 * width + j, h0);
      vst1q_s32(H_s32 + 1 * width + j, h1);
      vst1q_s32(H_s32 + 2 * width + j, h2);
      vst1q_s32(H_s32 + 3 * width + j, h3);
    }
    H_s32 += 4 * width;
  }
}

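// As update_H_1pixel, but for two source pixels at once, specialized for the
// 5x5 (chroma) Wiener window with rows of H spaced WIENER_WIN2_REDUCED_ALIGN2
// elements apart.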
static inline void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
                                        const int16_t *dgd_avg1) {
  for (int i = 0; i < 24; i += 4) {
    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
    int16x4_t di1 = vld1_s16(dgd_avg1 + i);

    for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
      int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);

      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
      h3 = vmlal_lane_s16(h3, dj1, di1, 3);

      vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0);
      vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1);
      vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2);
      vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3);
    }
    H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2;
  }
}

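// As update_H_1pixel, but for two source pixels at once, specialized for the
// 7x7 Wiener window; the diagonal 4x4 block of each row quartet is handled
// first, then the remaining columns.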
static inline void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
                                        const int16_t *dgd_avg1) {
  for (int i = 0; i < 48; i += 4) {
    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
    int16x4_t di1 = vld1_s16(dgd_avg1 + i);

    int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i);

    h0 = vmlal_lane_s16(h0, di0, di0, 0);
    h0 = vmlal_lane_s16(h0, di1, di1, 0);
    h1 = vmlal_lane_s16(h1, di0, di0, 1);
    h1 = vmlal_lane_s16(h1, di1, di1, 1);
    h2 = vmlal_lane_s16(h2, di0, di0, 2);
    h2 = vmlal_lane_s16(h2, di1, di1, 2);
    h3 = vmlal_lane_s16(h3, di0, di0, 3);
    h3 = vmlal_lane_s16(h3, di1, di1, 3);

    vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0);
    vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1);
    vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2);
    vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3);

    for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) {
      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
      h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j);
      h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j);
      h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j);
      h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j);

      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
      h3 = vmlal_lane_s16(h3, dj1, di1, 3);

      vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
      vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
      vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
      vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
    }
    H_s32 += 4 * WIENER_WIN2_ALIGN2;
  }
}

// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data.
static inline void accumulate_and_clear(int64_t *dst, int32_t *src,
                                        int length) {
  do {
    int32x4_t s32 = vld1q_s32(src);
    vst1q_s32(src, vdupq_n_s32(0));
    src += 4;

    int64x2_t d_lo = vld1q_s64(dst + 0);
    int64x2_t d_hi = vld1q_s64(dst + 2);

    d_lo = vaddw_s32(d_lo, vget_low_s32(s32));
    d_hi = vaddw_s32(d_hi, vget_high_s32(s32));

    vst1q_s64(dst + 0, d_lo);
    vst1q_s64(dst + 2, d_hi);

    dst += 4;
    length -= 4;
  } while (length > 0);
}

// clang-format off
// Constant pool to act as a mask to zero the top n elements in an int16x8_t
// vector. The index we load from depends on n.
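// For example (an illustrative sketch, not necessarily how callers index it):
// vandq_s16(v, vld1q_s16(mask_16bit + 8 + n)) zeroes the top n lanes of v.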
static const int16_t mask_16bit[32] = {
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
};
// clang-format on

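// Multiply src and dgd elementwise and accumulate the eight 32-bit products
// into *sum as four pairwise sums.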
static inline void madd_neon_pairwise(int32x4_t *sum, const int16x8_t src,
                                      const int16x8_t dgd) {
  const int32x4_t sd =
      horizontal_add_2d_s32(vmull_s16(vget_low_s16(src), vget_low_s16(dgd)),
                            vmull_s16(vget_high_s16(src), vget_high_s16(dgd)));
  *sum = vaddq_s32(*sum, sd);
}

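// Widening multiply-accumulate: *sum += src * dgd, with the products of the
// high half folded onto the same four accumulator lanes as the low half.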
static inline void madd_neon(int32x4_t *sum, const int16x8_t src,
                             const int16x8_t dgd) {
  *sum = vmlal_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}

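// Widening multiply-subtract counterpart of madd_neon: *sum -= src * dgd.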
static inline void msub_neon(int32x4_t *sum, const int16x8_t src,
                             const int16x8_t dgd) {
  *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlsl_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}

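// Accumulate src1 * dgd1 - src0 * dgd0, the low halves into *sum0 and the
// high halves into *sum1.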
static inline void compute_delta_step3(int32x4_t *sum0, int32x4_t *sum1,
                                       const int16x8_t src0,
                                       const int16x8_t src1,
                                       const int16x8_t dgd0,
                                       const int16x8_t dgd1) {
  *sum0 = vmlsl_s16(*sum0, vget_low_s16(src0), vget_low_s16(dgd0));
  *sum0 = vmlal_s16(*sum0, vget_low_s16(src1), vget_low_s16(dgd1));
  *sum1 = vmlsl_s16(*sum1, vget_high_s16(src0), vget_high_s16(dgd0));
  *sum1 = vmlal_s16(*sum1, vget_high_s16(src1), vget_high_s16(dgd1));
}

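// Horizontally add each of the four input vectors and return the four sums in
// a single vector.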
static inline int32x4_t hadd_four_32_neon(const int32x4_t src0,
                                          const int32x4_t src1,
                                          const int32x4_t src2,
                                          const int32x4_t src3) {
  int32x4_t src[4] = { src0, src1, src2, src3 };
  return horizontal_add_4d_s32x4(src);
}

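// dst[i] = src[i] + delta[i] for four 64-bit statistics, widening the 32-bit
// deltas to 64 bits.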
static inline void update_4_stats_neon(const int64_t *const src,
                                       const int32x4_t delta,
                                       int64_t *const dst) {
  const int64x2_t s1 = vld1q_s64(src);
  const int64x2_t s2 = vld1q_s64(src + 2);

  const int64x2_t d1 = vaddw_s32(s1, vget_low_s32(delta));
  const int64x2_t d2 = vaddw_s32(s2, vget_high_s32(delta));

  vst1q_s64(dst, d1);
  vst1q_s64(dst + 2, d2);
}

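// Shift the two 8-lane windows in org left by one element and append the next
// sample from each of the two rows of src.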
static inline void load_more_16_neon(const int16_t *const src,
                                     const int32_t width,
                                     const int16x8_t org[2], int16x8_t dst[2]) {
  int16x8_t s0 = vld1q_dup_s16(src);
  int16x8_t s1 = vld1q_dup_s16(src + width);
  dst[0] = vextq_s16(org[0], s0, 1);
  dst[1] = vextq_s16(org[1], s1, 1);
}

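// Accumulate statistics for the top edge of the 5-tap (chroma) window:
// sum_m[k] += src * d_row_k and sum_h[k] += dgd * d_row_k for the five rows
// k = 0..4, each row 16 samples wide.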
static inline void stats_top_win5_neon(const int16x8_t src[2],
                                       const int16x8_t dgd[2],
                                       const int16_t *const d,
                                       const int32_t d_stride, int32x4_t *sum_m,
                                       int32x4_t *sum_h) {
  int16x8_t dgds[WIENER_WIN_CHROMA * 2];

  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8]);
  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9]);

  madd_neon(&sum_m[0], src[0], dgds[0]);
  madd_neon(&sum_m[0], src[1], dgds[1]);
  madd_neon(&sum_m[1], src[0], dgds[2]);
  madd_neon(&sum_m[1], src[1], dgds[3]);
  madd_neon(&sum_m[2], src[0], dgds[4]);
  madd_neon(&sum_m[2], src[1], dgds[5]);
  madd_neon(&sum_m[3], src[0], dgds[6]);
  madd_neon(&sum_m[3], src[1], dgds[7]);
  madd_neon(&sum_m[4], src[0], dgds[8]);
  madd_neon(&sum_m[4], src[1], dgds[9]);

  madd_neon(&sum_h[0], dgd[0], dgds[0]);
  madd_neon(&sum_h[0], dgd[1], dgds[1]);
  madd_neon(&sum_h[1], dgd[0], dgds[2]);
  madd_neon(&sum_h[1], dgd[1], dgds[3]);
  madd_neon(&sum_h[2], dgd[0], dgds[4]);
  madd_neon(&sum_h[2], dgd[1], dgds[5]);
  madd_neon(&sum_h[3], dgd[0], dgds[6]);
  madd_neon(&sum_h[3], dgd[1], dgds[7]);
  madd_neon(&sum_h[4], dgd[0], dgds[8]);
  madd_neon(&sum_h[4], dgd[1], dgds[9]);
}

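// Accumulate statistics for the left edge of the 5-tap window:
// sum[k] += src * d_row_(k+1) for rows 1..4.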
static inline void stats_left_win5_neon(const int16x8_t src[2],
                                        const int16_t *d,
                                        const int32_t d_stride,
                                        int32x4_t *sum) {
  int16x8_t dgds[WIN_CHROMA];

  load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6]);
  load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7]);

  madd_neon(&sum[0], src[0], dgds[0]);
  madd_neon(&sum[0], src[1], dgds[1]);
  madd_neon(&sum[1], src[0], dgds[2]);
  madd_neon(&sum[1], src[1], dgds[3]);
  madd_neon(&sum[2], src[0], dgds[4]);
  madd_neon(&sum[2], src[1], dgds[5]);
  madd_neon(&sum[3], src[0], dgds[6]);
  madd_neon(&sum[3], src[1], dgds[7]);
}

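// Update the 4x4 grid of delta accumulators for the off-diagonal (square)
// part of the 5-tap window: deltas[i][j] += d_ie * d_je - d_is * d_js, where
// the *_s rows are the first rows of the two windows and the *_e rows lie
// `height` rows further down (see load_square_win5_neon).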
static inline void derive_square_win5_neon(
    const int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
    const int16x8_t *d_je,
    int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
  msub_neon(&deltas[0][0], d_is[0], d_js[0]);
  msub_neon(&deltas[0][0], d_is[1], d_js[1]);
  msub_neon(&deltas[0][1], d_is[0], d_js[2]);
  msub_neon(&deltas[0][1], d_is[1], d_js[3]);
  msub_neon(&deltas[0][2], d_is[0], d_js[4]);
  msub_neon(&deltas[0][2], d_is[1], d_js[5]);
  msub_neon(&deltas[0][3], d_is[0], d_js[6]);
  msub_neon(&deltas[0][3], d_is[1], d_js[7]);

  msub_neon(&deltas[1][0], d_is[2], d_js[0]);
  msub_neon(&deltas[1][0], d_is[3], d_js[1]);
  msub_neon(&deltas[1][1], d_is[2], d_js[2]);
  msub_neon(&deltas[1][1], d_is[3], d_js[3]);
  msub_neon(&deltas[1][2], d_is[2], d_js[4]);
  msub_neon(&deltas[1][2], d_is[3], d_js[5]);
  msub_neon(&deltas[1][3], d_is[2], d_js[6]);
  msub_neon(&deltas[1][3], d_is[3], d_js[7]);

  msub_neon(&deltas[2][0], d_is[4], d_js[0]);
  msub_neon(&deltas[2][0], d_is[5], d_js[1]);
  msub_neon(&deltas[2][1], d_is[4], d_js[2]);
  msub_neon(&deltas[2][1], d_is[5], d_js[3]);
  msub_neon(&deltas[2][2], d_is[4], d_js[4]);
  msub_neon(&deltas[2][2], d_is[5], d_js[5]);
  msub_neon(&deltas[2][3], d_is[4], d_js[6]);
  msub_neon(&deltas[2][3], d_is[5], d_js[7]);

  msub_neon(&deltas[3][0], d_is[6], d_js[0]);
  msub_neon(&deltas[3][0], d_is[7], d_js[1]);
  msub_neon(&deltas[3][1], d_is[6], d_js[2]);
  msub_neon(&deltas[3][1], d_is[7], d_js[3]);
  msub_neon(&deltas[3][2], d_is[6], d_js[4]);
  msub_neon(&deltas[3][2], d_is[7], d_js[5]);
  msub_neon(&deltas[3][3], d_is[6], d_js[6]);
  msub_neon(&deltas[3][3], d_is[7], d_js[7]);

  madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
  madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
  madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
  madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
  madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
  madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
  madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
  madd_neon(&deltas[0][3], d_ie[1], d_je[7]);

  madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
  madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
  madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
  madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
  madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
  madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
  madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
  madd_neon(&deltas[1][3], d_ie[3], d_je[7]);

  madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
  madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
  madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
  madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
  madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
  madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
  madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
  madd_neon(&deltas[2][3], d_ie[5], d_je[7]);

  madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
  madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
  madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
  madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
  madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
  madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
  madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
  madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
}

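// Load the rows used by derive_square_win5_neon: the first four rows of the
// i- and j-windows (d_is, d_js) and the four rows `height` rows further down
// (d_ie, d_je), 16 samples per row.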
static inline void load_square_win5_neon(const int16_t *const di,
                                         const int16_t *const dj,
                                         const int32_t d_stride,
                                         const int32_t height, int16x8_t *d_is,
                                         int16x8_t *d_ie, int16x8_t *d_js,
                                         int16x8_t *d_je) {
  load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);
  load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);

  load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
               &d_ie[4], &d_ie[6]);
  load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7]);
  load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6]);
  load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7]);
}

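// dst[i] = src[i] + delta[i] for the first four statistics, plus
// dst[4] = src[4] + delta4.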
static inline void update_5_stats_neon(const int64_t *const src,
                                       const int32x4_t delta,
                                       const int64_t delta4,
                                       int64_t *const dst) {
  update_4_stats_neon(src + 0, delta, dst + 0);
  dst[4] = src[4] + delta4;
}

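// Two-row variant of compute_delta_step3:
// *sum += high(src) * high(dgd) - low(src) * low(dgd).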
static inline void compute_delta_step3_two_lines(int32x4_t *sum,
                                                 const int16x8_t src,
                                                 const int16x8_t dgd) {
  *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}

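// Step 3 accumulation for the 5-tap window, processing two rows per
// iteration; each vector packs 4 samples from one row in its low half and 4
// samples from the next row in its high half.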
static inline void step3_win5_neon(const int16_t *d, const int32_t d_stride,
                                   const int32_t width, const int32_t height,
                                   int16x8_t *ds, int32x4_t *deltas) {
  int32_t y = height;
  do {
    ds[4] = load_unaligned_s16_4x2(d + 0 * d_stride, width);
    ds[5] = load_unaligned_s16_4x2(d + 1 * d_stride, width);

    compute_delta_step3_two_lines(&deltas[0], ds[0], ds[0]);
    compute_delta_step3_two_lines(&deltas[1], ds[0], ds[1]);
    compute_delta_step3_two_lines(&deltas[2], ds[0], ds[2]);
    compute_delta_step3_two_lines(&deltas[3], ds[0], ds[3]);
    compute_delta_step3_two_lines(&deltas[4], ds[0], ds[4]);
    compute_delta_step3_two_lines(&deltas[0], ds[1], ds[1]);
    compute_delta_step3_two_lines(&deltas[1], ds[1], ds[2]);
    compute_delta_step3_two_lines(&deltas[2], ds[1], ds[3]);
    compute_delta_step3_two_lines(&deltas[3], ds[1], ds[4]);
    compute_delta_step3_two_lines(&deltas[4], ds[1], ds[5]);

    ds[0] = ds[2];
    ds[1] = ds[3];
    ds[2] = ds[4];
    ds[3] = ds[5];

    d += 2 * d_stride;
    y -= 2;
  } while (y);
}

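// Step 3 accumulation for the 5-tap window, processing one full row (two
// 8-lane loads) per iteration.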
static inline void step3_win5_oneline_neon(const int16_t **const d,
                                           const int32_t d_stride,
                                           const int32_t width,
                                           const int32_t height, int16x8_t *ds,
                                           int32x4_t *deltas) {
  int32_t y = height;
  do {
    ds[8] = vld1q_s16(*d);
    ds[9] = vld1q_s16(*d + width);

    compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
    compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
    compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
    compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);

    ds[0] = ds[2];
    ds[1] = ds[3];
    ds[2] = ds[4];
    ds[3] = ds[5];
    ds[4] = ds[6];
    ds[5] = ds[7];
    ds[6] = ds[8];
    ds[7] = ds[9];

    *d += d_stride;
  } while (--y);
}

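// Diagonal (i == j) counterpart of derive_square_win5_neon: update the 10
// delta accumulators for the unique row pairs of a single window (see
// load_triangle_win5_neon).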
static inline void derive_triangle_win5_neon(const int16x8_t *d_is,
                                             const int16x8_t *d_ie,
                                             int32x4_t *deltas) {
  msub_neon(&deltas[0], d_is[0], d_is[0]);
  msub_neon(&deltas[0], d_is[1], d_is[1]);
  msub_neon(&deltas[1], d_is[0], d_is[2]);
  msub_neon(&deltas[1], d_is[1], d_is[3]);
  msub_neon(&deltas[2], d_is[0], d_is[4]);
  msub_neon(&deltas[2], d_is[1], d_is[5]);
  msub_neon(&deltas[3], d_is[0], d_is[6]);
  msub_neon(&deltas[3], d_is[1], d_is[7]);
  msub_neon(&deltas[4], d_is[2], d_is[2]);
  msub_neon(&deltas[4], d_is[3], d_is[3]);
  msub_neon(&deltas[5], d_is[2], d_is[4]);
  msub_neon(&deltas[5], d_is[3], d_is[5]);
  msub_neon(&deltas[6], d_is[2], d_is[6]);
  msub_neon(&deltas[6], d_is[3], d_is[7]);
  msub_neon(&deltas[7], d_is[4], d_is[4]);
  msub_neon(&deltas[7], d_is[5], d_is[5]);
  msub_neon(&deltas[8], d_is[4], d_is[6]);
  msub_neon(&deltas[8], d_is[5], d_is[7]);
  msub_neon(&deltas[9], d_is[6], d_is[6]);
  msub_neon(&deltas[9], d_is[7], d_is[7]);

  madd_neon(&deltas[0], d_ie[0], d_ie[0]);
  madd_neon(&deltas[0], d_ie[1], d_ie[1]);
  madd_neon(&deltas[1], d_ie[0], d_ie[2]);
  madd_neon(&deltas[1], d_ie[1], d_ie[3]);
  madd_neon(&deltas[2], d_ie[0], d_ie[4]);
  madd_neon(&deltas[2], d_ie[1], d_ie[5]);
  madd_neon(&deltas[3], d_ie[0], d_ie[6]);
  madd_neon(&deltas[3], d_ie[1], d_ie[7]);
  madd_neon(&deltas[4], d_ie[2], d_ie[2]);
  madd_neon(&deltas[4], d_ie[3], d_ie[3]);
  madd_neon(&deltas[5], d_ie[2], d_ie[4]);
  madd_neon(&deltas[5], d_ie[3], d_ie[5]);
  madd_neon(&deltas[6], d_ie[2], d_ie[6]);
  madd_neon(&deltas[6], d_ie[3], d_ie[7]);
  madd_neon(&deltas[7], d_ie[4], d_ie[4]);
  madd_neon(&deltas[7], d_ie[5], d_ie[5]);
  madd_neon(&deltas[8], d_ie[4], d_ie[6]);
  madd_neon(&deltas[8], d_ie[5], d_ie[7]);
  madd_neon(&deltas[9], d_ie[6], d_ie[6]);
  madd_neon(&deltas[9], d_ie[7], d_ie[7]);
}

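// Load the rows used by derive_triangle_win5_neon: the first four rows of the
// window (d_is) and the four rows `height` rows further down (d_ie).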
static inline void load_triangle_win5_neon(const int16_t *const di,
                                           const int32_t d_stride,
                                           const int32_t height,
                                           int16x8_t *d_is, int16x8_t *d_ie) {
  load_s16_8x4(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);

  load_s16_8x4(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
               &d_ie[4], &d_ie[6]);
  load_s16_8x4(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7]);
}

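// Step 4: subtract from the nine delta accumulators the products of row A[0]
// with rows B[0..4] and of rows A[1..4] with row B[0].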
static inline void sub_deltas_step4(int16x8_t *A, int16x8_t *B,
                                    int32x4_t *deltas) {
  deltas[0] = vmlsl_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0]));
  deltas[0] = vmlsl_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0]));
  deltas[1] = vmlsl_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1]));
  deltas[1] = vmlsl_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1]));
  deltas[2] = vmlsl_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2]));
  deltas[2] = vmlsl_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2]));
  deltas[3] = vmlsl_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3]));
  deltas[3] = vmlsl_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3]));
  deltas[4] = vmlsl_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4]));
  deltas[4] = vmlsl_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4]));
  deltas[5] = vmlsl_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0]));
  deltas[5] = vmlsl_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0]));
  deltas[6] = vmlsl_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0]));
  deltas[6] = vmlsl_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0]));
  deltas[7] = vmlsl_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0]));
  deltas[7] = vmlsl_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0]));
  deltas[8] = vmlsl_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0]));
  deltas[8] = vmlsl_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0]));
}

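// Step 4: add to the nine delta accumulators the same products that
// sub_deltas_step4 subtracts.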
static inline void add_deltas_step4(int16x8_t *A, int16x8_t *B,
                                    int32x4_t *deltas) {
  deltas[0] = vmlal_s16(deltas[0], vget_low_s16(A[0]), vget_low_s16(B[0]));
  deltas[0] = vmlal_s16(deltas[0], vget_high_s16(A[0]), vget_high_s16(B[0]));
  deltas[1] = vmlal_s16(deltas[1], vget_low_s16(A[0]), vget_low_s16(B[1]));
  deltas[1] = vmlal_s16(deltas[1], vget_high_s16(A[0]), vget_high_s16(B[1]));
  deltas[2] = vmlal_s16(deltas[2], vget_low_s16(A[0]), vget_low_s16(B[2]));
  deltas[2] = vmlal_s16(deltas[2], vget_high_s16(A[0]), vget_high_s16(B[2]));
  deltas[3] = vmlal_s16(deltas[3], vget_low_s16(A[0]), vget_low_s16(B[3]));
  deltas[3] = vmlal_s16(deltas[3], vget_high_s16(A[0]), vget_high_s16(B[3]));
  deltas[4] = vmlal_s16(deltas[4], vget_low_s16(A[0]), vget_low_s16(B[4]));
  deltas[4] = vmlal_s16(deltas[4], vget_high_s16(A[0]), vget_high_s16(B[4]));
  deltas[5] = vmlal_s16(deltas[5], vget_low_s16(A[1]), vget_low_s16(B[0]));
  deltas[5] = vmlal_s16(deltas[5], vget_high_s16(A[1]), vget_high_s16(B[0]));
  deltas[6] = vmlal_s16(deltas[6], vget_low_s16(A[2]), vget_low_s16(B[0]));
  deltas[6] = vmlal_s16(deltas[6], vget_high_s16(A[2]), vget_high_s16(B[0]));
  deltas[7] = vmlal_s16(deltas[7], vget_low_s16(A[3]), vget_low_s16(B[0]));
  deltas[7] = vmlal_s16(deltas[7], vget_high_s16(A[3]), vget_high_s16(B[0]));
  deltas[8] = vmlal_s16(deltas[8], vget_low_s16(A[4]), vget_low_s16(B[0]));
  deltas[8] = vmlal_s16(deltas[8], vget_high_s16(A[4]), vget_high_s16(B[0]));
}

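// Accumulate statistics for the top edge of the 7-tap window:
// sum_m[k] += src * d_row_k and sum_h[k] += dgd * d_row_k for the seven rows
// k = 0..6, each row 16 samples wide.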
static inline void stats_top_win7_neon(const int16x8_t src[2],
                                       const int16x8_t dgd[2],
                                       const int16_t *const d,
                                       const int32_t d_stride, int32x4_t *sum_m,
                                       int32x4_t *sum_h) {
  int16x8_t dgds[WIENER_WIN * 2];

  load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8], &dgds[10], &dgds[12]);
  load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9], &dgds[11], &dgds[13]);

  madd_neon(&sum_m[0], src[0], dgds[0]);
  madd_neon(&sum_m[0], src[1], dgds[1]);
  madd_neon(&sum_m[1], src[0], dgds[2]);
  madd_neon(&sum_m[1], src[1], dgds[3]);
  madd_neon(&sum_m[2], src[0], dgds[4]);
  madd_neon(&sum_m[2], src[1], dgds[5]);
  madd_neon(&sum_m[3], src[0], dgds[6]);
  madd_neon(&sum_m[3], src[1], dgds[7]);
  madd_neon(&sum_m[4], src[0], dgds[8]);
  madd_neon(&sum_m[4], src[1], dgds[9]);
  madd_neon(&sum_m[5], src[0], dgds[10]);
  madd_neon(&sum_m[5], src[1], dgds[11]);
  madd_neon(&sum_m[6], src[0], dgds[12]);
  madd_neon(&sum_m[6], src[1], dgds[13]);

  madd_neon(&sum_h[0], dgd[0], dgds[0]);
  madd_neon(&sum_h[0], dgd[1], dgds[1]);
  madd_neon(&sum_h[1], dgd[0], dgds[2]);
  madd_neon(&sum_h[1], dgd[1], dgds[3]);
  madd_neon(&sum_h[2], dgd[0], dgds[4]);
  madd_neon(&sum_h[2], dgd[1], dgds[5]);
  madd_neon(&sum_h[3], dgd[0], dgds[6]);
  madd_neon(&sum_h[3], dgd[1], dgds[7]);
  madd_neon(&sum_h[4], dgd[0], dgds[8]);
  madd_neon(&sum_h[4], dgd[1], dgds[9]);
  madd_neon(&sum_h[5], dgd[0], dgds[10]);
  madd_neon(&sum_h[5], dgd[1], dgds[11]);
  madd_neon(&sum_h[6], dgd[0], dgds[12]);
  madd_neon(&sum_h[6], dgd[1], dgds[13]);
}

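// Update the 6x6 grid of delta accumulators for the off-diagonal (square)
// part of the 7-tap window: deltas[i][j] += d_ie * d_je - d_is * d_js (see
// load_square_win7_neon).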
static inline void derive_square_win7_neon(const int16x8_t *d_is,
                                           const int16x8_t *d_ie,
                                           const int16x8_t *d_js,
                                           const int16x8_t *d_je,
                                           int32x4_t deltas[][WIN_7]) {
  msub_neon(&deltas[0][0], d_is[0], d_js[0]);
  msub_neon(&deltas[0][0], d_is[1], d_js[1]);
  msub_neon(&deltas[0][1], d_is[0], d_js[2]);
  msub_neon(&deltas[0][1], d_is[1], d_js[3]);
  msub_neon(&deltas[0][2], d_is[0], d_js[4]);
  msub_neon(&deltas[0][2], d_is[1], d_js[5]);
  msub_neon(&deltas[0][3], d_is[0], d_js[6]);
  msub_neon(&deltas[0][3], d_is[1], d_js[7]);
  msub_neon(&deltas[0][4], d_is[0], d_js[8]);
  msub_neon(&deltas[0][4], d_is[1], d_js[9]);
  msub_neon(&deltas[0][5], d_is[0], d_js[10]);
  msub_neon(&deltas[0][5], d_is[1], d_js[11]);

  msub_neon(&deltas[1][0], d_is[2], d_js[0]);
  msub_neon(&deltas[1][0], d_is[3], d_js[1]);
  msub_neon(&deltas[1][1], d_is[2], d_js[2]);
  msub_neon(&deltas[1][1], d_is[3], d_js[3]);
  msub_neon(&deltas[1][2], d_is[2], d_js[4]);
  msub_neon(&deltas[1][2], d_is[3], d_js[5]);
  msub_neon(&deltas[1][3], d_is[2], d_js[6]);
  msub_neon(&deltas[1][3], d_is[3], d_js[7]);
  msub_neon(&deltas[1][4], d_is[2], d_js[8]);
  msub_neon(&deltas[1][4], d_is[3], d_js[9]);
  msub_neon(&deltas[1][5], d_is[2], d_js[10]);
  msub_neon(&deltas[1][5], d_is[3], d_js[11]);

  msub_neon(&deltas[2][0], d_is[4], d_js[0]);
  msub_neon(&deltas[2][0], d_is[5], d_js[1]);
  msub_neon(&deltas[2][1], d_is[4], d_js[2]);
  msub_neon(&deltas[2][1], d_is[5], d_js[3]);
  msub_neon(&deltas[2][2], d_is[4], d_js[4]);
  msub_neon(&deltas[2][2], d_is[5], d_js[5]);
  msub_neon(&deltas[2][3], d_is[4], d_js[6]);
  msub_neon(&deltas[2][3], d_is[5], d_js[7]);
  msub_neon(&deltas[2][4], d_is[4], d_js[8]);
  msub_neon(&deltas[2][4], d_is[5], d_js[9]);
  msub_neon(&deltas[2][5], d_is[4], d_js[10]);
  msub_neon(&deltas[2][5], d_is[5], d_js[11]);

  msub_neon(&deltas[3][0], d_is[6], d_js[0]);
  msub_neon(&deltas[3][0], d_is[7], d_js[1]);
  msub_neon(&deltas[3][1], d_is[6], d_js[2]);
  msub_neon(&deltas[3][1], d_is[7], d_js[3]);
  msub_neon(&deltas[3][2], d_is[6], d_js[4]);
  msub_neon(&deltas[3][2], d_is[7], d_js[5]);
  msub_neon(&deltas[3][3], d_is[6], d_js[6]);
  msub_neon(&deltas[3][3], d_is[7], d_js[7]);
  msub_neon(&deltas[3][4], d_is[6], d_js[8]);
  msub_neon(&deltas[3][4], d_is[7], d_js[9]);
  msub_neon(&deltas[3][5], d_is[6], d_js[10]);
  msub_neon(&deltas[3][5], d_is[7], d_js[11]);

  msub_neon(&deltas[4][0], d_is[8], d_js[0]);
  msub_neon(&deltas[4][0], d_is[9], d_js[1]);
  msub_neon(&deltas[4][1], d_is[8], d_js[2]);
  msub_neon(&deltas[4][1], d_is[9], d_js[3]);
  msub_neon(&deltas[4][2], d_is[8], d_js[4]);
  msub_neon(&deltas[4][2], d_is[9], d_js[5]);
  msub_neon(&deltas[4][3], d_is[8], d_js[6]);
  msub_neon(&deltas[4][3], d_is[9], d_js[7]);
  msub_neon(&deltas[4][4], d_is[8], d_js[8]);
  msub_neon(&deltas[4][4], d_is[9], d_js[9]);
  msub_neon(&deltas[4][5], d_is[8], d_js[10]);
  msub_neon(&deltas[4][5], d_is[9], d_js[11]);

  msub_neon(&deltas[5][0], d_is[10], d_js[0]);
  msub_neon(&deltas[5][0], d_is[11], d_js[1]);
  msub_neon(&deltas[5][1], d_is[10], d_js[2]);
  msub_neon(&deltas[5][1], d_is[11], d_js[3]);
  msub_neon(&deltas[5][2], d_is[10], d_js[4]);
  msub_neon(&deltas[5][2], d_is[11], d_js[5]);
  msub_neon(&deltas[5][3], d_is[10], d_js[6]);
  msub_neon(&deltas[5][3], d_is[11], d_js[7]);
  msub_neon(&deltas[5][4], d_is[10], d_js[8]);
  msub_neon(&deltas[5][4], d_is[11], d_js[9]);
  msub_neon(&deltas[5][5], d_is[10], d_js[10]);
  msub_neon(&deltas[5][5], d_is[11], d_js[11]);

  madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
  madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
  madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
  madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
  madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
  madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
  madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
  madd_neon(&deltas[0][3], d_ie[1], d_je[7]);
  madd_neon(&deltas[0][4], d_ie[0], d_je[8]);
  madd_neon(&deltas[0][4], d_ie[1], d_je[9]);
  madd_neon(&deltas[0][5], d_ie[0], d_je[10]);
  madd_neon(&deltas[0][5], d_ie[1], d_je[11]);

  madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
  madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
  madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
  madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
  madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
  madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
  madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
  madd_neon(&deltas[1][3], d_ie[3], d_je[7]);
  madd_neon(&deltas[1][4], d_ie[2], d_je[8]);
  madd_neon(&deltas[1][4], d_ie[3], d_je[9]);
  madd_neon(&deltas[1][5], d_ie[2], d_je[10]);
  madd_neon(&deltas[1][5], d_ie[3], d_je[11]);

  madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
  madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
  madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
  madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
  madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
  madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
  madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
  madd_neon(&deltas[2][3], d_ie[5], d_je[7]);
  madd_neon(&deltas[2][4], d_ie[4], d_je[8]);
  madd_neon(&deltas[2][4], d_ie[5], d_je[9]);
  madd_neon(&deltas[2][5], d_ie[4], d_je[10]);
  madd_neon(&deltas[2][5], d_ie[5], d_je[11]);

  madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
  madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
  madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
  madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
  madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
  madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
  madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
  madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
  madd_neon(&deltas[3][4], d_ie[6], d_je[8]);
  madd_neon(&deltas[3][4], d_ie[7], d_je[9]);
  madd_neon(&deltas[3][5], d_ie[6], d_je[10]);
  madd_neon(&deltas[3][5], d_ie[7], d_je[11]);

  madd_neon(&deltas[4][0], d_ie[8], d_je[0]);
  madd_neon(&deltas[4][0], d_ie[9], d_je[1]);
  madd_neon(&deltas[4][1], d_ie[8], d_je[2]);
  madd_neon(&deltas[4][1], d_ie[9], d_je[3]);
  madd_neon(&deltas[4][2], d_ie[8], d_je[4]);
  madd_neon(&deltas[4][2], d_ie[9], d_je[5]);
  madd_neon(&deltas[4][3], d_ie[8], d_je[6]);
  madd_neon(&deltas[4][3], d_ie[9], d_je[7]);
  madd_neon(&deltas[4][4], d_ie[8], d_je[8]);
  madd_neon(&deltas[4][4], d_ie[9], d_je[9]);
  madd_neon(&deltas[4][5], d_ie[8], d_je[10]);
  madd_neon(&deltas[4][5], d_ie[9], d_je[11]);

  madd_neon(&deltas[5][0], d_ie[10], d_je[0]);
  madd_neon(&deltas[5][0], d_ie[11], d_je[1]);
  madd_neon(&deltas[5][1], d_ie[10], d_je[2]);
  madd_neon(&deltas[5][1], d_ie[11], d_je[3]);
  madd_neon(&deltas[5][2], d_ie[10], d_je[4]);
  madd_neon(&deltas[5][2], d_ie[11], d_je[5]);
  madd_neon(&deltas[5][3], d_ie[10], d_je[6]);
  madd_neon(&deltas[5][3], d_ie[11], d_je[7]);
  madd_neon(&deltas[5][4], d_ie[10], d_je[8]);
  madd_neon(&deltas[5][4], d_ie[11], d_je[9]);
  madd_neon(&deltas[5][5], d_ie[10], d_je[10]);
  madd_neon(&deltas[5][5], d_ie[11], d_je[11]);
}

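// dst[i] = src[i] + delta[i] for eight 64-bit statistics.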
static inline void update_8_stats_neon(const int64_t *const src,
                                       const int32x4_t delta0,
                                       const int32x4_t delta1,
                                       int64_t *const dst) {
  update_4_stats_neon(src + 0, delta0, dst + 0);
  update_4_stats_neon(src + 4, delta1, dst + 4);
}

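// Load the rows used by derive_square_win7_neon: the first six rows of the
// i- and j-windows (d_is, d_js) and the six rows `height` rows further down
// (d_ie, d_je), 16 samples per row.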
static inline void load_square_win7_neon(const int16_t *const di,
                                         const int16_t *const dj,
                                         const int32_t d_stride,
                                         const int32_t height, int16x8_t *d_is,
                                         int16x8_t *d_ie, int16x8_t *d_js,
                                         int16x8_t *d_je) {
  load_s16_8x6(di + 0, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6],
               &d_is[8], &d_is[10]);
  load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7],
               &d_is[9], &d_is[11]);
  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
               &d_js[8], &d_js[10]);
  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
               &d_js[9], &d_js[11]);

  load_s16_8x6(di + height * d_stride + 0, d_stride, &d_ie[0], &d_ie[2],
               &d_ie[4], &d_ie[6], &d_ie[8], &d_ie[10]);
  load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]);
  load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6], &d_je[8], &d_je[10]);
  load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7], &d_je[9], &d_je[11]);
}

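// Load the rows used by derive_triangle_win7_neon: the first six rows of the
// window (d_is) and the six rows `height` rows further down (d_ie).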
static inline void load_triangle_win7_neon(const int16_t *const di,
                                           const int32_t d_stride,
                                           const int32_t height,
                                           int16x8_t *d_is, int16x8_t *d_ie) {
  load_s16_8x6(di, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6], &d_is[8],
               &d_is[10]);
  load_s16_8x6(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7],
               &d_is[9], &d_is[11]);

  load_s16_8x6(di + height * d_stride, d_stride, &d_ie[0], &d_ie[2], &d_ie[4],
               &d_ie[6], &d_ie[8], &d_ie[10]);
  load_s16_8x6(di + height * d_stride + 8, d_stride, &d_ie[1], &d_ie[3],
               &d_ie[5], &d_ie[7], &d_ie[9], &d_ie[11]);
}

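// Accumulate statistics for the left edge of the 7-tap window:
// sum[k] += src * d_row_(k+1) for rows 1..6.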
static inline void stats_left_win7_neon(const int16x8_t src[2],
                                        const int16_t *d,
                                        const int32_t d_stride,
                                        int32x4_t *sum) {
  int16x8_t dgds[WIN_7];

  load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6], &dgds[8], &dgds[10]);
  load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7], &dgds[9], &dgds[11]);

  madd_neon(&sum[0], src[0], dgds[0]);
  madd_neon(&sum[0], src[1], dgds[1]);
  madd_neon(&sum[1], src[0], dgds[2]);
  madd_neon(&sum[1], src[1], dgds[3]);
  madd_neon(&sum[2], src[0], dgds[4]);
  madd_neon(&sum[2], src[1], dgds[5]);
  madd_neon(&sum[3], src[0], dgds[6]);
  madd_neon(&sum[3], src[1], dgds[7]);
  madd_neon(&sum[4], src[0], dgds[8]);
  madd_neon(&sum[4], src[1], dgds[9]);
  madd_neon(&sum[5], src[0], dgds[10]);
  madd_neon(&sum[5], src[1], dgds[11]);
}

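// Step 3 accumulation for the 7-tap window, processing one full row (two
// 8-lane loads) per iteration.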
static inline void step3_win7_neon(const int16_t *d, const int32_t d_stride,
                                   const int32_t width, const int32_t height,
                                   int16x8_t *ds, int32x4_t *deltas) {
  int32_t y = height;
  do {
    ds[12] = vld1q_s16(d);
    ds[13] = vld1q_s16(d + width);

    compute_delta_step3(&deltas[0], &deltas[4], ds[0], ds[1], ds[0], ds[1]);
    compute_delta_step3(&deltas[1], &deltas[5], ds[0], ds[1], ds[2], ds[3]);
    compute_delta_step3(&deltas[2], &deltas[6], ds[0], ds[1], ds[4], ds[5]);
    compute_delta_step3(&deltas[3], &deltas[7], ds[0], ds[1], ds[6], ds[7]);
    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);
    compute_delta_step3(&deltas[9], &deltas[13], ds[0], ds[1], ds[10], ds[11]);
    compute_delta_step3(&deltas[10], &deltas[14], ds[0], ds[1], ds[12], ds[13]);

    ds[0] = ds[2];
    ds[1] = ds[3];
    ds[2] = ds[4];
    ds[3] = ds[5];
    ds[4] = ds[6];
    ds[5] = ds[7];
    ds[6] = ds[8];
    ds[7] = ds[9];
    ds[8] = ds[10];
    ds[9] = ds[11];
    ds[10] = ds[12];
    ds[11] = ds[13];

    d += d_stride;
  } while (--y);
}

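// Diagonal (i == j) counterpart of derive_square_win7_neon: update the 21
// delta accumulators for the unique row pairs of a single window (see
// load_triangle_win7_neon).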
static inline void derive_triangle_win7_neon(const int16x8_t *d_is,
                                             const int16x8_t *d_ie,
                                             int32x4_t *deltas) {
  msub_neon(&deltas[0], d_is[0], d_is[0]);
  msub_neon(&deltas[0], d_is[1], d_is[1]);
  msub_neon(&deltas[1], d_is[0], d_is[2]);
  msub_neon(&deltas[1], d_is[1], d_is[3]);
  msub_neon(&deltas[2], d_is[0], d_is[4]);
  msub_neon(&deltas[2], d_is[1], d_is[5]);
  msub_neon(&deltas[3], d_is[0], d_is[6]);
  msub_neon(&deltas[3], d_is[1], d_is[7]);
  msub_neon(&deltas[4], d_is[0], d_is[8]);
  msub_neon(&deltas[4], d_is[1], d_is[9]);
  msub_neon(&deltas[5], d_is[0], d_is[10]);
  msub_neon(&deltas[5], d_is[1], d_is[11]);

  msub_neon(&deltas[6], d_is[2], d_is[2]);
  msub_neon(&deltas[6], d_is[3], d_is[3]);
  msub_neon(&deltas[7], d_is[2], d_is[4]);
  msub_neon(&deltas[7], d_is[3], d_is[5]);
  msub_neon(&deltas[8], d_is[2], d_is[6]);
  msub_neon(&deltas[8], d_is[3], d_is[7]);
  msub_neon(&deltas[9], d_is[2], d_is[8]);
  msub_neon(&deltas[9], d_is[3], d_is[9]);
  msub_neon(&deltas[10], d_is[2], d_is[10]);
  msub_neon(&deltas[10], d_is[3], d_is[11]);

  msub_neon(&deltas[11], d_is[4], d_is[4]);
  msub_neon(&deltas[11], d_is[5], d_is[5]);
  msub_neon(&deltas[12], d_is[4], d_is[6]);
  msub_neon(&deltas[12], d_is[5], d_is[7]);
  msub_neon(&deltas[13], d_is[4], d_is[8]);
  msub_neon(&deltas[13], d_is[5], d_is[9]);
  msub_neon(&deltas[14], d_is[4], d_is[10]);
  msub_neon(&deltas[14], d_is[5], d_is[11]);

  msub_neon(&deltas[15], d_is[6], d_is[6]);
  msub_neon(&deltas[15], d_is[7], d_is[7]);
  msub_neon(&deltas[16], d_is[6], d_is[8]);
  msub_neon(&deltas[16], d_is[7], d_is[9]);
  msub_neon(&deltas[17], d_is[6], d_is[10]);
  msub_neon(&deltas[17], d_is[7], d_is[11]);

  msub_neon(&deltas[18], d_is[8], d_is[8]);
  msub_neon(&deltas[18], d_is[9], d_is[9]);
  msub_neon(&deltas[19], d_is[8], d_is[10]);
  msub_neon(&deltas[19], d_is[9], d_is[11]);

  msub_neon(&deltas[20], d_is[10], d_is[10]);
  msub_neon(&deltas[20], d_is[11], d_is[11]);

  madd_neon(&deltas[0], d_ie[0], d_ie[0]);
  madd_neon(&deltas[0], d_ie[1], d_ie[1]);
  madd_neon(&deltas[1], d_ie[0], d_ie[2]);
  madd_neon(&deltas[1], d_ie[1], d_ie[3]);
  madd_neon(&deltas[2], d_ie[0], d_ie[4]);
  madd_neon(&deltas[2], d_ie[1], d_ie[5]);
  madd_neon(&deltas[3], d_ie[0], d_ie[6]);
  madd_neon(&deltas[3], d_ie[1], d_ie[7]);
  madd_neon(&deltas[4], d_ie[0], d_ie[8]);
  madd_neon(&deltas[4], d_ie[1], d_ie[9]);
  madd_neon(&deltas[5], d_ie[0], d_ie[10]);
  madd_neon(&deltas[5], d_ie[1], d_ie[11]);

  madd_neon(&deltas[6], d_ie[2], d_ie[2]);
  madd_neon(&deltas[6], d_ie[3], d_ie[3]);
  madd_neon(&deltas[7], d_ie[2], d_ie[4]);
  madd_neon(&deltas[7], d_ie[3], d_ie[5]);
  madd_neon(&deltas[8], d_ie[2], d_ie[6]);
  madd_neon(&deltas[8], d_ie[3], d_ie[7]);
  madd_neon(&deltas[9], d_ie[2], d_ie[8]);
  madd_neon(&deltas[9], d_ie[3], d_ie[9]);
  madd_neon(&deltas[10], d_ie[2], d_ie[10]);
  madd_neon(&deltas[10], d_ie[3], d_ie[11]);

  madd_neon(&deltas[11], d_ie[4], d_ie[4]);
  madd_neon(&deltas[11], d_ie[5], d_ie[5]);
  madd_neon(&deltas[12], d_ie[4], d_ie[6]);
  madd_neon(&deltas[12], d_ie[5], d_ie[7]);
  madd_neon(&deltas[13], d_ie[4], d_ie[8]);
  madd_neon(&deltas[13], d_ie[5], d_ie[9]);
  madd_neon(&deltas[14], d_ie[4], d_ie[10]);
  madd_neon(&deltas[14], d_ie[5], d_ie[11]);

  madd_neon(&deltas[15], d_ie[6], d_ie[6]);
  madd_neon(&deltas[15], d_ie[7], d_ie[7]);
  madd_neon(&deltas[16], d_ie[6], d_ie[8]);
  madd_neon(&deltas[16], d_ie[7], d_ie[9]);
  madd_neon(&deltas[17], d_ie[6], d_ie[10]);
  madd_neon(&deltas[17], d_ie[7], d_ie[11]);

  madd_neon(&deltas[18], d_ie[8], d_ie[8]);
  madd_neon(&deltas[18], d_ie[9], d_ie[9]);
  madd_neon(&deltas[19], d_ie[8], d_ie[10]);
  madd_neon(&deltas[19], d_ie[9], d_ie[11]);

  madd_neon(&deltas[20], d_ie[10], d_ie[10]);
  madd_neon(&deltas[20], d_ie[11], d_ie[11]);
}

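// H is accumulated as an upper-triangular matrix; mirror the upper triangle
// into the lower triangle by transposing 4x4 blocks.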
static inline void diagonal_copy_stats_neon(const int32_t wiener_win2,
                                            int64_t *const H) {
  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
    int64x2_t in[8], out[8];

    in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 1);
    in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 3);
    in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1);
    in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 3);
    in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 1);
    in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 3);
    in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 1);
    in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 3);

    transpose_arrays_s64_4x4(in, out);

    vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(out[0]));
    vst1q_s64(H + (i + 2) * wiener_win2 + i, out[2]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i, out[4]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i, out[6]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);

    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
      in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + j);
      in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + j + 2);
      in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + j);
      in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + j + 2);
      in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + j);
      in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + j + 2);
      in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + j);
      in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + j + 2);

      transpose_arrays_s64_4x4(in, out);

      vst1q_s64(H + (j + 0) * wiener_win2 + i, out[0]);
      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i, out[2]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i, out[4]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i, out[6]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
    }
  }
}

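// Divide each 64-bit lane by 4, rounding towards zero: shift the absolute
// value right by 2, then re-apply the sign.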
static inline int64x2_t div4_neon(const int64x2_t src) {
#if AOM_ARCH_AARCH64
  uint64x2_t sign = vcltzq_s64(src);
  int64x2_t abs = vabsq_s64(src);
  // divide by 4
  abs = vshrq_n_s64(abs, 2);
  // re-apply sign
  return vbslq_s64(sign, vnegq_s64(abs), abs);
#else
  int64x2_t sign = vshrq_n_s64(src, 63);
  int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign);
  // divide by 4
  abs = vshrq_n_s64(abs, 2);
  // re-apply sign
  return vsubq_s64(veorq_s64(abs, sign), sign);
#endif  // AOM_ARCH_AARCH64
}

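// Load a 4x4 block of H, divide each element by 4 (rounding towards zero),
// store it back, and also return the quotients in out[] for a subsequent
// transpose.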
static inline void div4_4x4_neon(const int32_t wiener_win2, int64_t *const H,
                                 int64x2_t out[8]) {
  out[0] = vld1q_s64(H + 0 * wiener_win2 + 0);
  out[1] = vld1q_s64(H + 0 * wiener_win2 + 2);
  out[2] = vld1q_s64(H + 1 * wiener_win2 + 0);
  out[3] = vld1q_s64(H + 1 * wiener_win2 + 2);
  out[4] = vld1q_s64(H + 2 * wiener_win2 + 0);
  out[5] = vld1q_s64(H + 2 * wiener_win2 + 2);
  out[6] = vld1q_s64(H + 3 * wiener_win2 + 0);
  out[7] = vld1q_s64(H + 3 * wiener_win2 + 2);

  out[0] = div4_neon(out[0]);
  out[1] = div4_neon(out[1]);
  out[2] = div4_neon(out[2]);
  out[3] = div4_neon(out[3]);
  out[4] = div4_neon(out[4]);
  out[5] = div4_neon(out[5]);
  out[6] = div4_neon(out[6]);
  out[7] = div4_neon(out[7]);

  vst1q_s64(H + 0 * wiener_win2 + 0, out[0]);
  vst1q_s64(H + 0 * wiener_win2 + 2, out[1]);
  vst1q_s64(H + 1 * wiener_win2 + 0, out[2]);
  vst1q_s64(H + 1 * wiener_win2 + 2, out[3]);
  vst1q_s64(H + 2 * wiener_win2 + 0, out[4]);
  vst1q_s64(H + 2 * wiener_win2 + 2, out[5]);
  vst1q_s64(H + 3 * wiener_win2 + 0, out[6]);
  vst1q_s64(H + 3 * wiener_win2 + 2, out[7]);
}

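// Divide each 64-bit lane by 16, rounding towards zero: shift the absolute
// value right by 4, then re-apply the sign.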
static inline int64x2_t div16_neon(const int64x2_t src) {
#if AOM_ARCH_AARCH64
  uint64x2_t sign = vcltzq_s64(src);
  int64x2_t abs = vabsq_s64(src);
  // divide by 16
  abs = vshrq_n_s64(abs, 4);
  // re-apply sign
  return vbslq_s64(sign, vnegq_s64(abs), abs);
#else
  int64x2_t sign = vshrq_n_s64(src, 63);
  int64x2_t abs = vsubq_s64(veorq_s64(src, sign), sign);
  // divide by 16
  abs = vshrq_n_s64(abs, 4);
  // re-apply sign
  return vsubq_s64(veorq_s64(abs, sign), sign);
#endif  // AOM_ARCH_AARCH64
}

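// Load a 4x4 block of H, divide each element by 16 (rounding towards zero),
// store it back, and also return the quotients in out[] for a subsequent
// transpose.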
static inline void div16_4x4_neon(const int32_t wiener_win2, int64_t *const H,
                                  int64x2_t out[8]) {
  out[0] = vld1q_s64(H + 0 * wiener_win2 + 0);
  out[1] = vld1q_s64(H + 0 * wiener_win2 + 2);
  out[2] = vld1q_s64(H + 1 * wiener_win2 + 0);
  out[3] = vld1q_s64(H + 1 * wiener_win2 + 2);
  out[4] = vld1q_s64(H + 2 * wiener_win2 + 0);
  out[5] = vld1q_s64(H + 2 * wiener_win2 + 2);
  out[6] = vld1q_s64(H + 3 * wiener_win2 + 0);
  out[7] = vld1q_s64(H + 3 * wiener_win2 + 2);

  out[0] = div16_neon(out[0]);
  out[1] = div16_neon(out[1]);
  out[2] = div16_neon(out[2]);
  out[3] = div16_neon(out[3]);
  out[4] = div16_neon(out[4]);
  out[5] = div16_neon(out[5]);
  out[6] = div16_neon(out[6]);
  out[7] = div16_neon(out[7]);

  vst1q_s64(H + 0 * wiener_win2 + 0, out[0]);
  vst1q_s64(H + 0 * wiener_win2 + 2, out[1]);
  vst1q_s64(H + 1 * wiener_win2 + 0, out[2]);
  vst1q_s64(H + 1 * wiener_win2 + 2, out[3]);
  vst1q_s64(H + 2 * wiener_win2 + 0, out[4]);
  vst1q_s64(H + 2 * wiener_win2 + 2, out[5]);
  vst1q_s64(H + 3 * wiener_win2 + 0, out[6]);
  vst1q_s64(H + 3 * wiener_win2 + 2, out[7]);
}

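// Divide the upper triangle of H by 4 in place (rounding towards zero) and
// mirror the quotients into the lower triangle, transposing 4x4 blocks.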
static inline void div4_diagonal_copy_stats_neon(const int32_t wiener_win2,
                                                 int64_t *const H) {
  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
    int64x2_t in[8], out[8];

    div4_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
    transpose_arrays_s64_4x4(in, out);

    vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
    vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);

    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
      div4_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
      transpose_arrays_s64_4x4(in, out);

      vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
    }
  }
}

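// Divide the upper triangle of H by 16 in place (rounding towards zero) and
// mirror the quotients into the lower triangle, transposing 4x4 blocks.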
static inline void div16_diagonal_copy_stats_neon(const int32_t wiener_win2,
                                                  int64_t *const H) {
  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
    int64x2_t in[8], out[8];

    div16_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
    transpose_arrays_s64_4x4(in, out);

    vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
    vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
    vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
    vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);

    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
      div16_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
      transpose_arrays_s64_4x4(in, out);

      vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
      vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
      vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
      vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
      vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
    }
  }
}

#endif  // AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_