xref: /aosp_15_r20/external/libaom/av1/encoder/arm/pickrst_neon.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
16*77c1e3ccSAndroid Build Coastguard Worker 
17*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/restoration.h"
18*77c1e3ccSAndroid Build Coastguard Worker 
// Twice (window size - 1) for the luma and chroma Wiener windows. The win5
// helpers in this file size their scratch arrays with WIN_CHROMA: two
// int16x8_t vectors for each of (WIENER_WIN_CHROMA - 1) rows.
// NOTE(review): WIN_7 is presumably the equivalent count for the 7x7 window;
// confirm against its users.
#define WIN_7 ((WIENER_WIN - 1) * 2)
#define WIN_CHROMA ((WIENER_WIN_CHROMA - 1) * 2)
21*77c1e3ccSAndroid Build Coastguard Worker 
// Aligned sizes for Wiener filters.
// The *_ALIGN2 / *_ALIGN3 variants pass the element count through
// ALIGN_POWER_OF_TWO with n = 2 / n = 3 (presumably rounding up to a multiple
// of 4 / 8 - confirm against the macro's definition). The H update helpers
// below use the ALIGN2 sizes as the row stride of the H matrix.
#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
// Number of taps in the reduced (square) window.
#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
28*77c1e3ccSAndroid Build Coastguard Worker 
// Compute 8 values of M (cross correlation) for a single source pixel and
// accumulate.
//
//   M_s32   - 8 consecutive 32-bit accumulators, updated in place.
//   src_avg - 4-lane multiplier; lane k scales lanes k and k + 4 of dgd_avg.
//             (Presumably the caller broadcasts one source value to all
//             lanes - confirm at the call site.)
//   dgd_avg - 8 window samples.
//
// Effect: M_s32[k] += dgd_avg[k] * src_avg[k % 4] for k = 0..7, using widening
// 16x16 -> 32-bit multiply-accumulates.
static inline void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
                                   int16x8_t dgd_avg) {
  int32x4_t lo = vld1q_s32(M_s32 + 0);
  int32x4_t hi = vld1q_s32(M_s32 + 4);

  lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);

  vst1q_s32(M_s32 + 0, lo);
  vst1q_s32(M_s32 + 4, hi);
}
42*77c1e3ccSAndroid Build Coastguard Worker 
// Compute 8 values of M (cross correlation) for two source pixels and
// accumulate.
//
//   M_s32              - 8 consecutive 32-bit accumulators, updated in place.
//   src_avg0, src_avg1 - 4-lane multipliers for the first / second pixel;
//                        lane k scales lanes k and k + 4 of the matching
//                        dgd_avg vector.
//   dgd_avg0, dgd_avg1 - 8 window samples for the first / second pixel.
//
// Effect: M_s32[k] += dgd_avg0[k] * src_avg0[k % 4]
//                   + dgd_avg1[k] * src_avg1[k % 4]   for k = 0..7.
// Both pixels share one load/store of M_s32.
static inline void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
                                    int16x4_t src_avg1, int16x8_t dgd_avg0,
                                    int16x8_t dgd_avg1) {
  int32x4_t lo = vld1q_s32(M_s32 + 0);
  int32x4_t hi = vld1q_s32(M_s32 + 4);

  lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0);
  lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1);
  hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1);

  vst1q_s32(M_s32 + 0, lo);
  vst1q_s32(M_s32 + 4, hi);
}
59*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulate the products dgd_avg[i] * dgd_avg[j] of one pixel's window
// samples into the row-major matrix H_s32 (row stride = width).
//
//   H_s32   - 32-bit accumulator matrix, updated in place.
//   dgd_avg - window samples; read four at a time.
//   width   - row stride of H_s32 and upper bound for the column index.
//   height  - upper bound for the row index.
//
// Rows are processed in groups of four. For each group starting at row i only
// the 4x4 tiles whose first column j satisfies j >= i are updated, i.e. the
// tiles on or to the right of the diagonal. Because every load and store
// covers four elements, the buffers must be valid up to the next multiple of
// four past width / height.
static inline void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
                                   int width, int height) {
  for (int i = 0; i < height; i += 4) {
    int16x4_t di = vld1_s16(dgd_avg + i);

    for (int j = i; j < width; j += 4) {
      int16x4_t dj = vld1_s16(dgd_avg + j);
      int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j);
      int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j);
      int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j);
      int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j);

      // Row r of the tile is scaled by lane r of di.
      h0 = vmlal_lane_s16(h0, dj, di, 0);
      h1 = vmlal_lane_s16(h1, dj, di, 1);
      h2 = vmlal_lane_s16(h2, dj, di, 2);
      h3 = vmlal_lane_s16(h3, dj, di, 3);

      vst1q_s32(H_s32 + 0 * width + j, h0);
      vst1q_s32(H_s32 + 1 * width + j, h1);
      vst1q_s32(H_s32 + 2 * width + j, h2);
      vst1q_s32(H_s32 + 3 * width + j, h3);
    }
    // Advance to the next group of four rows.
    H_s32 += 4 * width;
  }
}
85*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulate, for two pixels at once, the products of their window samples
// into the row-major matrix H_s32 with row stride
// WIENER_WIN2_REDUCED_ALIGN2.
//
//   H_s32              - 32-bit accumulator matrix, updated in place.
//   dgd_avg0, dgd_avg1 - window samples of the first / second pixel; read
//                        four at a time.
//
// Rows 0..23 are processed in groups of four. For the group starting at row i
// every 4x4 tile whose first column j is >= i (up to the aligned stride) gets
//   H[r][c] += dgd_avg0[r] * dgd_avg0[c] + dgd_avg1[r] * dgd_avg1[c].
// NOTE(review): rows from 24 onward are not touched here - presumably the
// caller handles the remaining row separately; confirm.
static inline void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
                                        const int16_t *dgd_avg1) {
  for (int i = 0; i < 24; i += 4) {
    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
    int16x4_t di1 = vld1_s16(dgd_avg1 + i);

    for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
      int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
      int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);

      // Row r of the tile is scaled by lane r of di0 / di1.
      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
      h3 = vmlal_lane_s16(h3, dj1, di1, 3);

      vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0);
      vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1);
      vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2);
      vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3);
    }
    // Advance to the next group of four rows.
    H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2;
  }
}
117*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulate, for two pixels at once, the products of their window samples
// into the row-major matrix H_s32 with row stride WIENER_WIN2_ALIGN2.
//
//   H_s32              - 32-bit accumulator matrix, updated in place.
//   dgd_avg0, dgd_avg1 - window samples of the first / second pixel; read
//                        four at a time.
//
// Rows 0..47 are processed in groups of four. For the group starting at row i
// the diagonal 4x4 tile (columns i..i+3) is updated first, reusing the
// already-loaded di0 / di1 as the column operand; the inner loop then covers
// the tiles to its right. Each touched element receives
//   H[r][c] += dgd_avg0[r] * dgd_avg0[c] + dgd_avg1[r] * dgd_avg1[c].
// NOTE(review): rows from 48 onward are not touched here - presumably the
// caller handles the remaining row separately; confirm.
static inline void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
                                        const int16_t *dgd_avg1) {
  for (int i = 0; i < 48; i += 4) {
    int16x4_t di0 = vld1_s16(dgd_avg0 + i);
    int16x4_t di1 = vld1_s16(dgd_avg1 + i);

    // Diagonal tile: column samples are the same vectors as the row samples.
    int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i);
    int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i);

    h0 = vmlal_lane_s16(h0, di0, di0, 0);
    h0 = vmlal_lane_s16(h0, di1, di1, 0);
    h1 = vmlal_lane_s16(h1, di0, di0, 1);
    h1 = vmlal_lane_s16(h1, di1, di1, 1);
    h2 = vmlal_lane_s16(h2, di0, di0, 2);
    h2 = vmlal_lane_s16(h2, di1, di1, 2);
    h3 = vmlal_lane_s16(h3, di0, di0, 3);
    h3 = vmlal_lane_s16(h3, di1, di1, 3);

    vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0);
    vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1);
    vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2);
    vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3);

    // Tiles to the right of the diagonal.
    for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) {
      int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
      int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
      h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j);
      h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j);
      h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j);
      h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j);

      h0 = vmlal_lane_s16(h0, dj0, di0, 0);
      h0 = vmlal_lane_s16(h0, dj1, di1, 0);
      h1 = vmlal_lane_s16(h1, dj0, di0, 1);
      h1 = vmlal_lane_s16(h1, dj1, di1, 1);
      h2 = vmlal_lane_s16(h2, dj0, di0, 2);
      h2 = vmlal_lane_s16(h2, dj1, di1, 2);
      h3 = vmlal_lane_s16(h3, dj0, di0, 3);
      h3 = vmlal_lane_s16(h3, dj1, di1, 3);

      vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
      vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
      vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
      vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
    }
    // Advance to the next group of four rows.
    H_s32 += 4 * WIENER_WIN2_ALIGN2;
  }
}
168*77c1e3ccSAndroid Build Coastguard Worker 
// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data.
//
//   dst    - 64-bit accumulators, updated in place.
//   src    - 32-bit partial sums; each processed element is reset to zero.
//   length - number of elements to process.
//
// Elements are handled four at a time in a do/while loop, so at least four
// elements are always processed and the count is effectively rounded up to a
// multiple of four; both buffers must be sized accordingly.
static inline void accumulate_and_clear(int64_t *dst, int32_t *src,
                                        int length) {
  do {
    int32x4_t s32 = vld1q_s32(src);
    vst1q_s32(src, vdupq_n_s32(0));
    src += 4;

    int64x2_t d_lo = vld1q_s64(dst + 0);
    int64x2_t d_hi = vld1q_s64(dst + 2);

    // Sign-extend each 32-bit value to 64 bits while adding.
    d_lo = vaddw_s32(d_lo, vget_low_s32(s32));
    d_hi = vaddw_s32(d_hi, vget_high_s32(s32));

    vst1q_s64(dst + 0, d_lo);
    vst1q_s64(dst + 2, d_hi);

    dst += 4;
    length -= 4;
  } while (length > 0);
}
190*77c1e3ccSAndroid Build Coastguard Worker 
// clang-format off
// Constant pool to act as a mask to zero n top elements in an int16x8_t vector.
// The index we load from depends on n.
// The first 16 entries have all bits set and the last 16 are zero. All-ones is
// spelled -1 because 0xffff does not fit in int16_t: converting it is
// implementation-defined and triggers constant-conversion warnings.
static const int16_t mask_16bit[32] = {
  -1, -1, -1, -1, -1, -1, -1, -1,
  -1, -1, -1, -1, -1, -1, -1, -1,
   0,  0,  0,  0,  0,  0,  0,  0,
   0,  0,  0,  0,  0,  0,  0,  0,
};
// clang-format on
201*77c1e3ccSAndroid Build Coastguard Worker 
// Multiply src and dgd lane by lane (widening to 32 bits), reduce the two
// resulting 4-lane product vectors with horizontal_add_2d_s32() and add the
// reduced vector to *sum.
// NOTE(review): horizontal_add_2d_s32() is defined outside this file; it
// presumably performs a pairwise add of adjacent products - confirm.
static inline void madd_neon_pairwise(int32x4_t *sum, const int16x8_t src,
                                      const int16x8_t dgd) {
  const int32x4_t sd =
      horizontal_add_2d_s32(vmull_s16(vget_low_s16(src), vget_low_s16(dgd)),
                            vmull_s16(vget_high_s16(src), vget_high_s16(dgd)));
  *sum = vaddq_s32(*sum, sd);
}
209*77c1e3ccSAndroid Build Coastguard Worker 
// Widening multiply-accumulate of two 8-lane vectors into a 4-lane sum:
//   (*sum)[k] += src[k] * dgd[k] + src[k + 4] * dgd[k + 4]   for k = 0..3.
static inline void madd_neon(int32x4_t *sum, const int16x8_t src,
                             const int16x8_t dgd) {
  *sum = vmlal_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlal_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}
215*77c1e3ccSAndroid Build Coastguard Worker 
// Widening multiply-subtract of two 8-lane vectors from a 4-lane sum:
//   (*sum)[k] -= src[k] * dgd[k] + src[k + 4] * dgd[k + 4]   for k = 0..3.
static inline void msub_neon(int32x4_t *sum, const int16x8_t src,
                             const int16x8_t dgd) {
  *sum = vmlsl_s16(*sum, vget_low_s16(src), vget_low_s16(dgd));
  *sum = vmlsl_s16(*sum, vget_high_s16(src), vget_high_s16(dgd));
}
221*77c1e3ccSAndroid Build Coastguard Worker 
// Update two 4-lane delta accumulators by subtracting the src0 * dgd0
// products and adding the src1 * dgd1 products. The low halves of the inputs
// feed *sum0 and the high halves feed *sum1:
//   (*sum0)[k] += src1[k]     * dgd1[k]     - src0[k]     * dgd0[k]
//   (*sum1)[k] += src1[k + 4] * dgd1[k + 4] - src0[k + 4] * dgd0[k + 4]
// for k = 0..3.
static inline void compute_delta_step3(int32x4_t *sum0, int32x4_t *sum1,
                                       const int16x8_t src0,
                                       const int16x8_t src1,
                                       const int16x8_t dgd0,
                                       const int16x8_t dgd1) {
  *sum0 = vmlsl_s16(*sum0, vget_low_s16(src0), vget_low_s16(dgd0));
  *sum0 = vmlal_s16(*sum0, vget_low_s16(src1), vget_low_s16(dgd1));
  *sum1 = vmlsl_s16(*sum1, vget_high_s16(src0), vget_high_s16(dgd0));
  *sum1 = vmlal_s16(*sum1, vget_high_s16(src1), vget_high_s16(dgd1));
}
232*77c1e3ccSAndroid Build Coastguard Worker 
// Pack the four input vectors into an array and return the result of
// horizontal_add_4d_s32x4() on it.
// NOTE(review): that helper is defined outside this file; presumably lane k
// of the result is the sum of all lanes of the k-th input - confirm.
static inline int32x4_t hadd_four_32_neon(const int32x4_t src0,
                                          const int32x4_t src1,
                                          const int32x4_t src2,
                                          const int32x4_t src3) {
  int32x4_t src[4] = { src0, src1, src2, src3 };
  return horizontal_add_4d_s32x4(src);
}
240*77c1e3ccSAndroid Build Coastguard Worker 
// Add four 32-bit deltas to four 64-bit statistics:
//   dst[k] = src[k] + delta[k]   for k = 0..3,
// with each delta sign-extended to 64 bits before the addition.
//
//   src   - four 64-bit input values.
//   delta - four 32-bit increments.
//   dst   - receives the four 64-bit results.
static inline void update_4_stats_neon(const int64_t *const src,
                                       const int32x4_t delta,
                                       int64_t *const dst) {
  const int64x2_t s1 = vld1q_s64(src);
  const int64x2_t s2 = vld1q_s64(src + 2);

  const int64x2_t d1 = vaddw_s32(s1, vget_low_s32(delta));
  const int64x2_t d2 = vaddw_s32(s2, vget_high_s32(delta));

  vst1q_s64(dst, d1);
  vst1q_s64(dst + 2, d2);
}
253*77c1e3ccSAndroid Build Coastguard Worker 
// Slide two 8-lane windows forward by one sample.
//
//   src   - address of the new sample for the first window; the new sample
//           for the second window is read from src + width.
//   width - element offset between the two new samples.
//   org   - the two current windows.
//   dst   - receives the shifted windows: lanes 1..7 of org[n] followed by the
//           newly loaded sample in the last lane.
static inline void load_more_16_neon(const int16_t *const src,
                                     const int32_t width,
                                     const int16x8_t org[2], int16x8_t dst[2]) {
  // Broadcast each new sample; vextq only consumes lane 0 of it.
  int16x8_t s0 = vld1q_dup_s16(src);
  int16x8_t s1 = vld1q_dup_s16(src + width);
  dst[0] = vextq_s16(org[0], s0, 1);
  dst[1] = vextq_s16(org[1], s1, 1);
}
262*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulate M and H partial sums for five rows of a 5x5 window.
//
//   src      - two vectors of source samples (16 values).
//   dgd      - two vectors of reference dgd samples (16 values).
//   d        - start of the dgd rows to correlate against.
//   d_stride - stride passed to the row loader.
//   sum_m    - five accumulators; sum_m[r] gains the src x row-r products.
//   sum_h    - five accumulators; sum_h[r] gains the dgd x row-r products.
//
// dgds[2 * r] and dgds[2 * r + 1] hold the vectors loaded for row r from
// d + 0 and d + 8 respectively.
// NOTE(review): load_s16_8x5() is defined outside this file; it presumably
// loads 8 samples from each of 5 consecutive rows d_stride apart - confirm.
static inline void stats_top_win5_neon(const int16x8_t src[2],
                                       const int16x8_t dgd[2],
                                       const int16_t *const d,
                                       const int32_t d_stride, int32x4_t *sum_m,
                                       int32x4_t *sum_h) {
  int16x8_t dgds[WIENER_WIN_CHROMA * 2];

  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8]);
  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9]);

  // Cross-correlation terms (source against each row).
  madd_neon(&sum_m[0], src[0], dgds[0]);
  madd_neon(&sum_m[0], src[1], dgds[1]);
  madd_neon(&sum_m[1], src[0], dgds[2]);
  madd_neon(&sum_m[1], src[1], dgds[3]);
  madd_neon(&sum_m[2], src[0], dgds[4]);
  madd_neon(&sum_m[2], src[1], dgds[5]);
  madd_neon(&sum_m[3], src[0], dgds[6]);
  madd_neon(&sum_m[3], src[1], dgds[7]);
  madd_neon(&sum_m[4], src[0], dgds[8]);
  madd_neon(&sum_m[4], src[1], dgds[9]);

  // Auto-correlation terms (reference dgd against each row).
  madd_neon(&sum_h[0], dgd[0], dgds[0]);
  madd_neon(&sum_h[0], dgd[1], dgds[1]);
  madd_neon(&sum_h[1], dgd[0], dgds[2]);
  madd_neon(&sum_h[1], dgd[1], dgds[3]);
  madd_neon(&sum_h[2], dgd[0], dgds[4]);
  madd_neon(&sum_h[2], dgd[1], dgds[5]);
  madd_neon(&sum_h[3], dgd[0], dgds[6]);
  madd_neon(&sum_h[3], dgd[1], dgds[7]);
  madd_neon(&sum_h[4], dgd[0], dgds[8]);
  madd_neon(&sum_h[4], dgd[1], dgds[9]);
}
297*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulate partial sums of src against four rows of d, starting one row
// below d (at d + d_stride).
//
//   src      - two vectors of samples (16 values).
//   d        - base of the dgd rows; the first row used is d + d_stride.
//   d_stride - stride passed to the row loader.
//   sum      - four accumulators; sum[r] gains the src x row-r products.
//
// dgds[2 * r] and dgds[2 * r + 1] hold the vectors loaded for row r from
// column offsets 0 and 8 respectively.
// NOTE(review): load_s16_8x4() is defined outside this file; it presumably
// loads 8 samples from each of 4 consecutive rows d_stride apart - confirm.
static inline void stats_left_win5_neon(const int16x8_t src[2],
                                        const int16_t *d,
                                        const int32_t d_stride,
                                        int32x4_t *sum) {
  int16x8_t dgds[WIN_CHROMA];

  load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6]);
  load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7]);

  madd_neon(&sum[0], src[0], dgds[0]);
  madd_neon(&sum[0], src[1], dgds[1]);
  madd_neon(&sum[1], src[0], dgds[2]);
  madd_neon(&sum[1], src[1], dgds[3]);
  madd_neon(&sum[2], src[0], dgds[4]);
  madd_neon(&sum[2], src[1], dgds[5]);
  madd_neon(&sum[3], src[0], dgds[6]);
  madd_neon(&sum[3], src[1], dgds[7]);
}
318*77c1e3ccSAndroid Build Coastguard Worker 
derive_square_win5_neon(const int16x8_t * d_is,const int16x8_t * d_ie,const int16x8_t * d_js,const int16x8_t * d_je,int32x4_t deltas[WIENER_WIN_CHROMA-1][WIENER_WIN_CHROMA-1])319*77c1e3ccSAndroid Build Coastguard Worker static inline void derive_square_win5_neon(
320*77c1e3ccSAndroid Build Coastguard Worker     const int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
321*77c1e3ccSAndroid Build Coastguard Worker     const int16x8_t *d_je,
322*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
323*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][0], d_is[0], d_js[0]);
324*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][0], d_is[1], d_js[1]);
325*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][1], d_is[0], d_js[2]);
326*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][1], d_is[1], d_js[3]);
327*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][2], d_is[0], d_js[4]);
328*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][2], d_is[1], d_js[5]);
329*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][3], d_is[0], d_js[6]);
330*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][3], d_is[1], d_js[7]);
331*77c1e3ccSAndroid Build Coastguard Worker 
332*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][0], d_is[2], d_js[0]);
333*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][0], d_is[3], d_js[1]);
334*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][1], d_is[2], d_js[2]);
335*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][1], d_is[3], d_js[3]);
336*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][2], d_is[2], d_js[4]);
337*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][2], d_is[3], d_js[5]);
338*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][3], d_is[2], d_js[6]);
339*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][3], d_is[3], d_js[7]);
340*77c1e3ccSAndroid Build Coastguard Worker 
341*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][0], d_is[4], d_js[0]);
342*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][0], d_is[5], d_js[1]);
343*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][1], d_is[4], d_js[2]);
344*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][1], d_is[5], d_js[3]);
345*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][2], d_is[4], d_js[4]);
346*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][2], d_is[5], d_js[5]);
347*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][3], d_is[4], d_js[6]);
348*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][3], d_is[5], d_js[7]);
349*77c1e3ccSAndroid Build Coastguard Worker 
350*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][0], d_is[6], d_js[0]);
351*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][0], d_is[7], d_js[1]);
352*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][1], d_is[6], d_js[2]);
353*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][1], d_is[7], d_js[3]);
354*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][2], d_is[6], d_js[4]);
355*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][2], d_is[7], d_js[5]);
356*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][3], d_is[6], d_js[6]);
357*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][3], d_is[7], d_js[7]);
358*77c1e3ccSAndroid Build Coastguard Worker 
359*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
360*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
361*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
362*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
363*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
364*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
365*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
366*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][3], d_ie[1], d_je[7]);
367*77c1e3ccSAndroid Build Coastguard Worker 
368*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
369*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
370*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
371*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
372*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
373*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
374*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
375*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][3], d_ie[3], d_je[7]);
376*77c1e3ccSAndroid Build Coastguard Worker 
377*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
378*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
379*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
380*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
381*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
382*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
383*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
384*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][3], d_ie[5], d_je[7]);
385*77c1e3ccSAndroid Build Coastguard Worker 
386*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
387*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
388*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
389*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
390*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
391*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
392*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
393*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
394*77c1e3ccSAndroid Build Coastguard Worker }
395*77c1e3ccSAndroid Build Coastguard Worker 
// Loads the vectors needed for a win5 "square" update from two column
// positions, `di` and `dj`.
//
//   d_is / d_js: four rows beginning at `di` / `dj`.
//   d_ie / d_je: four rows beginning `height` rows further down, i.e. at
//                `di + height * d_stride` / `dj + height * d_stride`.
//
// Each output array receives eight vectors. Even slots (0, 2, 4, 6) are filled
// from element offset 0 of the four rows and odd slots (1, 3, 5, 7) from
// element offset 8. load_s16_8x4() is defined elsewhere; it is presumed to load
// eight int16 values from each of four consecutive rows spaced d_stride apart.
static inline void load_square_win5_neon(const int16_t *const di,
                                         const int16_t *const dj,
                                         const int32_t d_stride,
                                         const int32_t height, int16x8_t *d_is,
                                         int16x8_t *d_ie, int16x8_t *d_js,
                                         int16x8_t *d_je) {
  const int16_t *const di_end = di + height * d_stride;
  const int16_t *const dj_end = dj + height * d_stride;

  // Rows at the start of the region.
  load_s16_8x4(di, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);
  load_s16_8x4(dj, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);

  // Rows `height` lines below the start.
  load_s16_8x4(di_end, d_stride, &d_ie[0], &d_ie[2], &d_ie[4], &d_ie[6]);
  load_s16_8x4(di_end + 8, d_stride, &d_ie[1], &d_ie[3], &d_ie[5], &d_ie[7]);
  load_s16_8x4(dj_end, d_stride, &d_je[0], &d_je[2], &d_je[4], &d_je[6]);
  load_s16_8x4(dj_end + 8, d_stride, &d_je[1], &d_je[3], &d_je[5], &d_je[7]);
}
416*77c1e3ccSAndroid Build Coastguard Worker 
// Produces five updated statistics in dst[0..4] from src[0..4].
//
// The first four entries are delegated to update_4_stats_neon(src, delta, dst)
// (defined elsewhere; presumably dst[k] = src[k] + delta[k] for k = 0..3).
// The fifth entry is handled here in scalar code: dst[4] = src[4] + delta4.
static inline void update_5_stats_neon(const int64_t *const src,
                                       const int32x4_t delta,
                                       const int64_t delta4,
                                       int64_t *const dst) {
  update_4_stats_neon(src, delta, dst);

  const int64_t fifth = src[4] + delta4;
  dst[4] = fifth;
}
424*77c1e3ccSAndroid Build Coastguard Worker 
// Updates *sum with a signed difference of widening products:
//
//   *sum -= low4(src)  * low4(dgd)
//   *sum += high4(src) * high4(dgd)
//
// i.e. the low four int16 lanes contribute negatively and the high four lanes
// contribute positively, each product widened to 32 bits.
static inline void compute_delta_step3_two_lines(int32x4_t *sum,
                                                 const int16x8_t src,
                                                 const int16x8_t dgd) {
  int32x4_t acc = *sum;

  acc = vmlsl_s16(acc, vget_low_s16(src), vget_low_s16(dgd));
  acc = vmlal_s16(acc, vget_high_s16(src), vget_high_s16(dgd));

  *sum = acc;
}
431*77c1e3ccSAndroid Build Coastguard Worker 
// Step 3 of the win5 statistics update, processing two rows per iteration.
//
// On entry ds[0..3] must already hold the vectors for the four rows preceding
// `d`. Each iteration:
//   1. loads two new vectors into ds[4] and ds[5] from rows d and d + d_stride
//      via load_unaligned_s16_4x2(row, width) (defined elsewhere; presumably
//      it packs four values from `row` and four from `row + width`),
//   2. accumulates ds[0] x ds[0..4] and then ds[1] x ds[1..5] into
//      deltas[0..4] using compute_delta_step3_two_lines(),
//   3. slides the ds window down by two rows.
//
// The row counter is decremented by 2 and the loop only terminates when it
// reaches exactly 0, so `height` must be a positive even number.
static inline void step3_win5_neon(const int16_t *d, const int32_t d_stride,
                                   const int32_t width, const int32_t height,
                                   int16x8_t *ds, int32x4_t *deltas) {
  int32_t rows_left = height;

  do {
    ds[4] = load_unaligned_s16_4x2(d, width);
    ds[5] = load_unaligned_s16_4x2(d + d_stride, width);

    // First row of the pair: ds[0] against ds[0..4].
    for (int k = 0; k < 5; ++k) {
      compute_delta_step3_two_lines(&deltas[k], ds[0], ds[k]);
    }
    // Second row of the pair: ds[1] against ds[1..5].
    for (int k = 0; k < 5; ++k) {
      compute_delta_step3_two_lines(&deltas[k], ds[1], ds[k + 1]);
    }

    // Slide the window: drop the two oldest rows.
    for (int k = 0; k < 4; ++k) {
      ds[k] = ds[k + 2];
    }

    d += 2 * d_stride;
    rows_left -= 2;
  } while (rows_left != 0);
}
460*77c1e3ccSAndroid Build Coastguard Worker 
// Step 3 of the win5 statistics update, processing one row per iteration.
//
// On entry ds[0..7] must already hold four (left, right) vector pairs for the
// rows preceding *d. Each iteration:
//   1. loads a new pair into ds[8] (eight values at *d) and ds[9] (eight
//      values at *d + width),
//   2. calls compute_delta_step3() (defined elsewhere) with the oldest pair
//      (ds[0], ds[1]) against each of the five pairs ds[2k], ds[2k + 1];
//      the outputs are (deltas[k], deltas[k + 4]) for k = 0..3 and
//      (deltas[8], deltas[12]) for the newly loaded pair,
//   3. slides the ds window down by one pair.
//
// *d is advanced by d_stride per iteration, so on return it points `height`
// rows past its initial value. The counter is pre-decremented and tested
// against zero, so `height` must be at least 1.
static inline void step3_win5_oneline_neon(const int16_t **const d,
                                           const int32_t d_stride,
                                           const int32_t width,
                                           const int32_t height, int16x8_t *ds,
                                           int32x4_t *deltas) {
  int32_t rows_left = height;

  do {
    ds[8] = vld1q_s16(*d);
    ds[9] = vld1q_s16(*d + width);

    // Oldest pair against the four pairs already in the window.
    for (int k = 0; k < 4; ++k) {
      compute_delta_step3(&deltas[k], &deltas[k + 4], ds[0], ds[1], ds[2 * k],
                          ds[2 * k + 1]);
    }
    // Oldest pair against the freshly loaded pair.
    compute_delta_step3(&deltas[8], &deltas[12], ds[0], ds[1], ds[8], ds[9]);

    // Slide the window: drop the oldest pair.
    for (int k = 0; k < 8; ++k) {
      ds[k] = ds[k + 2];
    }

    *d += d_stride;
    --rows_left;
  } while (rows_left != 0);
}
489*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulates the ten upper-triangle deltas of a 4x4 row-against-row grid.
//
// d_is and d_ie each hold four rows as (even, odd) vector pairs: row r lives
// in slots 2 * r and 2 * r + 1. For every row pair (i, j) with i <= j, taken
// in row-major order, deltas[k] is updated as
//
//   deltas[k] -= d_is[row i] . d_is[row j]   (via msub_neon)
//   deltas[k] += d_ie[row i] . d_ie[row j]   (via madd_neon)
//
// where k runs 0..9: (0,0) (0,1) (0,2) (0,3) (1,1) (1,2) (1,3) (2,2) (2,3)
// (3,3). msub_neon()/madd_neon() are defined elsewhere and are presumed to be
// multiply-subtract / multiply-add accumulations into the int32x4_t target.
// All subtractions are issued before any addition, as in the unrolled form.
// The loop bounds are compile-time constants so the compiler can unroll them.
static inline void derive_triangle_win5_neon(const int16x8_t *d_is,
                                             const int16x8_t *d_ie,
                                             int32x4_t *deltas) {
  int k = 0;
  for (int i = 0; i < 4; ++i) {
    for (int j = i; j < 4; ++j, ++k) {
      msub_neon(&deltas[k], d_is[2 * i], d_is[2 * j]);
      msub_neon(&deltas[k], d_is[2 * i + 1], d_is[2 * j + 1]);
    }
  }

  k = 0;
  for (int i = 0; i < 4; ++i) {
    for (int j = i; j < 4; ++j, ++k) {
      madd_neon(&deltas[k], d_ie[2 * i], d_ie[2 * j]);
      madd_neon(&deltas[k], d_ie[2 * i + 1], d_ie[2 * j + 1]);
    }
  }
}
535*77c1e3ccSAndroid Build Coastguard Worker 
// Loads the vectors needed for a win5 "triangle" update from column `di`.
//
//   d_is: four rows beginning at `di`.
//   d_ie: four rows beginning `height` rows further down, i.e. at
//         `di + height * d_stride`.
//
// Each output array receives eight vectors. Even slots (0, 2, 4, 6) are filled
// from element offset 0 of the four rows and odd slots (1, 3, 5, 7) from
// element offset 8. load_s16_8x4() is defined elsewhere; it is presumed to load
// eight int16 values from each of four consecutive rows spaced d_stride apart.
static inline void load_triangle_win5_neon(const int16_t *const di,
                                           const int32_t d_stride,
                                           const int32_t height,
                                           int16x8_t *d_is, int16x8_t *d_ie) {
  const int16_t *const di_end = di + height * d_stride;

  // Rows at the start of the region.
  load_s16_8x4(di, d_stride, &d_is[0], &d_is[2], &d_is[4], &d_is[6]);
  load_s16_8x4(di + 8, d_stride, &d_is[1], &d_is[3], &d_is[5], &d_is[7]);

  // Rows `height` lines below the start.
  load_s16_8x4(di_end, d_stride, &d_ie[0], &d_ie[2], &d_ie[4], &d_ie[6]);
  load_s16_8x4(di_end + 8, d_stride, &d_ie[1], &d_ie[3], &d_ie[5], &d_ie[7]);
}
548*77c1e3ccSAndroid Build Coastguard Worker 
// Subtracts widening products of A and B from nine accumulators.
//
//   deltas[k]     -= A[0] * B[k]   for k = 0..4
//   deltas[4 + k] -= A[k] * B[0]   for k = 1..4
//
// Each "*" multiplies the low four lanes and then the high four lanes of the
// two int16x8_t operands, and both 4-lane int32 products are subtracted from
// the same accumulator. A and B are only read (elements 0..4 of each);
// deltas[0..8] are updated in place.
static inline void sub_deltas_step4(int16x8_t *A, int16x8_t *B,
                                    int32x4_t *deltas) {
  const int16x4_t a0_lo = vget_low_s16(A[0]);
  const int16x4_t a0_hi = vget_high_s16(A[0]);
  const int16x4_t b0_lo = vget_low_s16(B[0]);
  const int16x4_t b0_hi = vget_high_s16(B[0]);

  // A[0] against every B[k].
  for (int k = 0; k < 5; ++k) {
    int32x4_t acc = deltas[k];
    acc = vmlsl_s16(acc, a0_lo, vget_low_s16(B[k]));
    acc = vmlsl_s16(acc, a0_hi, vget_high_s16(B[k]));
    deltas[k] = acc;
  }

  // Remaining A[k] against B[0].
  for (int k = 1; k < 5; ++k) {
    int32x4_t acc = deltas[4 + k];
    acc = vmlsl_s16(acc, vget_low_s16(A[k]), b0_lo);
    acc = vmlsl_s16(acc, vget_high_s16(A[k]), b0_hi);
    deltas[4 + k] = acc;
  }
}
570*77c1e3ccSAndroid Build Coastguard Worker 
// Adds widening products of A and B to nine accumulators.
//
//   deltas[k]     += A[0] * B[k]   for k = 0..4
//   deltas[4 + k] += A[k] * B[0]   for k = 1..4
//
// Each "*" multiplies the low four lanes and then the high four lanes of the
// two int16x8_t operands, and both 4-lane int32 products are added to the
// same accumulator. A and B are only read (elements 0..4 of each);
// deltas[0..8] are updated in place.
static inline void add_deltas_step4(int16x8_t *A, int16x8_t *B,
                                    int32x4_t *deltas) {
  const int16x4_t a0_lo = vget_low_s16(A[0]);
  const int16x4_t a0_hi = vget_high_s16(A[0]);
  const int16x4_t b0_lo = vget_low_s16(B[0]);
  const int16x4_t b0_hi = vget_high_s16(B[0]);

  // A[0] against every B[k].
  for (int k = 0; k < 5; ++k) {
    int32x4_t acc = deltas[k];
    acc = vmlal_s16(acc, a0_lo, vget_low_s16(B[k]));
    acc = vmlal_s16(acc, a0_hi, vget_high_s16(B[k]));
    deltas[k] = acc;
  }

  // Remaining A[k] against B[0].
  for (int k = 1; k < 5; ++k) {
    int32x4_t acc = deltas[4 + k];
    acc = vmlal_s16(acc, vget_low_s16(A[k]), b0_lo);
    acc = vmlal_s16(acc, vget_high_s16(A[k]), b0_hi);
    deltas[4 + k] = acc;
  }
}
592*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulates the top-edge statistics for the 7-tap window.
//
// Seven rows are loaded starting at `d`, each as a pair of vectors (element
// offsets 0 and 8), so row r occupies dgds[2 * r] and dgds[2 * r + 1]. For
// each row r = 0..6:
//
//   sum_m[r] += src[0] . row_r[0] + src[1] . row_r[1]
//   sum_h[r] += dgd[0] . row_r[0] + dgd[1] . row_r[1]
//
// load_s16_8x7() and madd_neon() are defined elsewhere; madd_neon() is
// presumed to be a multiply-add accumulation into the int32x4_t target. All
// sum_m updates are issued before any sum_h update, as in the unrolled form.
static inline void stats_top_win7_neon(const int16x8_t src[2],
                                       const int16x8_t dgd[2],
                                       const int16_t *const d,
                                       const int32_t d_stride, int32x4_t *sum_m,
                                       int32x4_t *sum_h) {
  int16x8_t dgds[WIENER_WIN * 2];

  load_s16_8x7(d, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6], &dgds[8],
               &dgds[10], &dgds[12]);
  load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9], &dgds[11], &dgds[13]);

  // Cross terms with src; the bound 7 matches the seven rows loaded above.
  for (int r = 0; r < 7; ++r) {
    madd_neon(&sum_m[r], src[0], dgds[2 * r]);
    madd_neon(&sum_m[r], src[1], dgds[2 * r + 1]);
  }

  // Cross terms with dgd.
  for (int r = 0; r < 7; ++r) {
    madd_neon(&sum_h[r], dgd[0], dgds[2 * r]);
    madd_neon(&sum_h[r], dgd[1], dgds[2 * r + 1]);
  }
}
635*77c1e3ccSAndroid Build Coastguard Worker 
derive_square_win7_neon(const int16x8_t * d_is,const int16x8_t * d_ie,const int16x8_t * d_js,const int16x8_t * d_je,int32x4_t deltas[][WIN_7])636*77c1e3ccSAndroid Build Coastguard Worker static inline void derive_square_win7_neon(const int16x8_t *d_is,
637*77c1e3ccSAndroid Build Coastguard Worker                                            const int16x8_t *d_ie,
638*77c1e3ccSAndroid Build Coastguard Worker                                            const int16x8_t *d_js,
639*77c1e3ccSAndroid Build Coastguard Worker                                            const int16x8_t *d_je,
640*77c1e3ccSAndroid Build Coastguard Worker                                            int32x4_t deltas[][WIN_7]) {
641*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][0], d_is[0], d_js[0]);
642*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][0], d_is[1], d_js[1]);
643*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][1], d_is[0], d_js[2]);
644*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][1], d_is[1], d_js[3]);
645*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][2], d_is[0], d_js[4]);
646*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][2], d_is[1], d_js[5]);
647*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][3], d_is[0], d_js[6]);
648*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][3], d_is[1], d_js[7]);
649*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][4], d_is[0], d_js[8]);
650*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][4], d_is[1], d_js[9]);
651*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][5], d_is[0], d_js[10]);
652*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[0][5], d_is[1], d_js[11]);
653*77c1e3ccSAndroid Build Coastguard Worker 
654*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][0], d_is[2], d_js[0]);
655*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][0], d_is[3], d_js[1]);
656*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][1], d_is[2], d_js[2]);
657*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][1], d_is[3], d_js[3]);
658*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][2], d_is[2], d_js[4]);
659*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][2], d_is[3], d_js[5]);
660*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][3], d_is[2], d_js[6]);
661*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][3], d_is[3], d_js[7]);
662*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][4], d_is[2], d_js[8]);
663*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][4], d_is[3], d_js[9]);
664*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][5], d_is[2], d_js[10]);
665*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[1][5], d_is[3], d_js[11]);
666*77c1e3ccSAndroid Build Coastguard Worker 
667*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][0], d_is[4], d_js[0]);
668*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][0], d_is[5], d_js[1]);
669*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][1], d_is[4], d_js[2]);
670*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][1], d_is[5], d_js[3]);
671*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][2], d_is[4], d_js[4]);
672*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][2], d_is[5], d_js[5]);
673*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][3], d_is[4], d_js[6]);
674*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][3], d_is[5], d_js[7]);
675*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][4], d_is[4], d_js[8]);
676*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][4], d_is[5], d_js[9]);
677*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][5], d_is[4], d_js[10]);
678*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[2][5], d_is[5], d_js[11]);
679*77c1e3ccSAndroid Build Coastguard Worker 
680*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][0], d_is[6], d_js[0]);
681*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][0], d_is[7], d_js[1]);
682*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][1], d_is[6], d_js[2]);
683*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][1], d_is[7], d_js[3]);
684*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][2], d_is[6], d_js[4]);
685*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][2], d_is[7], d_js[5]);
686*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][3], d_is[6], d_js[6]);
687*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][3], d_is[7], d_js[7]);
688*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][4], d_is[6], d_js[8]);
689*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][4], d_is[7], d_js[9]);
690*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][5], d_is[6], d_js[10]);
691*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[3][5], d_is[7], d_js[11]);
692*77c1e3ccSAndroid Build Coastguard Worker 
693*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][0], d_is[8], d_js[0]);
694*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][0], d_is[9], d_js[1]);
695*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][1], d_is[8], d_js[2]);
696*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][1], d_is[9], d_js[3]);
697*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][2], d_is[8], d_js[4]);
698*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][2], d_is[9], d_js[5]);
699*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][3], d_is[8], d_js[6]);
700*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][3], d_is[9], d_js[7]);
701*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][4], d_is[8], d_js[8]);
702*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][4], d_is[9], d_js[9]);
703*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][5], d_is[8], d_js[10]);
704*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[4][5], d_is[9], d_js[11]);
705*77c1e3ccSAndroid Build Coastguard Worker 
706*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][0], d_is[10], d_js[0]);
707*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][0], d_is[11], d_js[1]);
708*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][1], d_is[10], d_js[2]);
709*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][1], d_is[11], d_js[3]);
710*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][2], d_is[10], d_js[4]);
711*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][2], d_is[11], d_js[5]);
712*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][3], d_is[10], d_js[6]);
713*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][3], d_is[11], d_js[7]);
714*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][4], d_is[10], d_js[8]);
715*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][4], d_is[11], d_js[9]);
716*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][5], d_is[10], d_js[10]);
717*77c1e3ccSAndroid Build Coastguard Worker   msub_neon(&deltas[5][5], d_is[11], d_js[11]);
718*77c1e3ccSAndroid Build Coastguard Worker 
719*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][0], d_ie[0], d_je[0]);
720*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][0], d_ie[1], d_je[1]);
721*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][1], d_ie[0], d_je[2]);
722*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][1], d_ie[1], d_je[3]);
723*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][2], d_ie[0], d_je[4]);
724*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][2], d_ie[1], d_je[5]);
725*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][3], d_ie[0], d_je[6]);
726*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][3], d_ie[1], d_je[7]);
727*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][4], d_ie[0], d_je[8]);
728*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][4], d_ie[1], d_je[9]);
729*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][5], d_ie[0], d_je[10]);
730*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[0][5], d_ie[1], d_je[11]);
731*77c1e3ccSAndroid Build Coastguard Worker 
732*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][0], d_ie[2], d_je[0]);
733*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][0], d_ie[3], d_je[1]);
734*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][1], d_ie[2], d_je[2]);
735*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][1], d_ie[3], d_je[3]);
736*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][2], d_ie[2], d_je[4]);
737*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][2], d_ie[3], d_je[5]);
738*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][3], d_ie[2], d_je[6]);
739*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][3], d_ie[3], d_je[7]);
740*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][4], d_ie[2], d_je[8]);
741*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][4], d_ie[3], d_je[9]);
742*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][5], d_ie[2], d_je[10]);
743*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[1][5], d_ie[3], d_je[11]);
744*77c1e3ccSAndroid Build Coastguard Worker 
745*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][0], d_ie[4], d_je[0]);
746*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][0], d_ie[5], d_je[1]);
747*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][1], d_ie[4], d_je[2]);
748*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][1], d_ie[5], d_je[3]);
749*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][2], d_ie[4], d_je[4]);
750*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][2], d_ie[5], d_je[5]);
751*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][3], d_ie[4], d_je[6]);
752*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][3], d_ie[5], d_je[7]);
753*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][4], d_ie[4], d_je[8]);
754*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][4], d_ie[5], d_je[9]);
755*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][5], d_ie[4], d_je[10]);
756*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[2][5], d_ie[5], d_je[11]);
757*77c1e3ccSAndroid Build Coastguard Worker 
758*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][0], d_ie[6], d_je[0]);
759*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][0], d_ie[7], d_je[1]);
760*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][1], d_ie[6], d_je[2]);
761*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][1], d_ie[7], d_je[3]);
762*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][2], d_ie[6], d_je[4]);
763*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][2], d_ie[7], d_je[5]);
764*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][3], d_ie[6], d_je[6]);
765*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][3], d_ie[7], d_je[7]);
766*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][4], d_ie[6], d_je[8]);
767*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][4], d_ie[7], d_je[9]);
768*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][5], d_ie[6], d_je[10]);
769*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[3][5], d_ie[7], d_je[11]);
770*77c1e3ccSAndroid Build Coastguard Worker 
771*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][0], d_ie[8], d_je[0]);
772*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][0], d_ie[9], d_je[1]);
773*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][1], d_ie[8], d_je[2]);
774*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][1], d_ie[9], d_je[3]);
775*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][2], d_ie[8], d_je[4]);
776*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][2], d_ie[9], d_je[5]);
777*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][3], d_ie[8], d_je[6]);
778*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][3], d_ie[9], d_je[7]);
779*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][4], d_ie[8], d_je[8]);
780*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][4], d_ie[9], d_je[9]);
781*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][5], d_ie[8], d_je[10]);
782*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[4][5], d_ie[9], d_je[11]);
783*77c1e3ccSAndroid Build Coastguard Worker 
784*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][0], d_ie[10], d_je[0]);
785*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][0], d_ie[11], d_je[1]);
786*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][1], d_ie[10], d_je[2]);
787*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][1], d_ie[11], d_je[3]);
788*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][2], d_ie[10], d_je[4]);
789*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][2], d_ie[11], d_je[5]);
790*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][3], d_ie[10], d_je[6]);
791*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][3], d_ie[11], d_je[7]);
792*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][4], d_ie[10], d_je[8]);
793*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][4], d_ie[11], d_je[9]);
794*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][5], d_ie[10], d_je[10]);
795*77c1e3ccSAndroid Build Coastguard Worker   madd_neon(&deltas[5][5], d_ie[11], d_je[11]);
796*77c1e3ccSAndroid Build Coastguard Worker }
797*77c1e3ccSAndroid Build Coastguard Worker 
// Updates eight consecutive 64-bit statistics in two halves: entries 0..3 are
// forwarded to update_4_stats_neon() together with delta0, entries 4..7
// together with delta1.
// NOTE(review): update_4_stats_neon() is defined elsewhere in this file;
// presumably it stores src[k] + delta[k] into dst[k] -- confirm there.
static inline void update_8_stats_neon(const int64_t *const src,
                                       const int32x4_t delta0,
                                       const int32x4_t delta1,
                                       int64_t *const dst) {
  // Lower half.
  update_4_stats_neon(&src[0], delta0, &dst[0]);
  // Upper half.
  update_4_stats_neon(&src[4], delta1, &dst[4]);
}
805*77c1e3ccSAndroid Build Coastguard Worker 
// Fills the four 12-entry vector arrays used by the win7 "square" update.
//
// d_is / d_js are loaded starting at di / dj, d_ie / d_je starting
// `height * d_stride` elements further on. For every array, the even entries
// come from the call at column offset 0 and the odd entries from the call at
// column offset 8.
// NOTE(review): load_s16_8x6() is defined elsewhere; presumably each call
// reads six rows of eight int16 values spaced d_stride apart -- confirm.
static inline void load_square_win7_neon(const int16_t *const di,
                                         const int16_t *const dj,
                                         const int32_t d_stride,
                                         const int32_t height, int16x8_t *d_is,
                                         int16x8_t *d_ie, int16x8_t *d_js,
                                         int16x8_t *d_je) {
  // Start of the rows located `height` rows past di / dj.
  const int16_t *const di_end = di + height * d_stride;
  const int16_t *const dj_end = dj + height * d_stride;

  // Leading rows.
  load_s16_8x6(di, d_stride, d_is + 0, d_is + 2, d_is + 4, d_is + 6, d_is + 8,
               d_is + 10);
  load_s16_8x6(di + 8, d_stride, d_is + 1, d_is + 3, d_is + 5, d_is + 7,
               d_is + 9, d_is + 11);
  load_s16_8x6(dj, d_stride, d_js + 0, d_js + 2, d_js + 4, d_js + 6, d_js + 8,
               d_js + 10);
  load_s16_8x6(dj + 8, d_stride, d_js + 1, d_js + 3, d_js + 5, d_js + 7,
               d_js + 9, d_js + 11);

  // Rows starting `height` rows further down.
  load_s16_8x6(di_end, d_stride, d_ie + 0, d_ie + 2, d_ie + 4, d_ie + 6,
               d_ie + 8, d_ie + 10);
  load_s16_8x6(di_end + 8, d_stride, d_ie + 1, d_ie + 3, d_ie + 5, d_ie + 7,
               d_ie + 9, d_ie + 11);
  load_s16_8x6(dj_end, d_stride, d_je + 0, d_je + 2, d_je + 4, d_je + 6,
               d_je + 8, d_je + 10);
  load_s16_8x6(dj_end + 8, d_stride, d_je + 1, d_je + 3, d_je + 5, d_je + 7,
               d_je + 9, d_je + 11);
}
830*77c1e3ccSAndroid Build Coastguard Worker 
// Fills the two 12-entry vector arrays used by the win7 "triangle" update.
//
// d_is is loaded starting at di, d_ie starting `height * d_stride` elements
// further on. Even entries come from the call at column offset 0, odd entries
// from the call at column offset 8.
// NOTE(review): load_s16_8x6() is defined elsewhere; presumably each call
// reads six rows of eight int16 values spaced d_stride apart -- confirm.
static inline void load_triangle_win7_neon(const int16_t *const di,
                                           const int32_t d_stride,
                                           const int32_t height,
                                           int16x8_t *d_is, int16x8_t *d_ie) {
  // Start of the rows located `height` rows past di.
  const int16_t *const di_end = di + height * d_stride;

  load_s16_8x6(di, d_stride, d_is + 0, d_is + 2, d_is + 4, d_is + 6, d_is + 8,
               d_is + 10);
  load_s16_8x6(di + 8, d_stride, d_is + 1, d_is + 3, d_is + 5, d_is + 7,
               d_is + 9, d_is + 11);

  load_s16_8x6(di_end, d_stride, d_ie + 0, d_ie + 2, d_ie + 4, d_ie + 6,
               d_ie + 8, d_ie + 10);
  load_s16_8x6(di_end + 8, d_stride, d_ie + 1, d_ie + 3, d_ie + 5, d_ie + 7,
               d_ie + 9, d_ie + 11);
}
845*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulates six sums from the two source vectors and the data starting one
// row (d_stride elements) below `d`.
//
// For k = 0..5, sum[k] is passed to madd_neon() twice: once with src[0] and
// the k-th vector loaded at column offset 0, once with src[1] and the k-th
// vector loaded at column offset 8.
// NOTE(review): madd_neon() and load_s16_8x6() are defined elsewhere;
// presumably a widening multiply-accumulate and a six-row load -- confirm.
static inline void stats_left_win7_neon(const int16x8_t src[2],
                                        const int16_t *d,
                                        const int32_t d_stride,
                                        int32x4_t *sum) {
  int16x8_t dgds[WIN_7];
  const int16_t *const below = d + d_stride;

  // Even entries: column offset 0; odd entries: column offset 8.
  load_s16_8x6(below, d_stride, dgds + 0, dgds + 2, dgds + 4, dgds + 6,
               dgds + 8, dgds + 10);
  load_s16_8x6(below + 8, d_stride, dgds + 1, dgds + 3, dgds + 5, dgds + 7,
               dgds + 9, dgds + 11);

  // NOTE(review): the trip count is a compile-time constant; confirm the
  // compiler still fully unrolls this loop.
  for (int32_t k = 0; k < 6; ++k) {
    madd_neon(sum + k, src[0], dgds[2 * k + 0]);
    madd_neon(sum + k, src[1], dgds[2 * k + 1]);
  }
}
870*77c1e3ccSAndroid Build Coastguard Worker 
// Runs `height` iterations over a sliding window kept in ds[0..13].
//
// Each iteration:
//   1. loads two vectors (at d and d + width) into ds[12] and ds[13];
//   2. calls compute_delta_step3() seven times, always with ds[0] / ds[1] as
//      the first vector pair and ds[2k] / ds[2k + 1] (k = 0..6) as the second
//      pair, each call receiving two accumulators that sit four entries apart
//      in `deltas`;
//   3. slides the window down by two entries (ds[k] = ds[k + 2], k = 0..11);
//   4. advances d by d_stride.
// The body executes at least once, so `height` must be positive.
// NOTE(review): compute_delta_step3() is defined elsewhere -- see it for the
// exact accumulation performed.
static inline void step3_win7_neon(const int16_t *d, const int32_t d_stride,
                                   const int32_t width, const int32_t height,
                                   int16x8_t *ds, int32x4_t *deltas) {
  int32_t rows_left = height;
  do {
    // Append the incoming data as the newest window entry.
    ds[12] = vld1q_s16(d);
    ds[13] = vld1q_s16(d + width);

    // The oldest window entry is paired with every entry of the window.
    const int16x8_t head0 = ds[0];
    const int16x8_t head1 = ds[1];

    compute_delta_step3(deltas + 0, deltas + 4, head0, head1, head0, head1);
    compute_delta_step3(deltas + 1, deltas + 5, head0, head1, ds[2], ds[3]);
    compute_delta_step3(deltas + 2, deltas + 6, head0, head1, ds[4], ds[5]);
    compute_delta_step3(deltas + 3, deltas + 7, head0, head1, ds[6], ds[7]);
    compute_delta_step3(deltas + 8, deltas + 12, head0, head1, ds[8], ds[9]);
    compute_delta_step3(deltas + 9, deltas + 13, head0, head1, ds[10],
                        ds[11]);
    compute_delta_step3(deltas + 10, deltas + 14, head0, head1, ds[12],
                        ds[13]);

    // Drop the oldest entry. Ascending order means each source is read before
    // it is overwritten.
    // NOTE(review): constant trip count; confirm the compiler unrolls this.
    for (int32_t k = 0; k < 12; ++k) {
      ds[k] = ds[k + 2];
    }

    d += d_stride;
  } while (--rows_left != 0);
}
903*77c1e3ccSAndroid Build Coastguard Worker 
// Updates the 21 accumulators deltas[0..20], which are laid out as the upper
// triangle (diagonal included) of a 6x6 grid in row-major order: row i
// contributes the entries for columns j = i..5.
//
// For the entry at (i, j) the vector pairs d_is[2i], d_is[2j] and
// d_is[2i + 1], d_is[2j + 1] are passed to msub_neon(); afterwards the same
// pairs taken from d_ie are passed to madd_neon(). All msub_neon() calls are
// issued before the first madd_neon() call.
// NOTE(review): msub_neon() / madd_neon() are defined elsewhere; presumably
// widening multiply-subtract / multiply-accumulate -- confirm.
// NOTE(review): trip counts are compile-time constants; confirm the compiler
// still fully unrolls both loop nests.
static inline void derive_triangle_win7_neon(const int16x8_t *d_is,
                                             const int16x8_t *d_ie,
                                             int32x4_t *deltas) {
  int32_t idx = 0;

  // First pass: contributions taken from d_is.
  for (int32_t i = 0; i < 6; ++i) {
    for (int32_t j = i; j < 6; ++j, ++idx) {
      msub_neon(&deltas[idx], d_is[2 * i + 0], d_is[2 * j + 0]);
      msub_neon(&deltas[idx], d_is[2 * i + 1], d_is[2 * j + 1]);
    }
  }

  // Second pass: contributions taken from d_ie, same triangular walk.
  idx = 0;
  for (int32_t i = 0; i < 6; ++i) {
    for (int32_t j = i; j < 6; ++j, ++idx) {
      madd_neon(&deltas[idx], d_ie[2 * i + 0], d_ie[2 * j + 0]);
      madd_neon(&deltas[idx], d_ie[2 * i + 1], d_ie[2 * j + 1]);
    }
  }
}
1003*77c1e3ccSAndroid Build Coastguard Worker 
diagonal_copy_stats_neon(const int32_t wiener_win2,int64_t * const H)1004*77c1e3ccSAndroid Build Coastguard Worker static inline void diagonal_copy_stats_neon(const int32_t wiener_win2,
1005*77c1e3ccSAndroid Build Coastguard Worker                                             int64_t *const H) {
1006*77c1e3ccSAndroid Build Coastguard Worker   for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
1007*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t in[8], out[8];
1008*77c1e3ccSAndroid Build Coastguard Worker 
1009*77c1e3ccSAndroid Build Coastguard Worker     in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 1);
1010*77c1e3ccSAndroid Build Coastguard Worker     in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + i + 3);
1011*77c1e3ccSAndroid Build Coastguard Worker     in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1);
1012*77c1e3ccSAndroid Build Coastguard Worker     in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + i + 3);
1013*77c1e3ccSAndroid Build Coastguard Worker     in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 1);
1014*77c1e3ccSAndroid Build Coastguard Worker     in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + i + 3);
1015*77c1e3ccSAndroid Build Coastguard Worker     in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 1);
1016*77c1e3ccSAndroid Build Coastguard Worker     in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + i + 3);
1017*77c1e3ccSAndroid Build Coastguard Worker 
1018*77c1e3ccSAndroid Build Coastguard Worker     transpose_arrays_s64_4x4(in, out);
1019*77c1e3ccSAndroid Build Coastguard Worker 
1020*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(out[0]));
1021*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 2) * wiener_win2 + i, out[2]);
1022*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 3) * wiener_win2 + i, out[4]);
1023*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
1024*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 4) * wiener_win2 + i, out[6]);
1025*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);
1026*77c1e3ccSAndroid Build Coastguard Worker 
1027*77c1e3ccSAndroid Build Coastguard Worker     for (int32_t j = i + 5; j < wiener_win2; j += 4) {
1028*77c1e3ccSAndroid Build Coastguard Worker       in[0] = vld1q_s64(H + (i + 0) * wiener_win2 + j);
1029*77c1e3ccSAndroid Build Coastguard Worker       in[1] = vld1q_s64(H + (i + 0) * wiener_win2 + j + 2);
1030*77c1e3ccSAndroid Build Coastguard Worker       in[2] = vld1q_s64(H + (i + 1) * wiener_win2 + j);
1031*77c1e3ccSAndroid Build Coastguard Worker       in[3] = vld1q_s64(H + (i + 1) * wiener_win2 + j + 2);
1032*77c1e3ccSAndroid Build Coastguard Worker       in[4] = vld1q_s64(H + (i + 2) * wiener_win2 + j);
1033*77c1e3ccSAndroid Build Coastguard Worker       in[5] = vld1q_s64(H + (i + 2) * wiener_win2 + j + 2);
1034*77c1e3ccSAndroid Build Coastguard Worker       in[6] = vld1q_s64(H + (i + 3) * wiener_win2 + j);
1035*77c1e3ccSAndroid Build Coastguard Worker       in[7] = vld1q_s64(H + (i + 3) * wiener_win2 + j + 2);
1036*77c1e3ccSAndroid Build Coastguard Worker 
1037*77c1e3ccSAndroid Build Coastguard Worker       transpose_arrays_s64_4x4(in, out);
1038*77c1e3ccSAndroid Build Coastguard Worker 
1039*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 0) * wiener_win2 + i, out[0]);
1040*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
1041*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 1) * wiener_win2 + i, out[2]);
1042*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
1043*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 2) * wiener_win2 + i, out[4]);
1044*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
1045*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 3) * wiener_win2 + i, out[6]);
1046*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
1047*77c1e3ccSAndroid Build Coastguard Worker     }
1048*77c1e3ccSAndroid Build Coastguard Worker   }
1049*77c1e3ccSAndroid Build Coastguard Worker }
1050*77c1e3ccSAndroid Build Coastguard Worker 
// Divides both 64-bit lanes of `src` by 4, truncating toward zero: the
// magnitude of each lane is shifted right by two bits and the original sign is
// then re-applied. A plain arithmetic shift would round negative values toward
// minus infinity instead.
// INT64_MIN gets no special handling (its magnitude is not representable).
static inline int64x2_t div4_neon(const int64x2_t src) {
#if AOM_ARCH_AARCH64
  // All-ones in every lane that holds a negative value.
  const uint64x2_t is_negative = vcltzq_s64(src);
  // |src| / 4
  const int64x2_t quotient = vshrq_n_s64(vabsq_s64(src), 2);
  // Pick the negated quotient for lanes that were negative.
  return vbslq_s64(is_negative, vnegq_s64(quotient), quotient);
#else
  // Arithmetic shift replicates the sign bit: -1 for negative lanes, else 0.
  const int64x2_t sign_mask = vshrq_n_s64(src, 63);
  // (x ^ mask) - mask negates x where mask is -1 and leaves it alone where 0.
  const int64x2_t magnitude = vsubq_s64(veorq_s64(src, sign_mask), sign_mask);
  // |src| / 4
  const int64x2_t quotient = vshrq_n_s64(magnitude, 2);
  // Same trick again to restore the sign.
  return vsubq_s64(veorq_s64(quotient, sign_mask), sign_mask);
#endif  // AOM_ARCH_AARCH64
}
1068*77c1e3ccSAndroid Build Coastguard Worker 
// Divides by 4, in place, the 4x4 tile of int64 values whose top-left element
// is `H[0]` and whose rows are `wiener_win2` elements apart. The divided tile
// is also left in `out`: out[2 * r] holds columns 0-1 of tile row r and
// out[2 * r + 1] holds columns 2-3.
static inline void div4_4x4_neon(const int32_t wiener_win2, int64_t *const H,
                                 int64x2_t out[8]) {
  // Read the whole tile before anything is written back.
  for (int32_t r = 0; r < 4; ++r) {
    out[2 * r + 0] = vld1q_s64(H + r * wiener_win2 + 0);
    out[2 * r + 1] = vld1q_s64(H + r * wiener_win2 + 2);
  }

  for (int32_t k = 0; k < 8; ++k) {
    out[k] = div4_neon(out[k]);
  }

  // Write the divided values over the original tile.
  for (int32_t r = 0; r < 4; ++r) {
    vst1q_s64(H + r * wiener_win2 + 0, out[2 * r + 0]);
    vst1q_s64(H + r * wiener_win2 + 2, out[2 * r + 1]);
  }
}
1098*77c1e3ccSAndroid Build Coastguard Worker 
// Divides each signed 64-bit lane of `src` by 16. The shift is applied to the
// magnitude and the sign is restored afterwards, so the quotient is truncated
// toward zero rather than rounded toward negative infinity.
static inline int64x2_t div16_neon(const int64x2_t src) {
#if AOM_ARCH_AARCH64
  // All-ones in every lane that holds a negative value, zero elsewhere.
  const uint64x2_t is_negative = vcltzq_s64(src);
  // |src| / 16
  const int64x2_t quotient = vshrq_n_s64(vabsq_s64(src), 4);
  // Pick the negated quotient for the lanes that were negative.
  return vbslq_s64(is_negative, vnegq_s64(quotient), quotient);
#else
  // Arithmetic shift by 63 smears the sign bit: -1 for negative lanes, else 0.
  const int64x2_t sign_fill = vshrq_n_s64(src, 63);
  // (x ^ s) - s negates x when s == -1 and leaves it unchanged when s == 0.
  const int64x2_t magnitude = vsubq_s64(veorq_s64(src, sign_fill), sign_fill);
  // |src| / 16
  const int64x2_t quotient = vshrq_n_s64(magnitude, 4);
  // Same conditional negate, this time putting the sign back.
  return vsubq_s64(veorq_s64(quotient, sign_fill), sign_fill);
#endif  // AOM_ARCH_AARCH64
}
1116*77c1e3ccSAndroid Build Coastguard Worker 
// Divides by 16, in place, the 4x4 tile of int64 values whose top-left element
// is `H[0]` and whose rows are `wiener_win2` elements apart. The divided tile
// is also left in `out`: out[2 * r] holds columns 0-1 of tile row r and
// out[2 * r + 1] holds columns 2-3.
static inline void div16_4x4_neon(const int32_t wiener_win2, int64_t *const H,
                                  int64x2_t out[8]) {
  // Read the whole tile before anything is written back.
  for (int32_t r = 0; r < 4; ++r) {
    out[2 * r + 0] = vld1q_s64(H + r * wiener_win2 + 0);
    out[2 * r + 1] = vld1q_s64(H + r * wiener_win2 + 2);
  }

  for (int32_t k = 0; k < 8; ++k) {
    out[k] = div16_neon(out[k]);
  }

  // Write the divided values over the original tile.
  for (int32_t r = 0; r < 4; ++r) {
    vst1q_s64(H + r * wiener_win2 + 0, out[2 * r + 0]);
    vst1q_s64(H + r * wiener_win2 + 2, out[2 * r + 1]);
  }
}
1146*77c1e3ccSAndroid Build Coastguard Worker 
div4_diagonal_copy_stats_neon(const int32_t wiener_win2,int64_t * const H)1147*77c1e3ccSAndroid Build Coastguard Worker static inline void div4_diagonal_copy_stats_neon(const int32_t wiener_win2,
1148*77c1e3ccSAndroid Build Coastguard Worker                                                  int64_t *const H) {
1149*77c1e3ccSAndroid Build Coastguard Worker   for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
1150*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t in[8], out[8];
1151*77c1e3ccSAndroid Build Coastguard Worker 
1152*77c1e3ccSAndroid Build Coastguard Worker     div4_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
1153*77c1e3ccSAndroid Build Coastguard Worker     transpose_arrays_s64_4x4(in, out);
1154*77c1e3ccSAndroid Build Coastguard Worker 
1155*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
1156*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
1157*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
1158*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
1159*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
1160*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);
1161*77c1e3ccSAndroid Build Coastguard Worker 
1162*77c1e3ccSAndroid Build Coastguard Worker     for (int32_t j = i + 5; j < wiener_win2; j += 4) {
1163*77c1e3ccSAndroid Build Coastguard Worker       div4_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
1164*77c1e3ccSAndroid Build Coastguard Worker       transpose_arrays_s64_4x4(in, out);
1165*77c1e3ccSAndroid Build Coastguard Worker 
1166*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
1167*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
1168*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
1169*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
1170*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
1171*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
1172*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
1173*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
1174*77c1e3ccSAndroid Build Coastguard Worker     }
1175*77c1e3ccSAndroid Build Coastguard Worker   }
1176*77c1e3ccSAndroid Build Coastguard Worker }
1177*77c1e3ccSAndroid Build Coastguard Worker 
div16_diagonal_copy_stats_neon(const int32_t wiener_win2,int64_t * const H)1178*77c1e3ccSAndroid Build Coastguard Worker static inline void div16_diagonal_copy_stats_neon(const int32_t wiener_win2,
1179*77c1e3ccSAndroid Build Coastguard Worker                                                   int64_t *const H) {
1180*77c1e3ccSAndroid Build Coastguard Worker   for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
1181*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t in[8], out[8];
1182*77c1e3ccSAndroid Build Coastguard Worker 
1183*77c1e3ccSAndroid Build Coastguard Worker     div16_4x4_neon(wiener_win2, H + i * wiener_win2 + i + 1, in);
1184*77c1e3ccSAndroid Build Coastguard Worker     transpose_arrays_s64_4x4(in, out);
1185*77c1e3ccSAndroid Build Coastguard Worker 
1186*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(H + (i + 1) * wiener_win2 + i + 0, vget_low_s64(out[0]));
1187*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 2) * wiener_win2 + i + 0, out[2]);
1188*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 3) * wiener_win2 + i + 0, out[4]);
1189*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 3) * wiener_win2 + i + 2, out[5]);
1190*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 4) * wiener_win2 + i + 0, out[6]);
1191*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i + 4) * wiener_win2 + i + 2, out[7]);
1192*77c1e3ccSAndroid Build Coastguard Worker 
1193*77c1e3ccSAndroid Build Coastguard Worker     for (int32_t j = i + 5; j < wiener_win2; j += 4) {
1194*77c1e3ccSAndroid Build Coastguard Worker       div16_4x4_neon(wiener_win2, H + i * wiener_win2 + j, in);
1195*77c1e3ccSAndroid Build Coastguard Worker       transpose_arrays_s64_4x4(in, out);
1196*77c1e3ccSAndroid Build Coastguard Worker 
1197*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 0) * wiener_win2 + i + 0, out[0]);
1198*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 0) * wiener_win2 + i + 2, out[1]);
1199*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 1) * wiener_win2 + i + 0, out[2]);
1200*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 1) * wiener_win2 + i + 2, out[3]);
1201*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 2) * wiener_win2 + i + 0, out[4]);
1202*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 2) * wiener_win2 + i + 2, out[5]);
1203*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 3) * wiener_win2 + i + 0, out[6]);
1204*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s64(H + (j + 3) * wiener_win2 + i + 2, out[7]);
1205*77c1e3ccSAndroid Build Coastguard Worker     }
1206*77c1e3ccSAndroid Build Coastguard Worker   }
1207*77c1e3ccSAndroid Build Coastguard Worker }
1208*77c1e3ccSAndroid Build Coastguard Worker 
1209*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_AV1_ENCODER_ARM_PICKRST_NEON_H_
1210