1*abb65b4bSAndroid Build Coastguard Worker /*
2*abb65b4bSAndroid Build Coastguard Worker * Copyright (c) 2022 Samsung Electronics Co., Ltd.
3*abb65b4bSAndroid Build Coastguard Worker * All Rights Reserved.
4*abb65b4bSAndroid Build Coastguard Worker *
5*abb65b4bSAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
6*abb65b4bSAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
7*abb65b4bSAndroid Build Coastguard Worker *
8*abb65b4bSAndroid Build Coastguard Worker * - Redistributions of source code must retain the above copyright notice,
9*abb65b4bSAndroid Build Coastguard Worker * this list of conditions and the following disclaimer.
10*abb65b4bSAndroid Build Coastguard Worker *
11*abb65b4bSAndroid Build Coastguard Worker * - Redistributions in binary form must reproduce the above copyright notice,
12*abb65b4bSAndroid Build Coastguard Worker * this list of conditions and the following disclaimer in the documentation
13*abb65b4bSAndroid Build Coastguard Worker * and/or other materials provided with the distribution.
14*abb65b4bSAndroid Build Coastguard Worker *
15*abb65b4bSAndroid Build Coastguard Worker * - Neither the name of the copyright owner, nor the names of its contributors
16*abb65b4bSAndroid Build Coastguard Worker * may be used to endorse or promote products derived from this software
17*abb65b4bSAndroid Build Coastguard Worker * without specific prior written permission.
18*abb65b4bSAndroid Build Coastguard Worker *
19*abb65b4bSAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20*abb65b4bSAndroid Build Coastguard Worker * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21*abb65b4bSAndroid Build Coastguard Worker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22*abb65b4bSAndroid Build Coastguard Worker * ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23*abb65b4bSAndroid Build Coastguard Worker * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24*abb65b4bSAndroid Build Coastguard Worker * CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25*abb65b4bSAndroid Build Coastguard Worker * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26*abb65b4bSAndroid Build Coastguard Worker * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27*abb65b4bSAndroid Build Coastguard Worker * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28*abb65b4bSAndroid Build Coastguard Worker * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29*abb65b4bSAndroid Build Coastguard Worker * POSSIBILITY OF SUCH DAMAGE.
30*abb65b4bSAndroid Build Coastguard Worker */
31*abb65b4bSAndroid Build Coastguard Worker
32*abb65b4bSAndroid Build Coastguard Worker #include "oapv_def.h"
33*abb65b4bSAndroid Build Coastguard Worker #include <math.h>
34*abb65b4bSAndroid Build Coastguard Worker
35*abb65b4bSAndroid Build Coastguard Worker #if ARM_NEON
36*abb65b4bSAndroid Build Coastguard Worker
37*abb65b4bSAndroid Build Coastguard Worker /* SSD ***********************************************************************/
ssd_16b_neon_8x8(int w,int h,void * src1,void * src2,int s_src1,int s_src2,int bit_depth)38*abb65b4bSAndroid Build Coastguard Worker static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth)
39*abb65b4bSAndroid Build Coastguard Worker {
40*abb65b4bSAndroid Build Coastguard Worker s64 ssd = 0;
41*abb65b4bSAndroid Build Coastguard Worker s16* s1 = (s16*) src1;
42*abb65b4bSAndroid Build Coastguard Worker s16* s2 = (s16*) src2;
43*abb65b4bSAndroid Build Coastguard Worker s16 i;
44*abb65b4bSAndroid Build Coastguard Worker int16x8_t s1_vector, s2_vector;
45*abb65b4bSAndroid Build Coastguard Worker int32x4_t diff1, diff2;
46*abb65b4bSAndroid Build Coastguard Worker int32x2_t diff1_low, diff2_low;
47*abb65b4bSAndroid Build Coastguard Worker int64x2_t sq_diff1_low, sq_diff1_high, sq_diff2_low, sq_diff2_high, sq_diff;
48*abb65b4bSAndroid Build Coastguard Worker
49*abb65b4bSAndroid Build Coastguard Worker {
50*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
51*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
52*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
53*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
54*abb65b4bSAndroid Build Coastguard Worker
55*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
56*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
57*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
58*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
59*abb65b4bSAndroid Build Coastguard Worker
60*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
61*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
62*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
63*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
64*abb65b4bSAndroid Build Coastguard Worker
65*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff1_low, sq_diff1_high);
66*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
67*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
68*abb65b4bSAndroid Build Coastguard Worker }
69*abb65b4bSAndroid Build Coastguard Worker {
70*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
71*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
72*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
73*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
74*abb65b4bSAndroid Build Coastguard Worker
75*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
76*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
77*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
78*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
79*abb65b4bSAndroid Build Coastguard Worker
80*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
81*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
82*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
83*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
84*abb65b4bSAndroid Build Coastguard Worker
85*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
86*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
87*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
88*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
89*abb65b4bSAndroid Build Coastguard Worker }
90*abb65b4bSAndroid Build Coastguard Worker {
91*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
92*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
93*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
94*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
95*abb65b4bSAndroid Build Coastguard Worker
96*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
97*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
98*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
99*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
100*abb65b4bSAndroid Build Coastguard Worker
101*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
102*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
103*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
104*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
105*abb65b4bSAndroid Build Coastguard Worker
106*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
107*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
108*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
109*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
110*abb65b4bSAndroid Build Coastguard Worker }
111*abb65b4bSAndroid Build Coastguard Worker {
112*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
113*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
114*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
115*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
116*abb65b4bSAndroid Build Coastguard Worker
117*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
118*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
119*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
120*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
121*abb65b4bSAndroid Build Coastguard Worker
122*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
123*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
124*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
125*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
126*abb65b4bSAndroid Build Coastguard Worker
127*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
128*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
129*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
130*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
131*abb65b4bSAndroid Build Coastguard Worker }
132*abb65b4bSAndroid Build Coastguard Worker {
133*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
134*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
135*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
136*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
137*abb65b4bSAndroid Build Coastguard Worker
138*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
139*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
140*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
141*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
142*abb65b4bSAndroid Build Coastguard Worker
143*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
144*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
145*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
146*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
147*abb65b4bSAndroid Build Coastguard Worker
148*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
149*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
150*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
151*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
152*abb65b4bSAndroid Build Coastguard Worker }
153*abb65b4bSAndroid Build Coastguard Worker {
154*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
155*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
156*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
157*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
158*abb65b4bSAndroid Build Coastguard Worker
159*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
160*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
161*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
162*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
163*abb65b4bSAndroid Build Coastguard Worker
164*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
165*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
166*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
167*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
168*abb65b4bSAndroid Build Coastguard Worker
169*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
170*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
171*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
172*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
173*abb65b4bSAndroid Build Coastguard Worker }
174*abb65b4bSAndroid Build Coastguard Worker {
175*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
176*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
177*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
178*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
179*abb65b4bSAndroid Build Coastguard Worker
180*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
181*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
182*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
183*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
184*abb65b4bSAndroid Build Coastguard Worker
185*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
186*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
187*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
188*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
189*abb65b4bSAndroid Build Coastguard Worker
190*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
191*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
192*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
193*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
194*abb65b4bSAndroid Build Coastguard Worker }
195*abb65b4bSAndroid Build Coastguard Worker {
196*abb65b4bSAndroid Build Coastguard Worker s1_vector = vld1q_s16(s1);
197*abb65b4bSAndroid Build Coastguard Worker s1 += s_src1;
198*abb65b4bSAndroid Build Coastguard Worker s2_vector = vld1q_s16(s2);
199*abb65b4bSAndroid Build Coastguard Worker s2 += s_src2;
200*abb65b4bSAndroid Build Coastguard Worker
201*abb65b4bSAndroid Build Coastguard Worker diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
202*abb65b4bSAndroid Build Coastguard Worker diff2 = vsubl_high_s16(s1_vector, s2_vector);
203*abb65b4bSAndroid Build Coastguard Worker diff1_low = vget_low_s32(diff1);
204*abb65b4bSAndroid Build Coastguard Worker diff2_low = vget_low_s32(diff2);
205*abb65b4bSAndroid Build Coastguard Worker
206*abb65b4bSAndroid Build Coastguard Worker sq_diff1_low = vmull_s32(diff1_low, diff1_low);
207*abb65b4bSAndroid Build Coastguard Worker sq_diff1_high = vmull_high_s32(diff1, diff1);
208*abb65b4bSAndroid Build Coastguard Worker sq_diff2_low = vmull_s32(diff2_low, diff2_low);
209*abb65b4bSAndroid Build Coastguard Worker sq_diff2_high = vmull_high_s32(diff2, diff2);
210*abb65b4bSAndroid Build Coastguard Worker
211*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
212*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
213*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
214*abb65b4bSAndroid Build Coastguard Worker sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
215*abb65b4bSAndroid Build Coastguard Worker }
216*abb65b4bSAndroid Build Coastguard Worker ssd += vaddvq_s64(sq_diff);
217*abb65b4bSAndroid Build Coastguard Worker return ssd;
218*abb65b4bSAndroid Build Coastguard Worker }
219*abb65b4bSAndroid Build Coastguard Worker
220*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_neon[2] =
221*abb65b4bSAndroid Build Coastguard Worker {
222*abb65b4bSAndroid Build Coastguard Worker ssd_16b_neon_8x8,
223*abb65b4bSAndroid Build Coastguard Worker NULL};
224*abb65b4bSAndroid Build Coastguard Worker
225*abb65b4bSAndroid Build Coastguard Worker
oapv_dc_removed_had8x8_neon(pel * org,int s_org)226*abb65b4bSAndroid Build Coastguard Worker int oapv_dc_removed_had8x8_neon(pel* org, int s_org)
227*abb65b4bSAndroid Build Coastguard Worker {
228*abb65b4bSAndroid Build Coastguard Worker int satd = 0;
229*abb65b4bSAndroid Build Coastguard Worker /* all 128 bit registers are named with a suffix mxnb, where m is the */
230*abb65b4bSAndroid Build Coastguard Worker /* number of n bits packed in the register */
231*abb65b4bSAndroid Build Coastguard Worker
232*abb65b4bSAndroid Build Coastguard Worker int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
233*abb65b4bSAndroid Build Coastguard Worker int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
234*abb65b4bSAndroid Build Coastguard Worker int16x8_t pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
235*abb65b4bSAndroid Build Coastguard Worker int16x8_t pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
236*abb65b4bSAndroid Build Coastguard Worker int16x8_t out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
237*abb65b4bSAndroid Build Coastguard Worker int16x8_t out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;
238*abb65b4bSAndroid Build Coastguard Worker int16x8x2_t out0_8x16bx2, out1_8x16bx2, out2_8x16bx2, out3_8x16bx2;
239*abb65b4bSAndroid Build Coastguard Worker
240*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = (vld1q_s16(&org[0]));
241*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
242*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = (vld1q_s16(&org[0]));
243*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
244*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = (vld1q_s16(&org[0]));
245*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
246*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = (vld1q_s16(&org[0]));
247*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
248*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = (vld1q_s16(&org[0]));
249*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
250*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = (vld1q_s16(&org[0]));
251*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
252*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = (vld1q_s16(&org[0]));
253*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
254*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = (vld1q_s16(&org[0]));
255*abb65b4bSAndroid Build Coastguard Worker org = org + s_org;
256*abb65b4bSAndroid Build Coastguard Worker
257*abb65b4bSAndroid Build Coastguard Worker /**************** 8x8 horizontal transform *******************************/
258*abb65b4bSAndroid Build Coastguard Worker /*********************** 8x8 16 bit Transpose ************************/
259*abb65b4bSAndroid Build Coastguard Worker
260*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = vcombine_s16(vget_low_s16(src0_8x16b), vget_low_s16(src1_8x16b));
261*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vget_high_s16(src1_8x16b));
262*abb65b4bSAndroid Build Coastguard Worker
263*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vcombine_s16(vget_low_s16(src2_8x16b), vget_low_s16(src3_8x16b));
264*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vget_high_s16(src3_8x16b));
265*abb65b4bSAndroid Build Coastguard Worker
266*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = vcombine_s16(vget_low_s16(src4_8x16b), vget_low_s16(src5_8x16b));
267*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vget_high_s16(src5_8x16b));
268*abb65b4bSAndroid Build Coastguard Worker
269*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = vcombine_s16(vget_low_s16(src6_8x16b), vget_low_s16(src7_8x16b));
270*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vget_high_s16(src7_8x16b));
271*abb65b4bSAndroid Build Coastguard Worker
272*abb65b4bSAndroid Build Coastguard Worker
273*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
274*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);
275*abb65b4bSAndroid Build Coastguard Worker
276*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
277*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);
278*abb65b4bSAndroid Build Coastguard Worker
279*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
280*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);
281*abb65b4bSAndroid Build Coastguard Worker
282*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
283*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);
284*abb65b4bSAndroid Build Coastguard Worker
285*abb65b4bSAndroid Build Coastguard Worker out0_8x16b = vzip1q_s64(out1_8x16b,pred1_8x16b);
286*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = vzip2q_s64(out1_8x16b,pred1_8x16b);
287*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = vzip1q_s64(out3_8x16b,pred3_8x16b);
288*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = vzip2q_s64(out3_8x16b,pred3_8x16b);
289*abb65b4bSAndroid Build Coastguard Worker out4_8x16b = vzip1q_s64(out5_8x16b,pred5_8x16b);
290*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = vzip2q_s64(out5_8x16b,pred5_8x16b);
291*abb65b4bSAndroid Build Coastguard Worker out6_8x16b = vzip1q_s64(out7_8x16b,pred7_8x16b);
292*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = vzip2q_s64(out7_8x16b,pred7_8x16b);
293*abb65b4bSAndroid Build Coastguard Worker
294*abb65b4bSAndroid Build Coastguard Worker /********************** 8x8 16 bit Transpose End *********************/
295*abb65b4bSAndroid Build Coastguard Worker
296*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 */
297*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vaddq_s16(out0_8x16b, out1_8x16b);
298*abb65b4bSAndroid Build Coastguard Worker /* r2 + r3 */
299*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = vaddq_s16(out2_8x16b, out3_8x16b);
300*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 */
301*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = vaddq_s16(out4_8x16b, out5_8x16b);
302*abb65b4bSAndroid Build Coastguard Worker /* r6 + r7 */
303*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = vaddq_s16(out6_8x16b, out7_8x16b);
304*abb65b4bSAndroid Build Coastguard Worker
305*abb65b4bSAndroid Build Coastguard Worker
306*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 */
307*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
308*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 + r6 + r7 */
309*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
310*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
311*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
312*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
313*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);
314*abb65b4bSAndroid Build Coastguard Worker
315*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 */
316*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
317*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 - r6 - r7 */
318*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
319*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
320*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
321*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
322*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);
323*abb65b4bSAndroid Build Coastguard Worker
324*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 */
325*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vsubq_s16(out0_8x16b, out1_8x16b);
326*abb65b4bSAndroid Build Coastguard Worker /* r2 - r3 */
327*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = vsubq_s16(out2_8x16b, out3_8x16b);
328*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 */
329*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = vsubq_s16(out4_8x16b, out5_8x16b);
330*abb65b4bSAndroid Build Coastguard Worker /* r6 - r7 */
331*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = vsubq_s16(out6_8x16b, out7_8x16b);
332*abb65b4bSAndroid Build Coastguard Worker
333*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 */
334*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
335*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 + r6 - r7 */
336*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
337*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
338*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
339*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
340*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);
341*abb65b4bSAndroid Build Coastguard Worker
342*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 */
343*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
344*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 - r6 + r7 */
345*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
346*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
347*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
348*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
349*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);
350*abb65b4bSAndroid Build Coastguard Worker
351*abb65b4bSAndroid Build Coastguard Worker
352*abb65b4bSAndroid Build Coastguard Worker /*********************** 8x8 16 bit Transpose ************************/
353*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = vzip1q_s16(src0_8x16b, src1_8x16b);
354*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vzip1q_s16(src2_8x16b, src3_8x16b);
355*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = vzip1q_s16(src4_8x16b, src5_8x16b);
356*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = vzip1q_s16(src6_8x16b, src7_8x16b);
357*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = vzip2q_s16(src0_8x16b, src1_8x16b);
358*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vzip2q_s16(src2_8x16b, src3_8x16b);
359*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = vzip2q_s16(src4_8x16b, src5_8x16b);
360*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vzip2q_s16(src6_8x16b, src7_8x16b);
361*abb65b4bSAndroid Build Coastguard Worker
362*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
363*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);
364*abb65b4bSAndroid Build Coastguard Worker
365*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
366*abb65b4bSAndroid Build Coastguard Worker pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);
367*abb65b4bSAndroid Build Coastguard Worker
368*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
369*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);
370*abb65b4bSAndroid Build Coastguard Worker
371*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
372*abb65b4bSAndroid Build Coastguard Worker pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);
373*abb65b4bSAndroid Build Coastguard Worker
374*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = vzip1q_s64(out1_8x16b,pred1_8x16b);
375*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = vzip2q_s64(out1_8x16b,pred1_8x16b);
376*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vzip1q_s64(out3_8x16b,pred3_8x16b);
377*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = vzip2q_s64(out3_8x16b,pred3_8x16b);
378*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = vzip1q_s64(out5_8x16b,pred5_8x16b);
379*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = vzip2q_s64(out5_8x16b,pred5_8x16b);
380*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vzip1q_s64(out7_8x16b,pred7_8x16b);
381*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = vzip2q_s64(out7_8x16b,pred7_8x16b);
382*abb65b4bSAndroid Build Coastguard Worker
383*abb65b4bSAndroid Build Coastguard Worker /********************** 8x8 16 bit Transpose End *********************/
384*abb65b4bSAndroid Build Coastguard Worker /**************** 8x8 horizontal transform *******************************/
385*abb65b4bSAndroid Build Coastguard Worker {
386*abb65b4bSAndroid Build Coastguard Worker int16x8_t out0a_8x16b, out1a_8x16b, out2a_8x16b, out3a_8x16b;
387*abb65b4bSAndroid Build Coastguard Worker int16x8_t out4a_8x16b, out5a_8x16b, out6a_8x16b, out7a_8x16b;
388*abb65b4bSAndroid Build Coastguard Worker int16x8_t tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
389*abb65b4bSAndroid Build Coastguard Worker int16x8_t tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;
390*abb65b4bSAndroid Build Coastguard Worker
391*abb65b4bSAndroid Build Coastguard Worker /************************* 8x8 Vertical Transform*************************/
392*abb65b4bSAndroid Build Coastguard Worker tmp0_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vcreate_s32(0));
393*abb65b4bSAndroid Build Coastguard Worker tmp1_8x16b = vcombine_s16(vget_high_s16(src1_8x16b), vcreate_s32(0));
394*abb65b4bSAndroid Build Coastguard Worker tmp2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vcreate_s32(0));
395*abb65b4bSAndroid Build Coastguard Worker tmp3_8x16b = vcombine_s16(vget_high_s16(src3_8x16b), vcreate_s32(0));
396*abb65b4bSAndroid Build Coastguard Worker tmp4_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vcreate_s32(0));
397*abb65b4bSAndroid Build Coastguard Worker tmp5_8x16b = vcombine_s16(vget_high_s16(src5_8x16b), vcreate_s32(0));
398*abb65b4bSAndroid Build Coastguard Worker tmp6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vcreate_s32(0));
399*abb65b4bSAndroid Build Coastguard Worker tmp7_8x16b = vcombine_s16(vget_high_s16(src7_8x16b), vcreate_s32(0));
400*abb65b4bSAndroid Build Coastguard Worker
401*abb65b4bSAndroid Build Coastguard Worker /*************************First 4 pixels ********************************/
402*abb65b4bSAndroid Build Coastguard Worker
403*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = vmovl_s16(vget_low_s16(src0_8x16b));
404*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = vmovl_s16(vget_low_s16(src1_8x16b));
405*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vmovl_s16(vget_low_s16(src2_8x16b));
406*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = vmovl_s16(vget_low_s16(src3_8x16b));
407*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = vmovl_s16(vget_low_s16(src4_8x16b));
408*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = vmovl_s16(vget_low_s16(src5_8x16b));
409*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vmovl_s16(vget_low_s16(src6_8x16b));
410*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = vmovl_s16(vget_low_s16(src7_8x16b));
411*abb65b4bSAndroid Build Coastguard Worker
412*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 */
413*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
414*abb65b4bSAndroid Build Coastguard Worker /* r2 + r3 */
415*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
416*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 */
417*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
418*abb65b4bSAndroid Build Coastguard Worker /* r6 + r7 */
419*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);
420*abb65b4bSAndroid Build Coastguard Worker
421*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 */
422*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
423*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 + r6 + r7 */
424*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
425*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
426*abb65b4bSAndroid Build Coastguard Worker out0_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
427*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
428*abb65b4bSAndroid Build Coastguard Worker out4_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
429*abb65b4bSAndroid Build Coastguard Worker
430*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 */
431*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
432*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 - r6 - r7 */
433*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
434*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
435*abb65b4bSAndroid Build Coastguard Worker out2_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
436*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
437*abb65b4bSAndroid Build Coastguard Worker out6_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
438*abb65b4bSAndroid Build Coastguard Worker
439*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 */
440*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
441*abb65b4bSAndroid Build Coastguard Worker /* r2 - r3 */
442*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
443*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 */
444*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
445*abb65b4bSAndroid Build Coastguard Worker /* r6 - r7 */
446*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);
447*abb65b4bSAndroid Build Coastguard Worker
448*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 */
449*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
450*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 + r6 - r7 */
451*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
452*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
453*abb65b4bSAndroid Build Coastguard Worker out1_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
454*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
455*abb65b4bSAndroid Build Coastguard Worker out5_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
456*abb65b4bSAndroid Build Coastguard Worker
457*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 */
458*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
459*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 - r6 + r7 */
460*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
461*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
462*abb65b4bSAndroid Build Coastguard Worker out3_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
463*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
464*abb65b4bSAndroid Build Coastguard Worker out7_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
465*abb65b4bSAndroid Build Coastguard Worker
466*abb65b4bSAndroid Build Coastguard Worker /*************************First 4 pixels ********************************/
467*abb65b4bSAndroid Build Coastguard Worker
468*abb65b4bSAndroid Build Coastguard Worker /**************************Next 4 pixels *******************************/
469*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = vmovl_s16(vget_low_s16(tmp0_8x16b));
470*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = vmovl_s16(vget_low_s16(tmp1_8x16b));
471*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vmovl_s16(vget_low_s16(tmp2_8x16b));
472*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = vmovl_s16(vget_low_s16(tmp3_8x16b));
473*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = vmovl_s16(vget_low_s16(tmp4_8x16b));
474*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = vmovl_s16(vget_low_s16(tmp5_8x16b));
475*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vmovl_s16(vget_low_s16(tmp6_8x16b));
476*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = vmovl_s16(vget_low_s16(tmp7_8x16b));
477*abb65b4bSAndroid Build Coastguard Worker
478*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 */
479*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
480*abb65b4bSAndroid Build Coastguard Worker /* r2 + r3 */
481*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
482*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 */
483*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
484*abb65b4bSAndroid Build Coastguard Worker /* r6 + r7 */
485*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);
486*abb65b4bSAndroid Build Coastguard Worker
487*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 */
488*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
489*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 + r6 + r7 */
490*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
491*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
492*abb65b4bSAndroid Build Coastguard Worker out0a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
493*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
494*abb65b4bSAndroid Build Coastguard Worker out4a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
495*abb65b4bSAndroid Build Coastguard Worker
496*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 */
497*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
498*abb65b4bSAndroid Build Coastguard Worker /* r4 + r5 - r6 - r7 */
499*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
500*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
501*abb65b4bSAndroid Build Coastguard Worker out2a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
502*abb65b4bSAndroid Build Coastguard Worker /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
503*abb65b4bSAndroid Build Coastguard Worker out6a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
504*abb65b4bSAndroid Build Coastguard Worker
505*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 */
506*abb65b4bSAndroid Build Coastguard Worker pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
507*abb65b4bSAndroid Build Coastguard Worker /* r2 - r3 */
508*abb65b4bSAndroid Build Coastguard Worker pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
509*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 */
510*abb65b4bSAndroid Build Coastguard Worker pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
511*abb65b4bSAndroid Build Coastguard Worker /* r6 - r7 */
512*abb65b4bSAndroid Build Coastguard Worker pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);
513*abb65b4bSAndroid Build Coastguard Worker
514*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 */
515*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
516*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 + r6 - r7 */
517*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
518*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
519*abb65b4bSAndroid Build Coastguard Worker out1a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
520*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
521*abb65b4bSAndroid Build Coastguard Worker out5a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
522*abb65b4bSAndroid Build Coastguard Worker
523*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 */
524*abb65b4bSAndroid Build Coastguard Worker pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
525*abb65b4bSAndroid Build Coastguard Worker /* r4 - r5 - r6 + r7 */
526*abb65b4bSAndroid Build Coastguard Worker pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
527*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
528*abb65b4bSAndroid Build Coastguard Worker out3a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
529*abb65b4bSAndroid Build Coastguard Worker /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
530*abb65b4bSAndroid Build Coastguard Worker out7a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);
531*abb65b4bSAndroid Build Coastguard Worker
532*abb65b4bSAndroid Build Coastguard Worker /**************************Next 4 pixels *******************************/
533*abb65b4bSAndroid Build Coastguard Worker /************************* 8x8 Vertical Transform*************************/
534*abb65b4bSAndroid Build Coastguard Worker
535*abb65b4bSAndroid Build Coastguard Worker /****************************SATD calculation ****************************/
536*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = vabsq_s32(out0_8x16b);
537*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = vabsq_s32(out1_8x16b);
538*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vabsq_s32(out2_8x16b);
539*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = vabsq_s32(out3_8x16b);
540*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = vabsq_s32(out4_8x16b);
541*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = vabsq_s32(out5_8x16b);
542*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vabsq_s32(out6_8x16b);
543*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = vabsq_s32(out7_8x16b);
544*abb65b4bSAndroid Build Coastguard Worker s32* p = (s32*)&src0_8x16b;
545*abb65b4bSAndroid Build Coastguard Worker p[0] = 0;
546*abb65b4bSAndroid Build Coastguard Worker
547*abb65b4bSAndroid Build Coastguard Worker satd = vaddvq_s32(src0_8x16b);
548*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src1_8x16b);
549*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src2_8x16b);
550*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src3_8x16b);
551*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src4_8x16b);
552*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src5_8x16b);
553*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src6_8x16b);
554*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src7_8x16b);
555*abb65b4bSAndroid Build Coastguard Worker
556*abb65b4bSAndroid Build Coastguard Worker src0_8x16b = vabsq_s32(out0a_8x16b);
557*abb65b4bSAndroid Build Coastguard Worker src1_8x16b = vabsq_s32(out1a_8x16b);
558*abb65b4bSAndroid Build Coastguard Worker src2_8x16b = vabsq_s32(out2a_8x16b);
559*abb65b4bSAndroid Build Coastguard Worker src3_8x16b = vabsq_s32(out3a_8x16b);
560*abb65b4bSAndroid Build Coastguard Worker src4_8x16b = vabsq_s32(out4a_8x16b);
561*abb65b4bSAndroid Build Coastguard Worker src5_8x16b = vabsq_s32(out5a_8x16b);
562*abb65b4bSAndroid Build Coastguard Worker src6_8x16b = vabsq_s32(out6a_8x16b);
563*abb65b4bSAndroid Build Coastguard Worker src7_8x16b = vabsq_s32(out7a_8x16b);
564*abb65b4bSAndroid Build Coastguard Worker
565*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src0_8x16b);
566*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src1_8x16b);
567*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src2_8x16b);
568*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src3_8x16b);
569*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src4_8x16b);
570*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src5_8x16b);
571*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src6_8x16b);
572*abb65b4bSAndroid Build Coastguard Worker satd += vaddvq_s32(src7_8x16b);
573*abb65b4bSAndroid Build Coastguard Worker
574*abb65b4bSAndroid Build Coastguard Worker satd = (satd + 2) >> 2;
575*abb65b4bSAndroid Build Coastguard Worker return satd;
576*abb65b4bSAndroid Build Coastguard Worker }
577*abb65b4bSAndroid Build Coastguard Worker }
578*abb65b4bSAndroid Build Coastguard Worker #endif /* ARM_NEON */
579