1*abb65b4bSAndroid Build Coastguard Worker /*
2*abb65b4bSAndroid Build Coastguard Worker * Copyright (c) 2022 Samsung Electronics Co., Ltd.
3*abb65b4bSAndroid Build Coastguard Worker * All Rights Reserved.
4*abb65b4bSAndroid Build Coastguard Worker *
5*abb65b4bSAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
6*abb65b4bSAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
7*abb65b4bSAndroid Build Coastguard Worker *
8*abb65b4bSAndroid Build Coastguard Worker * - Redistributions of source code must retain the above copyright notice,
9*abb65b4bSAndroid Build Coastguard Worker * this list of conditions and the following disclaimer.
10*abb65b4bSAndroid Build Coastguard Worker *
11*abb65b4bSAndroid Build Coastguard Worker * - Redistributions in binary form must reproduce the above copyright notice,
12*abb65b4bSAndroid Build Coastguard Worker * this list of conditions and the following disclaimer in the documentation
13*abb65b4bSAndroid Build Coastguard Worker * and/or other materials provided with the distribution.
14*abb65b4bSAndroid Build Coastguard Worker *
15*abb65b4bSAndroid Build Coastguard Worker * - Neither the name of the copyright owner, nor the names of its contributors
16*abb65b4bSAndroid Build Coastguard Worker * may be used to endorse or promote products derived from this software
17*abb65b4bSAndroid Build Coastguard Worker * without specific prior written permission.
18*abb65b4bSAndroid Build Coastguard Worker *
19*abb65b4bSAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20*abb65b4bSAndroid Build Coastguard Worker * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21*abb65b4bSAndroid Build Coastguard Worker * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22*abb65b4bSAndroid Build Coastguard Worker * ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23*abb65b4bSAndroid Build Coastguard Worker * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24*abb65b4bSAndroid Build Coastguard Worker * CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25*abb65b4bSAndroid Build Coastguard Worker * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26*abb65b4bSAndroid Build Coastguard Worker * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27*abb65b4bSAndroid Build Coastguard Worker * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28*abb65b4bSAndroid Build Coastguard Worker * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29*abb65b4bSAndroid Build Coastguard Worker * POSSIBILITY OF SUCH DAMAGE.
30*abb65b4bSAndroid Build Coastguard Worker */
31*abb65b4bSAndroid Build Coastguard Worker
32*abb65b4bSAndroid Build Coastguard Worker
33*abb65b4bSAndroid Build Coastguard Worker #include "oapv_def.h"
34*abb65b4bSAndroid Build Coastguard Worker #include "oapv_tq_neon.h"
35*abb65b4bSAndroid Build Coastguard Worker
36*abb65b4bSAndroid Build Coastguard Worker #if ARM_NEON
37*abb65b4bSAndroid Build Coastguard Worker
38*abb65b4bSAndroid Build Coastguard Worker const s32 oapv_coeff[8][4] =
39*abb65b4bSAndroid Build Coastguard Worker {
40*abb65b4bSAndroid Build Coastguard Worker {64, 64, 64, 64}, // 0th row coeff
41*abb65b4bSAndroid Build Coastguard Worker {89, 75, 50, 18}, // 2nd row coeff
42*abb65b4bSAndroid Build Coastguard Worker {84, 35, 84, 35}, // 3rd row coeff
43*abb65b4bSAndroid Build Coastguard Worker {75,-18,-89,-50}, // 4th row coeff
44*abb65b4bSAndroid Build Coastguard Worker {64,-64, 64,-64}, // 5th row coeff
45*abb65b4bSAndroid Build Coastguard Worker {50,-89, 18, 75}, // 6th row coeff
46*abb65b4bSAndroid Build Coastguard Worker {35,-84, 35,-84}, // 7th row coeff
47*abb65b4bSAndroid Build Coastguard Worker {18,-50, 75,-89} // 8th row coeff
48*abb65b4bSAndroid Build Coastguard Worker };
49*abb65b4bSAndroid Build Coastguard Worker
/*
 * multiply_s32(part1, part2, coeff, res)
 *
 * Multiplies part1 and part2 lane-wise by coeff, then pairwise-adds adjacent
 * lanes of each product:
 *   res = { p1[0]+p1[1], p1[2]+p1[3], p2[0]+p2[1], p2[2]+p2[3] }
 * where p1 = part1 * coeff and p2 = part2 * coeff.
 *
 * NOTE: the macro uses two int32x4_t scratch variables named `low` and `high`
 * that must already be declared in the calling scope; both are overwritten.
 *
 * The body is wrapped in do { } while(0) so that an invocation followed by a
 * semicolon behaves as a single statement (e.g. inside an unbraced if/else).
 */
#define multiply_s32(part1, part2, coeff, res)                         \
    do {                                                               \
        low = vmulq_s32((part1), (coeff));                             \
        high = vmulq_s32((part2), (coeff));                            \
        (res) = vcombine_s32(                                          \
            vpadd_s32(vget_low_s32(low), vget_high_s32(low)),          \
            vpadd_s32(vget_low_s32(high), vget_high_s32(high)));       \
    } while(0)

/*
 * NEON 8-point transform (even/odd butterfly form) over 8 rows of 8 samples.
 *
 * src   : input, read as 8 consecutive rows of 8 s16 samples (row stride is
 *         fixed at 8 elements)
 * dst   : output; coefficient k of input row r is stored at dst[k * line + r],
 *         so the result is written transposed with respect to src
 * shift : right shift applied after adding the rounding offset
 *         1 << (shift - 1); shift must be >= 1 for that offset to be defined
 * line  : element stride between successive coefficient rows in dst
 *
 * Results are narrowed to 16 bits with vmovn_s32 (truncating, not saturating).
 */
static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
{
    /* vshlq_s32 with a negative count shifts right, so rounding is done as
     * "add 1 << (shift - 1), then shift by -shift". */
    const int32x4_t rnd = vdupq_n_s32(1 << (shift - 1));
    const int32x4_t rshift = vdupq_n_s32(-shift);

    /* The coefficient rows never change, so fetch them once. */
    const int32x4_t c0 = vld1q_s32(oapv_coeff[0]);
    const int32x4_t c1 = vld1q_s32(oapv_coeff[1]);
    const int32x4_t c2 = vld1q_s32(oapv_coeff[2]);
    const int32x4_t c3 = vld1q_s32(oapv_coeff[3]);
    const int32x4_t c4 = vld1q_s32(oapv_coeff[4]);
    const int32x4_t c5 = vld1q_s32(oapv_coeff[5]);
    const int32x4_t c6 = vld1q_s32(oapv_coeff[6]);
    const int32x4_t c7 = vld1q_s32(oapv_coeff[7]);

    /* Scratch vectors that the multiply_s32 macro writes to by name. */
    int32x4_t low, high;

    const s16 *blk = src;

    /* Each pass consumes four input rows (32 samples) and fills four
     * consecutive output columns starting at column `col`. */
    for(int col = 0; col < 8; col += 4, blk += 32) {
        int32x4_t even[4], odd[4];
        int32x4_t p1, p3, p5, p7;
        int32x4_t q0, q2, q4, q6;

        /* First butterfly stage per row: x[j] +/- x[7 - j], widened to 32 bit. */
        for(int r = 0; r < 4; r++) {
            const int16x4_t head = vld1_s16(blk + 8 * r);
            const int16x4_t tail = vrev64_s16(vld1_s16(blk + 8 * r + 4));

            even[r] = vaddl_s16(head, tail);
            odd[r] = vsubl_s16(head, tail);
        }

        /* Odd coefficients (1, 3, 5, 7) for rows 0 and 1. After the pairwise
         * add the lanes are { k r0, k r1, k+2 r0, k+2 r1 }. */
        multiply_s32(odd[0], odd[1], c1, p1);
        multiply_s32(odd[0], odd[1], c3, p3);
        multiply_s32(odd[0], odd[1], c5, p5);
        multiply_s32(odd[0], odd[1], c7, p7);

        const int32x4_t o13_r01 = vshlq_s32(vaddq_s32(vpaddq_s32(p1, p3), rnd), rshift);
        const int32x4_t o57_r01 = vshlq_s32(vaddq_s32(vpaddq_s32(p5, p7), rnd), rshift);

        /* Same for rows 2 and 3. */
        multiply_s32(odd[2], odd[3], c1, p1);
        multiply_s32(odd[2], odd[3], c3, p3);
        multiply_s32(odd[2], odd[3], c5, p5);
        multiply_s32(odd[2], odd[3], c7, p7);

        const int32x4_t o13_r23 = vshlq_s32(vaddq_s32(vpaddq_s32(p1, p3), rnd), rshift);
        const int32x4_t o57_r23 = vshlq_s32(vaddq_s32(vpaddq_s32(p5, p7), rnd), rshift);

        /* Gather the four rows of each odd coefficient and store them. */
        vst1_s16(dst + 1 * line + col,
                 vmovn_s32(vcombine_s32(vget_low_s32(o13_r01), vget_low_s32(o13_r23))));
        vst1_s16(dst + 3 * line + col,
                 vmovn_s32(vcombine_s32(vget_high_s32(o13_r01), vget_high_s32(o13_r23))));
        vst1_s16(dst + 5 * line + col,
                 vmovn_s32(vcombine_s32(vget_low_s32(o57_r01), vget_low_s32(o57_r23))));
        vst1_s16(dst + 7 * line + col,
                 vmovn_s32(vcombine_s32(vget_high_s32(o57_r01), vget_high_s32(o57_r23))));

        /* Second butterfly stage on the even terms, two rows per vector:
         * sums give { EE0, EE1 } and differences give { EO0, EO1 } per row. */
        const int32x4_t front_r01 = vcombine_s32(vget_low_s32(even[0]), vget_low_s32(even[1]));
        const int32x4_t back_r01 = vrev64q_s32(vcombine_s32(vget_high_s32(even[0]), vget_high_s32(even[1])));
        const int32x4_t front_r23 = vcombine_s32(vget_low_s32(even[2]), vget_low_s32(even[3]));
        const int32x4_t back_r23 = vrev64q_s32(vcombine_s32(vget_high_s32(even[2]), vget_high_s32(even[3])));

        const int32x4_t ee_r01 = vaddq_s32(front_r01, back_r01);
        const int32x4_t eo_r01 = vsubq_s32(front_r01, back_r01);
        const int32x4_t ee_r23 = vaddq_s32(front_r23, back_r23);
        const int32x4_t eo_r23 = vsubq_s32(front_r23, back_r23);

        /* Even coefficients: the macro's pairwise add already yields one
         * finished value per row, in row order 0..3. */
        multiply_s32(ee_r01, ee_r23, c0, q0);
        multiply_s32(ee_r01, ee_r23, c4, q4);
        multiply_s32(eo_r01, eo_r23, c2, q2);
        multiply_s32(eo_r01, eo_r23, c6, q6);

        vst1_s16(dst + 0 * line + col, vmovn_s32(vshlq_s32(vaddq_s32(q0, rnd), rshift)));
        vst1_s16(dst + 2 * line + col, vmovn_s32(vshlq_s32(vaddq_s32(q2, rnd), rshift)));
        vst1_s16(dst + 4 * line + col, vmovn_s32(vshlq_s32(vaddq_s32(q4, rnd), rshift)));
        vst1_s16(dst + 6 * line + col, vmovn_s32(vshlq_s32(vaddq_s32(q6, rnd), rshift)));
    }
}
188*abb65b4bSAndroid Build Coastguard Worker
189*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_tx_t oapv_tbl_fn_txb_neon[2] =
190*abb65b4bSAndroid Build Coastguard Worker {
191*abb65b4bSAndroid Build Coastguard Worker oapv_tx_pb8b_neon,
192*abb65b4bSAndroid Build Coastguard Worker NULL
193*abb65b4bSAndroid Build Coastguard Worker };
194*abb65b4bSAndroid Build Coastguard Worker
195*abb65b4bSAndroid Build Coastguard Worker ///////////////////////////////////////////////////////////////////////////////
196*abb65b4bSAndroid Build Coastguard Worker // end of encoder code
197*abb65b4bSAndroid Build Coastguard Worker // ENABLE_ENCODER
198*abb65b4bSAndroid Build Coastguard Worker ///////////////////////////////////////////////////////////////////////////////
199*abb65b4bSAndroid Build Coastguard Worker
// Coefficient magnitudes needed from oapv_tbl_tm8 by the inverse transform.
// "coefRC" names the entry at row R, column C of that table; a leading '-'
// marks entries whose table value is the negative of the constant (the sign
// is applied through add/subtract in the transform code).
#define OAPV_INVTX_COEF_0      89 // coef10, -coef32, -coef51, -coef73
#define OAPV_INVTX_COEF_1      75 // coef11, coef30, coef53, coef72
#define OAPV_INVTX_COEF_2      50 // coef12, -coef33, coef50, -coef71
#define OAPV_INVTX_COEF_3      18 // coef13, -coef31, coef52, coef70
#define OAPV_INVTX_COEF_5      84 // coef20, -coef61
#define OAPV_INVTX_COEF_6      35 // coef21, coef60
// The remaining coefficient is 64 = 1 << 6, so it is applied as a left shift.
#define OAPV_INVTX_COEF_4_LOG2 6  // log2(coef00), log2(coef01), log2(coef40), log2(-coef41)
208*abb65b4bSAndroid Build Coastguard Worker
oapv_itx_pb8b_opt_neon(s16 * src,int shift1,int shift2,int line)209*abb65b4bSAndroid Build Coastguard Worker void oapv_itx_pb8b_opt_neon(s16* src, int shift1, int shift2, int line)
210*abb65b4bSAndroid Build Coastguard Worker {
211*abb65b4bSAndroid Build Coastguard Worker int32x4_t add1 = vdupq_n_s32(1 << (shift1 - 1));
212*abb65b4bSAndroid Build Coastguard Worker int32x4_t add2 = vdupq_n_s32(1 << (shift2 - 1));
213*abb65b4bSAndroid Build Coastguard Worker
214*abb65b4bSAndroid Build Coastguard Worker int32x4_t sh1 = vdupq_n_s32(-shift1);
215*abb65b4bSAndroid Build Coastguard Worker int32x4_t sh2 = vdupq_n_s32(-shift2);
216*abb65b4bSAndroid Build Coastguard Worker
217*abb65b4bSAndroid Build Coastguard Worker int16x4_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, dest9, dest10, dest11, dest12, dest13, dest14, dest15;
218*abb65b4bSAndroid Build Coastguard Worker
219*abb65b4bSAndroid Build Coastguard Worker //DCT Pass 1
220*abb65b4bSAndroid Build Coastguard Worker {
221*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_0_8 = vld1q_s16(src);
222*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_1_9 = vld1q_s16(src + line);
223*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_2_10 = vld1q_s16(src + 2 * line);
224*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_3_11 = vld1q_s16(src + 3 * line);
225*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_4_12 = vld1q_s16(src + 4 * line);
226*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_5_13 = vld1q_s16(src + 5 * line);
227*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_6_14 = vld1q_s16(src + 6 * line);
228*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_7_15 = vld1q_s16(src + 7 * line);
229*abb65b4bSAndroid Build Coastguard Worker
230*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_0 = vget_low_s16(v_src_0_8);
231*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_1 = vget_low_s16(v_src_1_9);
232*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_2 = vget_low_s16(v_src_2_10);
233*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_3 = vget_low_s16(v_src_3_11);
234*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_4 = vget_low_s16(v_src_4_12);
235*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_5 = vget_low_s16(v_src_5_13);
236*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_6 = vget_low_s16(v_src_6_14);
237*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_7 = vget_low_s16(v_src_7_15);
238*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_8 = vget_high_s16(v_src_0_8);
239*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_9 = vget_high_s16(v_src_1_9);
240*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_10 = vget_high_s16(v_src_2_10);
241*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_11 = vget_high_s16(v_src_3_11);
242*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_12 = vget_high_s16(v_src_4_12);
243*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_13 = vget_high_s16(v_src_5_13);
244*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_14 = vget_high_s16(v_src_6_14);
245*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_15 = vget_high_s16(v_src_7_15);
246*abb65b4bSAndroid Build Coastguard Worker
247*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp1 = vaddq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_1));
248*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp2 = vsubq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_3));
249*abb65b4bSAndroid Build Coastguard Worker
250*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp3 = vsubq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_0));
251*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp4 = vsubq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_2));
252*abb65b4bSAndroid Build Coastguard Worker
253*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp5 = vaddq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_3));
254*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp6 = vaddq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_2));
255*abb65b4bSAndroid Build Coastguard Worker temp6 = vnegq_s32(temp6);
256*abb65b4bSAndroid Build Coastguard Worker
257*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp7 = vaddq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_1));
258*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp8 = vsubq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_0));
259*abb65b4bSAndroid Build Coastguard Worker
260*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp9 = vaddq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_1));
261*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp10 = vsubq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_3));
262*abb65b4bSAndroid Build Coastguard Worker
263*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp11 = vsubq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_0));
264*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp12 = vsubq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_2));
265*abb65b4bSAndroid Build Coastguard Worker
266*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp13 = vaddq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_3));
267*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp14 = vaddq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_2));
268*abb65b4bSAndroid Build Coastguard Worker temp14 = vnegq_s32(temp14);
269*abb65b4bSAndroid Build Coastguard Worker
270*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp15 = vaddq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_1));
271*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp16 = vsubq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_0));
272*abb65b4bSAndroid Build Coastguard Worker
273*abb65b4bSAndroid Build Coastguard Worker int32x4_t O0 = vaddq_s32(temp1, temp5);
274*abb65b4bSAndroid Build Coastguard Worker int32x4_t O1 = vaddq_s32(temp2, temp6);
275*abb65b4bSAndroid Build Coastguard Worker int32x4_t O2 = vaddq_s32(temp3, temp7);
276*abb65b4bSAndroid Build Coastguard Worker int32x4_t O3 = vaddq_s32(temp4, temp8);
277*abb65b4bSAndroid Build Coastguard Worker int32x4_t O4 = vaddq_s32(temp9, temp13);
278*abb65b4bSAndroid Build Coastguard Worker int32x4_t O5 = vaddq_s32(temp10, temp14);
279*abb65b4bSAndroid Build Coastguard Worker int32x4_t O6 = vaddq_s32(temp11, temp15);
280*abb65b4bSAndroid Build Coastguard Worker int32x4_t O7 = vaddq_s32(temp12, temp16);
281*abb65b4bSAndroid Build Coastguard Worker
282*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO0 = vaddq_s32(vmull_n_s16(v_src_2, OAPV_INVTX_COEF_5), vmull_n_s16(v_src_6, OAPV_INVTX_COEF_6));
283*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO1 = vsubq_s32(vmull_n_s16(v_src_2, OAPV_INVTX_COEF_6), vmull_n_s16(v_src_6, OAPV_INVTX_COEF_5));
284*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE0 = vaddq_s32(vshll_n_s16(v_src_0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_4, OAPV_INVTX_COEF_4_LOG2));
285*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE1 = vsubq_s32(vshll_n_s16(v_src_0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_4, OAPV_INVTX_COEF_4_LOG2));
286*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO2 = vaddq_s32(vmull_n_s16(v_src_10, OAPV_INVTX_COEF_5), vmull_n_s16(v_src_14, OAPV_INVTX_COEF_6));
287*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO3 = vsubq_s32(vmull_n_s16(v_src_10, OAPV_INVTX_COEF_6), vmull_n_s16(v_src_14, OAPV_INVTX_COEF_5));
288*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE2 = vaddq_s32(vshll_n_s16(v_src_8, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_12, OAPV_INVTX_COEF_4_LOG2));
289*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE3 = vsubq_s32(vshll_n_s16(v_src_8, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_12, OAPV_INVTX_COEF_4_LOG2));
290*abb65b4bSAndroid Build Coastguard Worker
291*abb65b4bSAndroid Build Coastguard Worker int32x4_t E0 = vaddq_s32(EE0, EO0);
292*abb65b4bSAndroid Build Coastguard Worker int32x4_t E1 = vaddq_s32(EE1, EO1);
293*abb65b4bSAndroid Build Coastguard Worker int32x4_t E2 = vsubq_s32(EE1, EO1);
294*abb65b4bSAndroid Build Coastguard Worker int32x4_t E3 = vsubq_s32(EE0, EO0);
295*abb65b4bSAndroid Build Coastguard Worker int32x4_t E4 = vaddq_s32(EE2, EO2);
296*abb65b4bSAndroid Build Coastguard Worker int32x4_t E5 = vaddq_s32(EE3, EO3);
297*abb65b4bSAndroid Build Coastguard Worker int32x4_t E6 = vsubq_s32(EE3, EO3);
298*abb65b4bSAndroid Build Coastguard Worker int32x4_t E7 = vsubq_s32(EE2, EO2);
299*abb65b4bSAndroid Build Coastguard Worker
300*abb65b4bSAndroid Build Coastguard Worker dest0 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E0, O0), add1), sh1));
301*abb65b4bSAndroid Build Coastguard Worker dest1 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E1, O1), add1), sh1));
302*abb65b4bSAndroid Build Coastguard Worker dest2 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E2, O2), add1), sh1));
303*abb65b4bSAndroid Build Coastguard Worker dest3 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E3, O3), add1), sh1));
304*abb65b4bSAndroid Build Coastguard Worker dest4 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E0, O0), add1), sh1));
305*abb65b4bSAndroid Build Coastguard Worker dest5 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E1, O1), add1), sh1));
306*abb65b4bSAndroid Build Coastguard Worker dest6 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E2, O2), add1), sh1));
307*abb65b4bSAndroid Build Coastguard Worker dest7 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E3, O3), add1), sh1));
308*abb65b4bSAndroid Build Coastguard Worker dest8 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E4, O4), add1), sh1));
309*abb65b4bSAndroid Build Coastguard Worker dest9 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E5, O5), add1), sh1));
310*abb65b4bSAndroid Build Coastguard Worker dest10 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E6, O6), add1), sh1));
311*abb65b4bSAndroid Build Coastguard Worker dest11 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E7, O7), add1), sh1));
312*abb65b4bSAndroid Build Coastguard Worker dest12 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E4, O4), add1), sh1));
313*abb65b4bSAndroid Build Coastguard Worker dest13 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E5, O5), add1), sh1));
314*abb65b4bSAndroid Build Coastguard Worker dest14 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E6, O6), add1), sh1));
315*abb65b4bSAndroid Build Coastguard Worker dest15 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E7, O7), add1), sh1));
316*abb65b4bSAndroid Build Coastguard Worker
317*abb65b4bSAndroid Build Coastguard Worker int16x4_t t0 = vzip1_s16(dest0, dest1);
318*abb65b4bSAndroid Build Coastguard Worker int16x4_t t1 = vzip1_s16(dest2, dest3);
319*abb65b4bSAndroid Build Coastguard Worker int16x4_t t2 = vzip2_s16(dest0, dest1);
320*abb65b4bSAndroid Build Coastguard Worker int16x4_t t3 = vzip2_s16(dest2, dest3);
321*abb65b4bSAndroid Build Coastguard Worker int16x4_t t4 = vzip1_s16(dest8, dest9);
322*abb65b4bSAndroid Build Coastguard Worker int16x4_t t5 = vzip1_s16(dest10, dest11);
323*abb65b4bSAndroid Build Coastguard Worker int16x4_t t6 = vzip2_s16(dest8, dest9);
324*abb65b4bSAndroid Build Coastguard Worker int16x4_t t7 = vzip2_s16(dest10, dest11);
325*abb65b4bSAndroid Build Coastguard Worker
326*abb65b4bSAndroid Build Coastguard Worker dest0 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
327*abb65b4bSAndroid Build Coastguard Worker dest1 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
328*abb65b4bSAndroid Build Coastguard Worker dest2 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
329*abb65b4bSAndroid Build Coastguard Worker dest3 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
330*abb65b4bSAndroid Build Coastguard Worker dest8 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
331*abb65b4bSAndroid Build Coastguard Worker dest9 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
332*abb65b4bSAndroid Build Coastguard Worker dest10 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));
333*abb65b4bSAndroid Build Coastguard Worker dest11 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));
334*abb65b4bSAndroid Build Coastguard Worker
335*abb65b4bSAndroid Build Coastguard Worker int16x4_t t8 = vzip1_s16(dest5, dest4);
336*abb65b4bSAndroid Build Coastguard Worker int16x4_t t9 = vzip1_s16(dest7, dest6);
337*abb65b4bSAndroid Build Coastguard Worker int16x4_t t10 = vzip2_s16(dest5, dest4);
338*abb65b4bSAndroid Build Coastguard Worker int16x4_t t11 = vzip2_s16(dest7, dest6);
339*abb65b4bSAndroid Build Coastguard Worker int16x4_t t12 = vzip1_s16(dest13, dest12);
340*abb65b4bSAndroid Build Coastguard Worker int16x4_t t13 = vzip1_s16(dest15, dest14);
341*abb65b4bSAndroid Build Coastguard Worker int16x4_t t14 = vzip2_s16(dest13, dest12);
342*abb65b4bSAndroid Build Coastguard Worker int16x4_t t15 = vzip2_s16(dest15, dest14);
343*abb65b4bSAndroid Build Coastguard Worker
344*abb65b4bSAndroid Build Coastguard Worker dest4 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
345*abb65b4bSAndroid Build Coastguard Worker dest5 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
346*abb65b4bSAndroid Build Coastguard Worker dest6 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
347*abb65b4bSAndroid Build Coastguard Worker dest7 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
348*abb65b4bSAndroid Build Coastguard Worker dest12 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
349*abb65b4bSAndroid Build Coastguard Worker dest13 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
350*abb65b4bSAndroid Build Coastguard Worker dest14 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));
351*abb65b4bSAndroid Build Coastguard Worker dest15 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));
352*abb65b4bSAndroid Build Coastguard Worker }
353*abb65b4bSAndroid Build Coastguard Worker
354*abb65b4bSAndroid Build Coastguard Worker //DCT Pass 2
355*abb65b4bSAndroid Build Coastguard Worker {
356*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp1 = vaddq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_0), vmull_n_s16(dest3, OAPV_INVTX_COEF_1));
357*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp2 = vsubq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_1), vmull_n_s16(dest3, OAPV_INVTX_COEF_3));
358*abb65b4bSAndroid Build Coastguard Worker
359*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp3 = vsubq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_2), vmull_n_s16(dest3, OAPV_INVTX_COEF_0));
360*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp4 = vsubq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_3), vmull_n_s16(dest3, OAPV_INVTX_COEF_2));
361*abb65b4bSAndroid Build Coastguard Worker
362*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp5 = vaddq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_2), vmull_n_s16(dest11, OAPV_INVTX_COEF_3));
363*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp6 = vaddq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_0), vmull_n_s16(dest11, OAPV_INVTX_COEF_2));
364*abb65b4bSAndroid Build Coastguard Worker temp6 = vnegq_s32(temp6);
365*abb65b4bSAndroid Build Coastguard Worker
366*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp7 = vaddq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_3), vmull_n_s16(dest11, OAPV_INVTX_COEF_1));
367*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp8 = vsubq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_1), vmull_n_s16(dest11, OAPV_INVTX_COEF_0));
368*abb65b4bSAndroid Build Coastguard Worker
369*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp9 = vaddq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_0), vmull_n_s16(dest7, OAPV_INVTX_COEF_1));
370*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp10 = vsubq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_1), vmull_n_s16(dest7, OAPV_INVTX_COEF_3));
371*abb65b4bSAndroid Build Coastguard Worker
372*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp11 = vsubq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_2), vmull_n_s16(dest7, OAPV_INVTX_COEF_0));
373*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp12 = vsubq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_3), vmull_n_s16(dest7, OAPV_INVTX_COEF_2));
374*abb65b4bSAndroid Build Coastguard Worker
375*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp13 = vaddq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_2), vmull_n_s16(dest15, OAPV_INVTX_COEF_3));
376*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp14 = vaddq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_0), vmull_n_s16(dest15, OAPV_INVTX_COEF_2));
377*abb65b4bSAndroid Build Coastguard Worker temp14 = vnegq_s32(temp14);
378*abb65b4bSAndroid Build Coastguard Worker
379*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp15 = vaddq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_3), vmull_n_s16(dest15, OAPV_INVTX_COEF_1));
380*abb65b4bSAndroid Build Coastguard Worker int32x4_t temp16 = vsubq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_1), vmull_n_s16(dest15, OAPV_INVTX_COEF_0));
381*abb65b4bSAndroid Build Coastguard Worker
382*abb65b4bSAndroid Build Coastguard Worker int32x4_t O0 = vaddq_s32(temp1, temp5);
383*abb65b4bSAndroid Build Coastguard Worker int32x4_t O1 = vaddq_s32(temp2, temp6);
384*abb65b4bSAndroid Build Coastguard Worker int32x4_t O2 = vaddq_s32(temp3, temp7);
385*abb65b4bSAndroid Build Coastguard Worker int32x4_t O3 = vaddq_s32(temp4, temp8);
386*abb65b4bSAndroid Build Coastguard Worker int32x4_t O4 = vaddq_s32(temp9, temp13);
387*abb65b4bSAndroid Build Coastguard Worker int32x4_t O5 = vaddq_s32(temp10, temp14);
388*abb65b4bSAndroid Build Coastguard Worker int32x4_t O6 = vaddq_s32(temp11, temp15);
389*abb65b4bSAndroid Build Coastguard Worker int32x4_t O7 = vaddq_s32(temp12, temp16);
390*abb65b4bSAndroid Build Coastguard Worker
391*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO0 = vaddq_s32(vmull_n_s16(dest2, OAPV_INVTX_COEF_5), vmull_n_s16(dest10, OAPV_INVTX_COEF_6));
392*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO1 = vsubq_s32(vmull_n_s16(dest2, OAPV_INVTX_COEF_6), vmull_n_s16(dest10, OAPV_INVTX_COEF_5));
393*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE0 = vaddq_s32(vshll_n_s16(dest0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest8, OAPV_INVTX_COEF_4_LOG2));
394*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE1 = vsubq_s32(vshll_n_s16(dest0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest8, OAPV_INVTX_COEF_4_LOG2));
395*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO2 = vaddq_s32(vmull_n_s16(dest6, OAPV_INVTX_COEF_5), vmull_n_s16(dest14, OAPV_INVTX_COEF_6));
396*abb65b4bSAndroid Build Coastguard Worker int32x4_t EO3 = vsubq_s32(vmull_n_s16(dest6, OAPV_INVTX_COEF_6), vmull_n_s16(dest14, OAPV_INVTX_COEF_5));
397*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE2 = vaddq_s32(vshll_n_s16(dest4, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest12, OAPV_INVTX_COEF_4_LOG2));
398*abb65b4bSAndroid Build Coastguard Worker int32x4_t EE3 = vsubq_s32(vshll_n_s16(dest4, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest12, OAPV_INVTX_COEF_4_LOG2));
399*abb65b4bSAndroid Build Coastguard Worker
400*abb65b4bSAndroid Build Coastguard Worker int32x4_t E0 = vaddq_s32(EE0, EO0);
401*abb65b4bSAndroid Build Coastguard Worker int32x4_t E1 = vaddq_s32(EE1, EO1);
402*abb65b4bSAndroid Build Coastguard Worker int32x4_t E2 = vsubq_s32(EE1, EO1);
403*abb65b4bSAndroid Build Coastguard Worker int32x4_t E3 = vsubq_s32(EE0, EO0);
404*abb65b4bSAndroid Build Coastguard Worker int32x4_t E4 = vaddq_s32(EE2, EO2);
405*abb65b4bSAndroid Build Coastguard Worker int32x4_t E5 = vaddq_s32(EE3, EO3);
406*abb65b4bSAndroid Build Coastguard Worker int32x4_t E6 = vsubq_s32(EE3, EO3);
407*abb65b4bSAndroid Build Coastguard Worker int32x4_t E7 = vsubq_s32(EE2, EO2);
408*abb65b4bSAndroid Build Coastguard Worker
409*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_0 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E0, O0), add2), sh2));
410*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_1 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E1, O1), add2), sh2));
411*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_2 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E2, O2), add2), sh2));
412*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_3 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E3, O3), add2), sh2));
413*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_4 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E0, O0), add2), sh2));
414*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_5 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E1, O1), add2), sh2));
415*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_6 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E2, O2), add2), sh2));
416*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_7 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E3, O3), add2), sh2));
417*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_8 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E4, O4), add2), sh2));
418*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_9 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E5, O5), add2), sh2));
419*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_10 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E6, O6), add2), sh2));
420*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_11 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E7, O7), add2), sh2));
421*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_12 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E4, O4), add2), sh2));
422*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_13 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E5, O5), add2), sh2));
423*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_14 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E6, O6), add2), sh2));
424*abb65b4bSAndroid Build Coastguard Worker int16x4_t v_src_15 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E7, O7), add2), sh2));
425*abb65b4bSAndroid Build Coastguard Worker
426*abb65b4bSAndroid Build Coastguard Worker int16x4_t t0 = vzip1_s16(v_src_0, v_src_1);
427*abb65b4bSAndroid Build Coastguard Worker int16x4_t t1 = vzip1_s16(v_src_2, v_src_3);
428*abb65b4bSAndroid Build Coastguard Worker int16x4_t t2 = vzip2_s16(v_src_0, v_src_1);
429*abb65b4bSAndroid Build Coastguard Worker int16x4_t t3 = vzip2_s16(v_src_2, v_src_3);
430*abb65b4bSAndroid Build Coastguard Worker int16x4_t t4 = vzip1_s16(v_src_8, v_src_9);
431*abb65b4bSAndroid Build Coastguard Worker int16x4_t t5 = vzip1_s16(v_src_10, v_src_11);
432*abb65b4bSAndroid Build Coastguard Worker int16x4_t t6 = vzip2_s16(v_src_8, v_src_9);
433*abb65b4bSAndroid Build Coastguard Worker int16x4_t t7 = vzip2_s16(v_src_10, v_src_11);
434*abb65b4bSAndroid Build Coastguard Worker
435*abb65b4bSAndroid Build Coastguard Worker v_src_0 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
436*abb65b4bSAndroid Build Coastguard Worker v_src_1 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
437*abb65b4bSAndroid Build Coastguard Worker v_src_2 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
438*abb65b4bSAndroid Build Coastguard Worker v_src_3 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
439*abb65b4bSAndroid Build Coastguard Worker v_src_8 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
440*abb65b4bSAndroid Build Coastguard Worker v_src_9 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
441*abb65b4bSAndroid Build Coastguard Worker v_src_10 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));
442*abb65b4bSAndroid Build Coastguard Worker v_src_11 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));
443*abb65b4bSAndroid Build Coastguard Worker
444*abb65b4bSAndroid Build Coastguard Worker int16x4_t t8 = vzip1_s16(v_src_5, v_src_4);
445*abb65b4bSAndroid Build Coastguard Worker int16x4_t t9 = vzip1_s16(v_src_7, v_src_6);
446*abb65b4bSAndroid Build Coastguard Worker int16x4_t t10 = vzip2_s16(v_src_5, v_src_4);
447*abb65b4bSAndroid Build Coastguard Worker int16x4_t t11 = vzip2_s16(v_src_7, v_src_6);
448*abb65b4bSAndroid Build Coastguard Worker int16x4_t t12 = vzip1_s16(v_src_13, v_src_12);
449*abb65b4bSAndroid Build Coastguard Worker int16x4_t t13 = vzip1_s16(v_src_15, v_src_14);
450*abb65b4bSAndroid Build Coastguard Worker int16x4_t t14 = vzip2_s16(v_src_13, v_src_12);
451*abb65b4bSAndroid Build Coastguard Worker int16x4_t t15 = vzip2_s16(v_src_15, v_src_14);
452*abb65b4bSAndroid Build Coastguard Worker
453*abb65b4bSAndroid Build Coastguard Worker v_src_4 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
454*abb65b4bSAndroid Build Coastguard Worker v_src_5 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
455*abb65b4bSAndroid Build Coastguard Worker v_src_6 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
456*abb65b4bSAndroid Build Coastguard Worker v_src_7 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
457*abb65b4bSAndroid Build Coastguard Worker v_src_12 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
458*abb65b4bSAndroid Build Coastguard Worker v_src_13 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
459*abb65b4bSAndroid Build Coastguard Worker v_src_14 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));
460*abb65b4bSAndroid Build Coastguard Worker v_src_15 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));
461*abb65b4bSAndroid Build Coastguard Worker
462*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_0_4 = vcombine_s16(v_src_0, v_src_4);
463*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_1_5 = vcombine_s16(v_src_1, v_src_5);
464*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_2_6 = vcombine_s16(v_src_2, v_src_6);
465*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_3_7 = vcombine_s16(v_src_3, v_src_7);
466*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_8_12 = vcombine_s16(v_src_8, v_src_12);
467*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_9_13 = vcombine_s16(v_src_9, v_src_13);
468*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_10_14 = vcombine_s16(v_src_10, v_src_14);
469*abb65b4bSAndroid Build Coastguard Worker int16x8_t v_src_11_15 = vcombine_s16(v_src_11, v_src_15);
470*abb65b4bSAndroid Build Coastguard Worker
471*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src, v_src_0_4);
472*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src + 8, v_src_1_5);
473*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src + 16, v_src_2_6);
474*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src + 24, v_src_3_7);
475*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src + 32, v_src_8_12);
476*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src + 40, v_src_9_13);
477*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src + 48, v_src_10_14);
478*abb65b4bSAndroid Build Coastguard Worker vst1q_s16(src + 56, v_src_11_15);
479*abb65b4bSAndroid Build Coastguard Worker }
480*abb65b4bSAndroid Build Coastguard Worker }
481*abb65b4bSAndroid Build Coastguard Worker
482*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_itx_t oapv_tbl_fn_itx_neon[2] =
483*abb65b4bSAndroid Build Coastguard Worker {
484*abb65b4bSAndroid Build Coastguard Worker oapv_itx_pb8b_opt_neon,
485*abb65b4bSAndroid Build Coastguard Worker NULL
486*abb65b4bSAndroid Build Coastguard Worker };
487*abb65b4bSAndroid Build Coastguard Worker
/*
 * NEON quantization of a coefficient block, performed in place.
 *
 * For each coefficient c with matching q_matrix entry q this computes
 *     level = (|c| * q + offset) >> shift
 * in 64-bit precision, narrows the level to 16 bits by truncation, re-applies
 * the sign of c and writes the result back to coef.
 *
 * coef            - coefficients, read and overwritten; 1 << (log2_w + log2_h) entries
 * qp              - quantization parameter; only qp / 6 contributes to the shift
 * q_matrix        - per-coefficient multipliers, indexed the same way as coef
 * log2_w, log2_h  - log2 of the block width and height
 * bit_depth       - sample bit depth, used to derive the transform shift
 * deadzone_offset - rounding offset, scaled here by << (shift - 9)
 *
 * Returns OAPV_OK.
 *
 * NOTE(review): the loop consumes 8 coefficients per iteration, so the
 * coefficient count is assumed to be a multiple of 8 - confirm with callers.
 */
static int oapv_quant_neon(s16* coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset)
{
    const int log2_size = (log2_w + log2_h) >> 1;
    const int tr_shift  = MAX_TX_DYNAMIC_RANGE - bit_depth - log2_size;
    const int shift     = QUANT_SHIFT + tr_shift + (qp / 6);
    const s64 offset    = (s64)deadzone_offset << (shift - 9);
    const int pixels    = 1 << (log2_w + log2_h);

    const int64x2_t offset_vector = vdupq_n_s64(offset);
    // vshlq_s64 with a negative count performs an arithmetic right shift
    const int64x2_t shift_vector  = vdupq_n_s64(-shift);
    // Signed zero so that the comparison below is type-correct without lax vector conversions
    const int16x8_t zero_vector   = vdupq_n_s16(0);

    for (int i = 0; i < pixels; i += 8)
    {
        // Load one row of eight coefficients
        const int16x8_t coef_row = vld1q_s16(coef + i);

        // Remember which lanes were negative
        const uint16x8_t sign_mask = vcltq_s16(coef_row, zero_vector);

        // Widen to 32 bits first and take the absolute value there:
        // vabsq_s16 would leave -32768 unchanged (still negative).
        int32x4_t coef_low_32b  = vabsq_s32(vmovl_s16(vget_low_s16(coef_row)));
        int32x4_t coef_high_32b = vabsq_s32(vmovl_high_s16(coef_row));

        // Load the matching q_matrix elements
        const int32x4_t quant_matrix_low  = vld1q_s32(q_matrix + i);
        const int32x4_t quant_matrix_high = vld1q_s32(q_matrix + i + 4);

        // 64-bit accumulate: offset + |coef| * q_matrix, two lanes at a time
        int64x2_t low_first   = vmlal_s32(offset_vector, vget_low_s32(coef_low_32b),   vget_low_s32(quant_matrix_low));
        int64x2_t low_second  = vmlal_s32(offset_vector, vget_high_s32(coef_low_32b),  vget_high_s32(quant_matrix_low));
        int64x2_t high_first  = vmlal_s32(offset_vector, vget_low_s32(coef_high_32b),  vget_low_s32(quant_matrix_high));
        int64x2_t high_second = vmlal_s32(offset_vector, vget_high_s32(coef_high_32b), vget_high_s32(quant_matrix_high));

        // Scale the 64-bit results down by `shift`
        low_first   = vshlq_s64(low_first,   shift_vector);
        low_second  = vshlq_s64(low_second,  shift_vector);
        high_first  = vshlq_s64(high_first,  shift_vector);
        high_second = vshlq_s64(high_second, shift_vector);

        // Narrow 64 -> 32 bits (truncating) and rejoin the halves
        coef_low_32b  = vcombine_s32(vmovn_s64(low_first),  vmovn_s64(low_second));
        coef_high_32b = vcombine_s32(vmovn_s64(high_first), vmovn_s64(high_second));

        // Narrow 32 -> 16 bits (truncating) and rejoin the halves
        int16x8_t output_vector = vcombine_s16(vmovn_s32(coef_low_32b), vmovn_s32(coef_high_32b));

        // Restore the original sign on lanes that were negative
        output_vector = vbslq_s16(sign_mask, vnegq_s16(output_vector), output_vector);

        // Store the quantized row back over the input
        vst1q_s16(coef + i, output_vector);
    }
    return OAPV_OK;
}
551*abb65b4bSAndroid Build Coastguard Worker
552*abb65b4bSAndroid Build Coastguard Worker
553*abb65b4bSAndroid Build Coastguard Worker const oapv_fn_quant_t oapv_tbl_fn_quant_neon[2] =
554*abb65b4bSAndroid Build Coastguard Worker {
555*abb65b4bSAndroid Build Coastguard Worker oapv_quant_neon,
556*abb65b4bSAndroid Build Coastguard Worker NULL
557*abb65b4bSAndroid Build Coastguard Worker };
558*abb65b4bSAndroid Build Coastguard Worker
559*abb65b4bSAndroid Build Coastguard Worker #endif /* ARM_NEON */
560