/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_iquant_itrans_residual_neon.c
 *
 * @brief
 *  Contains definition of functions for h264 inverse quantization, inverse
 *  transformation and residual computation
 *
 * @author
 *  Kishore
 *
 *  @par List of Functions:
 *  - isvcd_iquant_itrans_residual_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_8x8_neonintr()
 *  - isvcd_iquant_itrans_residual_4x4_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_8x8_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_chroma_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_chroma_4x4_dc_neonintr()
 *
 * @remarks
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_iquant_itrans_residual.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_4x4_neonintr                 */
/*                                                                           */
/*  Description   : this function computes the residual output from the      */
/*                  inverse quantization and inverse transform               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

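/* Scalar sketch of what the NEON code below computes (illustrative only, not
 * part of the build): each coefficient is inverse-quantized as
 *     q[i] = ((pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i] + rnd_factor)
 *             << u4_qp_div_6) >> 4;
 * a 4-point butterfly is then applied to each row and each column,
 *     x0 = q0 + q2;          x1 = q0 - q2;
 *     x2 = (q1 >> 1) - q3;   x3 = q1 + (q3 >> 1);
 *     out = {x0 + x3, x1 + x2, x1 - x2, x0 - x3};
 * and each result is rounded with (v + 32) >> 6, added to the prediction and
 * clamped to [RSD_MIN, RSD_MAX]. */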
WORD32 isvcd_iquant_itrans_residual_4x4_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out,
                                                 WORD32 pred_strd, WORD32 out_strd,
                                                 const UWORD16 *pu2_iscal_mat,
                                                 const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                                 WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                                 WORD16 *pi2_dc_ld_addr)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);

    int16x4_t pred0, pred1, pred2, pred3;
    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4, dup_min, dup_max;
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);

    int16x8_t pred01_in, pred23_in;
    WORD32 i4_nnz;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t dup_val_1, dup_val_2, dup_abs;
    UNUSED(pi2_tmp);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    dup_min = vdup_n_s16(RSD_MIN);
    dup_max = vdup_n_s16(RSD_MAX);

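    /* Inverse quantization: widen to 32 bits, scale by (weigh * iscal), add
     * the rounding factor, shift left by qp/6 and narrow back with a
     * saturating >> 4, mirroring the scalar INV_QUANT macro. */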
    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    if(iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_ld_addr[0], q0_16x4, 0);
    }

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);      // q1 >>1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);      // q3 >>1

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 =  q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 =  q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 =  q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0-x3

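    /* Transpose the 4x4 block with paired 16-bit and 32-bit transposes so the
     * vertical pass below can reuse the same row-wise butterfly. */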
    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);       // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 =  q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 =  q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 =  q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    pred0 = vld1_s16((int16_t *) pi2_pred);
    pred1 = vld1_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1_s16((int16_t *) pi2_pred + (pred_strd * 3));

    x0_16x4 = vadd_s16(pred0, x0_16x4);
    x1_16x4 = vadd_s16(pred1, x1_16x4);
    x2_16x4 = vadd_s16(pred2, x2_16x4);
    x3_16x4 = vadd_s16(pred3, x3_16x4);

    x0_16x4 = vmin_s16(x0_16x4, dup_max);
    x0_16x4 = vmax_s16(x0_16x4, dup_min);
    x1_16x4 = vmin_s16(x1_16x4, dup_max);
    x1_16x4 = vmax_s16(x1_16x4, dup_min);
    x2_16x4 = vmin_s16(x2_16x4, dup_max);
    x2_16x4 = vmax_s16(x2_16x4, dup_min);
    x3_16x4 = vmin_s16(x3_16x4, dup_max);
    x3_16x4 = vmax_s16(x3_16x4, dup_min);

    pred01_in = vcombine_s16(x0_16x4, x1_16x4);
    pred23_in = vcombine_s16(x2_16x4, x3_16x4);

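    /* Non-zero flag: saturating-add the absolute residuals of all four rows
     * and OR the lanes, so i4_nnz is 1 iff any output sample is non-zero. */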
    dup_val_1 = vabsq_s16(pred01_in);
    dup_val_2 = vabsq_s16(pred23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    vst1_s16(pi2_out, x0_16x4);
    vst1_s16(pi2_out + out_strd, x1_16x4);
    vst1_s16(pi2_out + (out_strd << 1), x2_16x4);
    vst1_s16(pi2_out + ((out_strd << 1) + out_strd), x3_16x4);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_4x4_dc_neonintr              */
/*                                                                           */
/*  Description   : this function computes the residual output from the      */
/*                  inverse quantization and inverse transform of a          */
/*                  DC-only 4x4 block                                        */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/
WORD32 isvcd_iquant_itrans_residual_4x4_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred,
                                                    WORD16 *pi2_out, WORD32 pred_strd,
                                                    WORD32 out_strd, const UWORD16 *pu2_iscal_mat,
                                                    const UWORD16 *pu2_weigh_mat,
                                                    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp,
                                                    WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x4_t temp_0;
    int16x4_t pred0_in, pred1_in, pred2_in, pred3_in, dup_min, dup_max;
    int16x8_t pred01_in, pred23_in;
    WORD32 i4_nnz;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    UNUSED(pi2_tmp);

    if(iq_start_idx == 0)
    {
        i4_iq_out_temp = pi2_src[0];
        INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        i4_iq_out_temp = pi2_dc_ld_addr[0];
    }

    /* inv transform and residual comp */
    pred0_in = vld1_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred1_in = vld1_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred2_in = vld1_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred3_in = vld1_s16((int16_t *) pi2_pred);

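    /* For a DC-only block the inverse transform reduces to one value: every
     * residual sample is (dc + 32) >> 6, broadcast and added to the
     * prediction. */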
    temp_0 = vdup_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdup_n_s16(RSD_MIN);
    dup_max = vdup_n_s16(RSD_MAX);

    pred0_in = vadd_s16(pred0_in, temp_0);
    pred1_in = vadd_s16(pred1_in, temp_0);
    pred2_in = vadd_s16(pred2_in, temp_0);
    pred3_in = vadd_s16(pred3_in, temp_0);

    pred0_in = vmin_s16(pred0_in, dup_max);
    pred0_in = vmax_s16(pred0_in, dup_min);
    pred1_in = vmin_s16(pred1_in, dup_max);
    pred1_in = vmax_s16(pred1_in, dup_min);
    pred2_in = vmin_s16(pred2_in, dup_max);
    pred2_in = vmax_s16(pred2_in, dup_min);
    pred3_in = vmin_s16(pred3_in, dup_max);
    pred3_in = vmax_s16(pred3_in, dup_min);

    pred01_in = vcombine_s16(pred0_in, pred1_in);
    pred23_in = vcombine_s16(pred2_in, pred3_in);

    dup_val_1 = vabsq_s16(pred01_in);
    dup_val_2 = vabsq_s16(pred23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    vst1_s16((int16_t *) (pi2_out), pred0_in);
    vst1_s16((int16_t *) (pi2_out + out_strd), pred1_in);
    vst1_s16((int16_t *) (pi2_out + (out_strd * 2)), pred2_in);
    vst1_s16((int16_t *) (pi2_out + (out_strd * 3)), pred3_in);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_chroma_4x4_neonintr          */
/*                                                                           */
/*  Description   : this function computes the residual output from the      */
/*                  inverse quantization and inverse transform of an         */
/*                  interleaved chroma 4x4 block                             */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_chroma_4x4_neonintr(
    WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out, WORD32 pred_strd, WORD32 out_strd,
    const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
    WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x8_t pred0, pred1, pred2, pred3;
    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    int16x4_t zero_16x4 = vdup_n_s16(0);
    int16x4_t x_16x4_low, x_16x4_high;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x8_t dup_min, dup_max;
    WORD32 i4_nnz;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2, x_16x4x2_t;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t dup_val_1, dup_val_2, dup_val_3, dup_val_4, dup_val_5, dup_abs;
    int16x8_t i4_out_horz_16x8_r0, i4_out_horz_16x8_r1, i4_out_horz_16x8_r2, i4_out_horz_16x8_r3;
    uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));
    int16x8_t chroma_mask_16x8_2 = vreinterpretq_s16_s32(vdupq_n_s32(0x0000ffff));

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);

    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);  // q1 >>1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);  // q3 >>1

    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 =  q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 =  q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 =  q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0-x3

    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);       // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 =  q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 =  q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 =  q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

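    /* Chroma is interleaved (Cb and Cr alternate), so zip each 4-wide result
     * row with zeros to place the residual in every other 16-bit lane,
     * matching the layout of the prediction and output buffers. */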
    x_16x4x2_t = vzip_s16(x0_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x0_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x1_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x1_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x2_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x2_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x3_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x3_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    pred0 = vld1q_s16((int16_t *) pi2_pred);
    pred1 = vld1q_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 3));

    x0_16x8 = vaddq_s16(pred0, x0_16x8);
    x1_16x8 = vaddq_s16(pred1, x1_16x8);
    x2_16x8 = vaddq_s16(pred2, x2_16x8);
    x3_16x8 = vaddq_s16(pred3, x3_16x8);

    x0_16x8 = vminq_s16(x0_16x8, dup_max);
    x0_16x8 = vmaxq_s16(x0_16x8, dup_min);
    x1_16x8 = vminq_s16(x1_16x8, dup_max);
    x1_16x8 = vmaxq_s16(x1_16x8, dup_min);
    x2_16x8 = vminq_s16(x2_16x8, dup_max);
    x2_16x8 = vmaxq_s16(x2_16x8, dup_min);
    x3_16x8 = vminq_s16(x3_16x8, dup_max);
    x3_16x8 = vmaxq_s16(x3_16x8, dup_min);

    x0_16x8 = vandq_s16(x0_16x8, chroma_mask_16x8_2);
    x1_16x8 = vandq_s16(x1_16x8, chroma_mask_16x8_2);
    x2_16x8 = vandq_s16(x2_16x8, chroma_mask_16x8_2);
    x3_16x8 = vandq_s16(x3_16x8, chroma_mask_16x8_2);

    dup_val_1 = vabsq_s16(x0_16x8);
    dup_val_2 = vabsq_s16(x1_16x8);
    dup_val_3 = vabsq_s16(x2_16x8);
    dup_val_4 = vabsq_s16(x3_16x8);
    dup_val_5 = vqaddq_s16(dup_val_1, dup_val_2);
    dup_val_1 = vqaddq_s16(dup_val_3, dup_val_4);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_5);

    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

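    /* Read-modify-write of the interleaved output: chroma_mask_16x8 selects
     * the residual lanes, while the other chroma component's lanes are kept
     * from the previously loaded output rows. */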
    i4_out_horz_16x8_r0 = vld1q_s16(pi2_out);
    i4_out_horz_16x8_r1 = vld1q_s16(pi2_out + out_strd);
    i4_out_horz_16x8_r2 = vld1q_s16(pi2_out + out_strd * 2);
    i4_out_horz_16x8_r3 = vld1q_s16(pi2_out + out_strd * 3);

    i4_out_horz_16x8_r0 = vbslq_s16(chroma_mask_16x8, x0_16x8, i4_out_horz_16x8_r0);
    i4_out_horz_16x8_r1 = vbslq_s16(chroma_mask_16x8, x1_16x8, i4_out_horz_16x8_r1);
    i4_out_horz_16x8_r2 = vbslq_s16(chroma_mask_16x8, x2_16x8, i4_out_horz_16x8_r2);
    i4_out_horz_16x8_r3 = vbslq_s16(chroma_mask_16x8, x3_16x8, i4_out_horz_16x8_r3);

    vst1q_s16((int16_t *) (pi2_out), i4_out_horz_16x8_r0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), i4_out_horz_16x8_r1);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 2), i4_out_horz_16x8_r2);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 3), i4_out_horz_16x8_r3);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_chroma_4x4_dc_neonintr       */
/*                                                                           */
/*  Description   : this function computes the residual output from the      */
/*                  inverse quantization and inverse transform of a          */
/*                  DC-only interleaved chroma 4x4 block                     */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_chroma_4x4_dc_neonintr(
    WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out, WORD32 pred_strd, WORD32 out_strd,
    const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
    WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t temp_0, dup_min, dup_max;
    WORD32 i4_iq_out_temp;
    int16x8_t dup_val_1, dup_val_2, dup_val_3, dup_val_4, dup_val_5, dup_abs;
    int16x8_t i4_out_horz_16x8_r0, i4_out_horz_16x8_r1, i4_out_horz_16x8_r2, i4_out_horz_16x8_r3;
    uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));
    int16x8_t chroma_mask_16x8_2 = vreinterpretq_s16_s32(vdupq_n_s32(0x0000ffff));

    WORD32 i4_nnz;
    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

    i4_iq_out_temp = pi2_dc_src[0];
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0 = vld1q_s16((int16_t *) pi2_pred);
    pred1 = vld1q_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 3));

    pred0 = vaddq_s16(pred0, temp_0);
    pred1 = vaddq_s16(pred1, temp_0);
    pred2 = vaddq_s16(pred2, temp_0);
    pred3 = vaddq_s16(pred3, temp_0);

    pred0 = vminq_s16(pred0, dup_max);
    pred0 = vmaxq_s16(pred0, dup_min);
    pred1 = vminq_s16(pred1, dup_max);
    pred1 = vmaxq_s16(pred1, dup_min);
    pred2 = vminq_s16(pred2, dup_max);
    pred2 = vmaxq_s16(pred2, dup_min);
    pred3 = vminq_s16(pred3, dup_max);
    pred3 = vmaxq_s16(pred3, dup_min);

    pred0 = vandq_s16(pred0, chroma_mask_16x8_2);
    pred1 = vandq_s16(pred1, chroma_mask_16x8_2);
    pred2 = vandq_s16(pred2, chroma_mask_16x8_2);
    pred3 = vandq_s16(pred3, chroma_mask_16x8_2);

    dup_val_1 = vabsq_s16(pred0);
    dup_val_2 = vabsq_s16(pred1);
    dup_val_3 = vabsq_s16(pred2);
    dup_val_4 = vabsq_s16(pred3);
    dup_val_5 = vqaddq_s16(dup_val_1, dup_val_2);
    dup_val_1 = vqaddq_s16(dup_val_3, dup_val_4);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_5);

    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    i4_out_horz_16x8_r0 = vld1q_s16(pi2_out);
    i4_out_horz_16x8_r1 = vld1q_s16(pi2_out + out_strd);
    i4_out_horz_16x8_r2 = vld1q_s16(pi2_out + (out_strd * 2));
    i4_out_horz_16x8_r3 = vld1q_s16(pi2_out + (out_strd * 3));

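    /* As in the full chroma path, merge the DC residual lanes into the
     * interleaved output with vbsl, leaving the other chroma component
     * untouched. */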
    i4_out_horz_16x8_r0 = vbslq_s16(chroma_mask_16x8, pred0, i4_out_horz_16x8_r0);
    i4_out_horz_16x8_r1 = vbslq_s16(chroma_mask_16x8, pred1, i4_out_horz_16x8_r1);
    i4_out_horz_16x8_r2 = vbslq_s16(chroma_mask_16x8, pred2, i4_out_horz_16x8_r2);
    i4_out_horz_16x8_r3 = vbslq_s16(chroma_mask_16x8, pred3, i4_out_horz_16x8_r3);

    vst1q_s16((int16_t *) (pi2_out), i4_out_horz_16x8_r0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), i4_out_horz_16x8_r1);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 2)), i4_out_horz_16x8_r2);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 3)), i4_out_horz_16x8_r3);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_8x8_neonintr                 */
/*                                                                           */
/*  Description   : this function computes the residual output from the      */
/*                  inverse quantization and inverse transform of an         */
/*                  8x8 block                                                */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_8x8_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out,
                                                 WORD32 pred_strd, WORD32 out_strd,
                                                 const UWORD16 *pu2_iscale_mat,
                                                 const UWORD16 *pu2_weigh_mat, UWORD32 qp_div,
                                                 WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                                 WORD16 *pi2_dc_ld_addr)
{
    int16x8_t iscal_16x8_0, iscal_16x8_1, iscal_16x8_2, iscal_16x8_3, iscal_16x8_4, iscal_16x8_5,
        iscal_16x8_6, iscal_16x8_7;

    int16x8_t weigh_16x8_0, weigh_16x8_1, weigh_16x8_2, weigh_16x8_3, weigh_16x8_4, weigh_16x8_5,
        weigh_16x8_6, weigh_16x8_7;

    int16x8_t src_16x8_0, src_16x8_1, src_16x8_2, src_16x8_3, src_16x8_4, src_16x8_5, src_16x8_6,
        src_16x8_7;
    int16x8_t coeff_mul_16x8_0, coeff_mul_16x8_1, coeff_mul_16x8_2, coeff_mul_16x8_3,
        coeff_mul_16x8_4, coeff_mul_16x8_5, coeff_mul_16x8_6, coeff_mul_16x8_7;

    int32x4_t quant_res_32x4_l_0, quant_res_32x4_l_1, quant_res_32x4_l_2, quant_res_32x4_l_3,
        quant_res_32x4_l_4, quant_res_32x4_l_5, quant_res_32x4_l_6, quant_res_32x4_l_7;
    int32x4_t quant_res_32x4_h_0, quant_res_32x4_h_1, quant_res_32x4_h_2, quant_res_32x4_h_3,
        quant_res_32x4_h_4, quant_res_32x4_h_5, quant_res_32x4_h_6, quant_res_32x4_h_7;
    int16x4_t quant_res_16x4_l_0, quant_res_16x4_l_1, quant_res_16x4_l_2, quant_res_16x4_l_3,
        quant_res_16x4_l_4, quant_res_16x4_l_5, quant_res_16x4_l_6, quant_res_16x4_l_7;
    int16x4_t quant_res_16x4_h_0, quant_res_16x4_h_1, quant_res_16x4_h_2, quant_res_16x4_h_3,
        quant_res_16x4_h_4, quant_res_16x4_h_5, quant_res_16x4_h_6, quant_res_16x4_h_7;

    int16x8_t quant_res_16x8_0, quant_res_16x8_1, quant_res_16x8_2, quant_res_16x8_3,
        quant_res_16x8_4, quant_res_16x8_5, quant_res_16x8_6, quant_res_16x8_7;

    int16x8_t trans_16x8_0, trans_16x8_1, trans_16x8_2, trans_16x8_3, trans_16x8_4, trans_16x8_5,
        trans_16x8_6, trans_16x8_7;
    int32x4_t trans_32x4_0, trans_32x4_1, trans_32x4_2, trans_32x4_3, trans_32x4_4, trans_32x4_5,
        trans_32x4_6, trans_32x4_7;
    int64x2_t trans_64x2_0, trans_64x2_1, trans_64x2_2, trans_64x2_3, trans_64x2_4, trans_64x2_5,
        trans_64x2_6, trans_64x2_7;
    int16x4_t trans_16x4_1_l, trans_16x4_3_l, trans_16x4_5_l, trans_16x4_7_l;
    int16x8_t rs_trans_16x8_1, rs_trans_16x8_2, rs_trans_16x8_3, rs_trans_16x8_5, rs_trans_16x8_6,
        rs_trans_16x8_7;
    int32x4_t sub_3_5_l, sub_3_5_h;
    int32x4_t add_3_5_l, add_3_5_h;
    int32x4_t sub_1_7_l, sub_1_7_h;
    int32x4_t add_1_7_l, add_1_7_h;
    int32x4_t sub_357_l, sub_357_h;
    int32x4_t add_351_l, add_351_h;
    int32x4_t add_175_l, add_175_h;
    int32x4_t sub_173_l, sub_173_h;
    int32x4_t y1_32x4_l, y1_32x4_h;
    int32x4_t y3_32x4_l, y3_32x4_h;
    int32x4_t y5_32x4_l, y5_32x4_h;
    int32x4_t y7_32x4_l, y7_32x4_h;
    int16x4_t y1_16x4_l, y3_16x4_l, y5_16x4_l, y7_16x4_l;

    int16x8_t y0_16x8, y1_16x8, y2_16x8, y3_16x8, y4_16x8, y5_16x8, y6_16x8, y7_16x8;
    int16x8_t rs_y1_16x8, rs_y3_16x8, rs_y5_16x8, rs_y7_16x8;
    int16x8_t z0_16x8, z1_16x8, z2_16x8, z3_16x8, z4_16x8, z5_16x8, z6_16x8, z7_16x8;
    int16x8_t pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;

    int64x2_t pred0_in_64x2, pred1_in_64x2, pred2_in_64x2, pred3_in_64x2, pred4_in_64x2,
        pred5_in_64x2, pred6_in_64x2, pred7_in_64x2;

    int32x4_t qp_div_6_32x4 = vdupq_n_s32(qp_div);

    int16x8_t pred_b0_r01_in;
    int16x8_t pred_b0_r23_in;
    int16x8_t pred_b1_r01_in;
    int16x8_t pred_b1_r23_in;
    int16x8_t pred_b2_r45_in;
    int16x8_t pred_b2_r67_in;
    int16x8_t pred_b3_r45_in;
    int16x8_t pred_b3_r67_in;
    int16x8_t dup_min, dup_max;

    int16x8x2_t trans_16x8_0_1, trans_16x8_2_3, trans_16x8_4_5, trans_16x8_6_7;
    int32x4x2_t trans_32x4_0_2, trans_32x4_1_3, trans_32x4_4_6, trans_32x4_5_7;
    int16x4_t y1_16x4_h, y3_16x4_h, y5_16x4_h, y7_16x4_h;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3, i;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    iscal_16x8_0 = vld1q_s16((const int16_t *) pu2_iscale_mat);
    iscal_16x8_1 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 8));
    iscal_16x8_2 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 16));
    iscal_16x8_3 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 24));
    iscal_16x8_4 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 32));
    iscal_16x8_5 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 40));
    iscal_16x8_6 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 48));
    iscal_16x8_7 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 56));

    weigh_16x8_0 = vld1q_s16((const int16_t *) pu2_weigh_mat);
    weigh_16x8_1 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 8));
    weigh_16x8_2 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 16));
    weigh_16x8_3 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 24));
    weigh_16x8_4 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 32));
    weigh_16x8_5 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 40));
    weigh_16x8_6 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 48));
    weigh_16x8_7 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 56));

    src_16x8_0 = vld1q_s16((const int16_t *) pi2_src);        // a0 a1 a2 a3 a4 a5 a6 a7
    src_16x8_1 = vld1q_s16((const int16_t *) (pi2_src + 8));  // b0 b1 b2 b3 b4 b5 b6 b7
    src_16x8_2 = vld1q_s16((const int16_t *) (pi2_src + 16));
    src_16x8_3 = vld1q_s16((const int16_t *) (pi2_src + 24));
    src_16x8_4 = vld1q_s16((const int16_t *) (pi2_src + 32));
    src_16x8_5 = vld1q_s16((const int16_t *) (pi2_src + 40));
    src_16x8_6 = vld1q_s16((const int16_t *) (pi2_src + 48));
    src_16x8_7 = vld1q_s16((const int16_t *) (pi2_src + 56));

    coeff_mul_16x8_0 = vmulq_s16(iscal_16x8_0, weigh_16x8_0);
    coeff_mul_16x8_1 = vmulq_s16(iscal_16x8_1, weigh_16x8_1);
    coeff_mul_16x8_2 = vmulq_s16(iscal_16x8_2, weigh_16x8_2);
    coeff_mul_16x8_3 = vmulq_s16(iscal_16x8_3, weigh_16x8_3);
    coeff_mul_16x8_4 = vmulq_s16(iscal_16x8_4, weigh_16x8_4);
    coeff_mul_16x8_5 = vmulq_s16(iscal_16x8_5, weigh_16x8_5);
    coeff_mul_16x8_6 = vmulq_s16(iscal_16x8_6, weigh_16x8_6);
    coeff_mul_16x8_7 = vmulq_s16(iscal_16x8_7, weigh_16x8_7);

    quant_res_32x4_l_0 = vmull_s16(vget_low_s16(coeff_mul_16x8_0), vget_low_s16(src_16x8_0));
    quant_res_32x4_l_1 = vmull_s16(vget_low_s16(coeff_mul_16x8_1), vget_low_s16(src_16x8_1));
    quant_res_32x4_l_2 = vmull_s16(vget_low_s16(coeff_mul_16x8_2), vget_low_s16(src_16x8_2));
    quant_res_32x4_l_3 = vmull_s16(vget_low_s16(coeff_mul_16x8_3), vget_low_s16(src_16x8_3));
    quant_res_32x4_l_4 = vmull_s16(vget_low_s16(coeff_mul_16x8_4), vget_low_s16(src_16x8_4));
    quant_res_32x4_l_5 = vmull_s16(vget_low_s16(coeff_mul_16x8_5), vget_low_s16(src_16x8_5));
    quant_res_32x4_l_6 = vmull_s16(vget_low_s16(coeff_mul_16x8_6), vget_low_s16(src_16x8_6));
    quant_res_32x4_l_7 = vmull_s16(vget_low_s16(coeff_mul_16x8_7), vget_low_s16(src_16x8_7));

    quant_res_32x4_h_0 = vmull_s16(vget_high_s16(coeff_mul_16x8_0), vget_high_s16(src_16x8_0));
    quant_res_32x4_h_1 = vmull_s16(vget_high_s16(coeff_mul_16x8_1), vget_high_s16(src_16x8_1));
    quant_res_32x4_h_2 = vmull_s16(vget_high_s16(coeff_mul_16x8_2), vget_high_s16(src_16x8_2));
    quant_res_32x4_h_3 = vmull_s16(vget_high_s16(coeff_mul_16x8_3), vget_high_s16(src_16x8_3));
    quant_res_32x4_h_4 = vmull_s16(vget_high_s16(coeff_mul_16x8_4), vget_high_s16(src_16x8_4));
    quant_res_32x4_h_5 = vmull_s16(vget_high_s16(coeff_mul_16x8_5), vget_high_s16(src_16x8_5));
    quant_res_32x4_h_6 = vmull_s16(vget_high_s16(coeff_mul_16x8_6), vget_high_s16(src_16x8_6));
    quant_res_32x4_h_7 = vmull_s16(vget_high_s16(coeff_mul_16x8_7), vget_high_s16(src_16x8_7));

    quant_res_32x4_l_0 = vshlq_s32(quant_res_32x4_l_0, qp_div_6_32x4);
    quant_res_32x4_l_1 = vshlq_s32(quant_res_32x4_l_1, qp_div_6_32x4);
    quant_res_32x4_l_2 = vshlq_s32(quant_res_32x4_l_2, qp_div_6_32x4);
    quant_res_32x4_l_3 = vshlq_s32(quant_res_32x4_l_3, qp_div_6_32x4);
    quant_res_32x4_l_4 = vshlq_s32(quant_res_32x4_l_4, qp_div_6_32x4);
    quant_res_32x4_l_5 = vshlq_s32(quant_res_32x4_l_5, qp_div_6_32x4);
    quant_res_32x4_l_6 = vshlq_s32(quant_res_32x4_l_6, qp_div_6_32x4);
    quant_res_32x4_l_7 = vshlq_s32(quant_res_32x4_l_7, qp_div_6_32x4);

    quant_res_32x4_h_0 = vshlq_s32(quant_res_32x4_h_0, qp_div_6_32x4);
    quant_res_32x4_h_1 = vshlq_s32(quant_res_32x4_h_1, qp_div_6_32x4);
    quant_res_32x4_h_2 = vshlq_s32(quant_res_32x4_h_2, qp_div_6_32x4);
    quant_res_32x4_h_3 = vshlq_s32(quant_res_32x4_h_3, qp_div_6_32x4);
    quant_res_32x4_h_4 = vshlq_s32(quant_res_32x4_h_4, qp_div_6_32x4);
    quant_res_32x4_h_5 = vshlq_s32(quant_res_32x4_h_5, qp_div_6_32x4);
    quant_res_32x4_h_6 = vshlq_s32(quant_res_32x4_h_6, qp_div_6_32x4);
    quant_res_32x4_h_7 = vshlq_s32(quant_res_32x4_h_7, qp_div_6_32x4);

    quant_res_16x4_l_0 = vqrshrn_n_s32(quant_res_32x4_l_0, 6);
    quant_res_16x4_l_1 = vqrshrn_n_s32(quant_res_32x4_l_1, 6);
    quant_res_16x4_l_2 = vqrshrn_n_s32(quant_res_32x4_l_2, 6);
    quant_res_16x4_l_3 = vqrshrn_n_s32(quant_res_32x4_l_3, 6);
    quant_res_16x4_l_4 = vqrshrn_n_s32(quant_res_32x4_l_4, 6);
    quant_res_16x4_l_5 = vqrshrn_n_s32(quant_res_32x4_l_5, 6);
    quant_res_16x4_l_6 = vqrshrn_n_s32(quant_res_32x4_l_6, 6);
    quant_res_16x4_l_7 = vqrshrn_n_s32(quant_res_32x4_l_7, 6);

    quant_res_16x4_h_0 = vqrshrn_n_s32(quant_res_32x4_h_0, 6);
    quant_res_16x4_h_1 = vqrshrn_n_s32(quant_res_32x4_h_1, 6);
    quant_res_16x4_h_2 = vqrshrn_n_s32(quant_res_32x4_h_2, 6);
    quant_res_16x4_h_3 = vqrshrn_n_s32(quant_res_32x4_h_3, 6);
    quant_res_16x4_h_4 = vqrshrn_n_s32(quant_res_32x4_h_4, 6);
    quant_res_16x4_h_5 = vqrshrn_n_s32(quant_res_32x4_h_5, 6);
    quant_res_16x4_h_6 = vqrshrn_n_s32(quant_res_32x4_h_6, 6);
    quant_res_16x4_h_7 = vqrshrn_n_s32(quant_res_32x4_h_7, 6);

    quant_res_16x8_0 = vcombine_s16(quant_res_16x4_l_0, quant_res_16x4_h_0);
    quant_res_16x8_1 = vcombine_s16(quant_res_16x4_l_1, quant_res_16x4_h_1);
    quant_res_16x8_2 = vcombine_s16(quant_res_16x4_l_2, quant_res_16x4_h_2);
    quant_res_16x8_3 = vcombine_s16(quant_res_16x4_l_3, quant_res_16x4_h_3);
    quant_res_16x8_4 = vcombine_s16(quant_res_16x4_l_4, quant_res_16x4_h_4);
    quant_res_16x8_5 = vcombine_s16(quant_res_16x4_l_5, quant_res_16x4_h_5);
    quant_res_16x8_6 = vcombine_s16(quant_res_16x4_l_6, quant_res_16x4_h_6);
    quant_res_16x8_7 = vcombine_s16(quant_res_16x4_l_7, quant_res_16x4_h_7);

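    /* Two passes of the 1-D 8x8 inverse transform: each iteration first
     * transposes the block (16-, 32- and 64-bit transposes), then applies the
     * butterfly, so rows are processed in pass 0 and columns in pass 1. */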
    for(i = 0; i < 2; i++)
    {
        trans_16x8_0_1 = vtrnq_s16(quant_res_16x8_0, quant_res_16x8_1);
        trans_16x8_0 = trans_16x8_0_1.val[0];
        trans_16x8_1 = trans_16x8_0_1.val[1];

        trans_16x8_2_3 = vtrnq_s16(quant_res_16x8_2, quant_res_16x8_3);
        trans_16x8_2 = trans_16x8_2_3.val[0];
        trans_16x8_3 = trans_16x8_2_3.val[1];

        trans_16x8_4_5 = vtrnq_s16(quant_res_16x8_4, quant_res_16x8_5);
        trans_16x8_4 = trans_16x8_4_5.val[0];
        trans_16x8_5 = trans_16x8_4_5.val[1];

        trans_16x8_6_7 = vtrnq_s16(quant_res_16x8_6, quant_res_16x8_7);
        trans_16x8_6 = trans_16x8_6_7.val[0];
        trans_16x8_7 = trans_16x8_6_7.val[1];

        trans_32x4_0_2 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_0), vreinterpretq_s32_s16(trans_16x8_2));
        trans_32x4_0 = trans_32x4_0_2.val[0];
        trans_32x4_2 = trans_32x4_0_2.val[1];

        trans_32x4_1_3 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_1), vreinterpretq_s32_s16(trans_16x8_3));
        trans_32x4_1 = trans_32x4_1_3.val[0];
        trans_32x4_3 = trans_32x4_1_3.val[1];

        trans_32x4_4_6 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_4), vreinterpretq_s32_s16(trans_16x8_6));
        trans_32x4_4 = trans_32x4_4_6.val[0];
        trans_32x4_6 = trans_32x4_4_6.val[1];

        trans_32x4_5_7 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_5), vreinterpretq_s32_s16(trans_16x8_7));
        trans_32x4_5 = trans_32x4_5_7.val[0];
        trans_32x4_7 = trans_32x4_5_7.val[1];

        trans_64x2_0 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_4)));
        trans_64x2_4 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_4)));

        trans_64x2_1 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_5)));
        trans_64x2_5 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_5)));

        trans_64x2_2 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_6)));
        trans_64x2_6 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_6)));

        trans_64x2_3 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_7)));
        trans_64x2_7 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_7)));

        trans_16x8_0 = vreinterpretq_s16_s64(trans_64x2_0);
        trans_16x8_1 = vreinterpretq_s16_s64(trans_64x2_1);
        trans_16x8_2 = vreinterpretq_s16_s64(trans_64x2_2);
        trans_16x8_3 = vreinterpretq_s16_s64(trans_64x2_3);
        trans_16x8_4 = vreinterpretq_s16_s64(trans_64x2_4);
        trans_16x8_5 = vreinterpretq_s16_s64(trans_64x2_5);
        trans_16x8_6 = vreinterpretq_s16_s64(trans_64x2_6);
        trans_16x8_7 = vreinterpretq_s16_s64(trans_64x2_7);

        rs_trans_16x8_1 = vshrq_n_s16(trans_16x8_1, 1);  //(pi2_tmp_ptr[1] >> 1)
        rs_trans_16x8_2 = vshrq_n_s16(trans_16x8_2, 1);  //(pi2_tmp_ptr[2] >> 1)
        rs_trans_16x8_3 = vshrq_n_s16(trans_16x8_3, 1);  //(pi2_tmp_ptr[3] >> 1)
        rs_trans_16x8_5 = vshrq_n_s16(trans_16x8_5, 1);  //(pi2_tmp_ptr[5] >> 1)
        rs_trans_16x8_6 = vshrq_n_s16(trans_16x8_6, 1);  //(pi2_tmp_ptr[6] >> 1)
        rs_trans_16x8_7 = vshrq_n_s16(trans_16x8_7, 1);  //(pi2_tmp_ptr[7] >> 1)

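        /* Even/odd decomposition of the 8x8 inverse transform: y0/y2/y4/y6
         * form the even part in 16 bits; the odd part y1/y3/y5/y7 is widened
         * to 32 bits below to avoid intermediate overflow. */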
        y0_16x8 = vaddq_s16(trans_16x8_0,
                            trans_16x8_4);     // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
        y2_16x8 = vsubq_s16(trans_16x8_0,
                            trans_16x8_4);     // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
        y4_16x8 = vsubq_s16(rs_trans_16x8_2,
                            trans_16x8_6);     // i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
        y6_16x8 = vaddq_s16(trans_16x8_2,
                            rs_trans_16x8_6);  // i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));

        trans_16x4_3_l = vget_low_s16(trans_16x8_3);
        trans_16x4_5_l = vget_low_s16(trans_16x8_5);

        //-w3 + w5
        sub_3_5_l = vsubl_s16(vget_low_s16(trans_16x8_5), vget_low_s16(trans_16x8_3));
        sub_3_5_h = vsubl_s16(vget_high_s16(trans_16x8_5), vget_high_s16(trans_16x8_3));

        // w3 + w5
        add_3_5_l = vaddl_s16(trans_16x4_3_l, trans_16x4_5_l);
        add_3_5_h = vaddl_s16(vget_high_s16(trans_16x8_3), vget_high_s16(trans_16x8_5));

        trans_16x4_1_l = vget_low_s16(trans_16x8_1);
        trans_16x4_7_l = vget_low_s16(trans_16x8_7);

        //-w1 + w7
        sub_1_7_l = vsubl_s16(trans_16x4_7_l, trans_16x4_1_l);
        sub_1_7_h = vsubl_s16(vget_high_s16(trans_16x8_7), vget_high_s16(trans_16x8_1));

        // w1 + w7
        add_1_7_l = vaddl_s16(trans_16x4_1_l, trans_16x4_7_l);
        add_1_7_h = vaddl_s16(vget_high_s16(trans_16x8_1), vget_high_s16(trans_16x8_7));

        //-w3 + w5 - w7
        sub_357_l = vsubw_s16(sub_3_5_l, trans_16x4_7_l);
        sub_357_h = vsubw_s16(sub_3_5_h, vget_high_s16(trans_16x8_7));

        // w3 + w5 + w1
        add_351_l = vaddw_s16(add_3_5_l, trans_16x4_1_l);
        add_351_h = vaddw_s16(add_3_5_h, vget_high_s16(trans_16x8_1));

        //-w1 + w7 + w5
        add_175_l = vaddw_s16(sub_1_7_l, trans_16x4_5_l);
        add_175_h = vaddw_s16(sub_1_7_h, vget_high_s16(trans_16x8_5));

        // w1 + w7 - w3
        sub_173_l = vsubw_s16(add_1_7_l, trans_16x4_3_l);
        sub_173_h = vsubw_s16(add_1_7_h, vget_high_s16(trans_16x8_3));

        //-w3 + w5 - w7 - (w7 >> 1)
        y1_32x4_l = vsubw_s16(sub_357_l, vget_low_s16(rs_trans_16x8_7));
        y1_32x4_h = vsubw_s16(sub_357_h, vget_high_s16(rs_trans_16x8_7));

        // w1 + w7 - w3 - (w3 >> 1)
        y3_32x4_l = vsubw_s16(sub_173_l, vget_low_s16(rs_trans_16x8_3));
        y3_32x4_h = vsubw_s16(sub_173_h, vget_high_s16(rs_trans_16x8_3));

        //-w1 + w7 + w5 + (w5 >> 1)
        y5_32x4_l = vaddw_s16(add_175_l, vget_low_s16(rs_trans_16x8_5));
        y5_32x4_h = vaddw_s16(add_175_h, vget_high_s16(rs_trans_16x8_5));

        // w3 + w5 + w1 + (w1 >> 1)
        y7_32x4_l = vaddw_s16(add_351_l, vget_low_s16(rs_trans_16x8_1));
        y7_32x4_h = vaddw_s16(add_351_h, vget_high_s16(rs_trans_16x8_1));

        y1_16x4_l = vmovn_s32(y1_32x4_l);
        y1_16x4_h = vmovn_s32(y1_32x4_h);
        y1_16x8 = vcombine_s16(y1_16x4_l, y1_16x4_h);
        y3_16x4_l = vmovn_s32(y3_32x4_l);
        y3_16x4_h = vmovn_s32(y3_32x4_h);
        y3_16x8 = vcombine_s16(y3_16x4_l, y3_16x4_h);
        y5_16x4_l = vmovn_s32(y5_32x4_l);
        y5_16x4_h = vmovn_s32(y5_32x4_h);
        y5_16x8 = vcombine_s16(y5_16x4_l, y5_16x4_h);
        y7_16x4_l = vmovn_s32(y7_32x4_l);
        y7_16x4_h = vmovn_s32(y7_32x4_h);
        y7_16x8 = vcombine_s16(y7_16x4_l, y7_16x4_h);

        rs_y1_16x8 = vshrq_n_s16(y1_16x8, 2);
        rs_y3_16x8 = vshrq_n_s16(y3_16x8, 2);
        rs_y5_16x8 = vshrq_n_s16(y5_16x8, 2);
        rs_y7_16x8 = vshrq_n_s16(y7_16x8, 2);

        z0_16x8 = vaddq_s16(y0_16x8, y6_16x8);           // z0 = y0 + y6
        z1_16x8 = vaddq_s16(y1_16x8, rs_y7_16x8);        // z1 = y1 + (y7 >> 2)
        z2_16x8 = vaddq_s16(y2_16x8, y4_16x8);           // z2 = y2 + y4
        z3_16x8 = vaddq_s16(y3_16x8, rs_y5_16x8);        // z3 = y3 + (y5 >> 2)
        z4_16x8 = vsubq_s16(y2_16x8, y4_16x8);           // z4 = y2 - y4
        z5_16x8 = vsubq_s16(rs_y3_16x8, y5_16x8);        // z5 = (y3 >> 2) - y5
        z6_16x8 = vsubq_s16(y0_16x8, y6_16x8);           // z6 = y0 - y6
        z7_16x8 = vsubq_s16(y7_16x8, rs_y1_16x8);        // z7 = y7 - (y1 >> 2)

        quant_res_16x8_0 = vaddq_s16(z0_16x8, z7_16x8);  // x0 = z0 + z7
        quant_res_16x8_1 = vaddq_s16(z2_16x8, z5_16x8);  // x1 = z2 + z5
        quant_res_16x8_2 = vaddq_s16(z4_16x8, z3_16x8);  // x2 = z4 + z3
        quant_res_16x8_3 = vaddq_s16(z6_16x8, z1_16x8);  // x3 = z6 + z1
        quant_res_16x8_4 = vsubq_s16(z6_16x8, z1_16x8);  // x4 = z6 - z1
        quant_res_16x8_5 = vsubq_s16(z4_16x8, z3_16x8);  // x5 = z4 - z3
        quant_res_16x8_6 = vsubq_s16(z2_16x8, z5_16x8);  // x6 = z2 - z5
        quant_res_16x8_7 = vsubq_s16(z0_16x8, z7_16x8);  // x7 = z0 - z7
    }

1033     quant_res_16x8_0 = vrshrq_n_s16(quant_res_16x8_0, 6);
1034     quant_res_16x8_1 = vrshrq_n_s16(quant_res_16x8_1, 6);
1035     quant_res_16x8_2 = vrshrq_n_s16(quant_res_16x8_2, 6);
1036     quant_res_16x8_3 = vrshrq_n_s16(quant_res_16x8_3, 6);
1037     quant_res_16x8_4 = vrshrq_n_s16(quant_res_16x8_4, 6);
1038     quant_res_16x8_5 = vrshrq_n_s16(quant_res_16x8_5, 6);
1039     quant_res_16x8_6 = vrshrq_n_s16(quant_res_16x8_6, 6);
1040     quant_res_16x8_7 = vrshrq_n_s16(quant_res_16x8_7, 6);
1041 
    pred0 = vld1q_s16((int16_t *) pi2_pred);
    pred1 = vld1q_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 3));
    pred4 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 4));
    pred5 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 5));
    pred6 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 6));
    pred7 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 7));

    quant_res_16x8_0 = vaddq_s16(pred0, quant_res_16x8_0);
    quant_res_16x8_1 = vaddq_s16(pred1, quant_res_16x8_1);
    quant_res_16x8_2 = vaddq_s16(pred2, quant_res_16x8_2);
    quant_res_16x8_3 = vaddq_s16(pred3, quant_res_16x8_3);
    quant_res_16x8_4 = vaddq_s16(pred4, quant_res_16x8_4);
    quant_res_16x8_5 = vaddq_s16(pred5, quant_res_16x8_5);
    quant_res_16x8_6 = vaddq_s16(pred6, quant_res_16x8_6);
    quant_res_16x8_7 = vaddq_s16(pred7, quant_res_16x8_7);

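    /* Clamp the reconstructed residual to [RSD_MIN, RSD_MAX] so it stays
     * within the representable residual range. */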
    quant_res_16x8_0 = vminq_s16(quant_res_16x8_0, dup_max);
    quant_res_16x8_0 = vmaxq_s16(quant_res_16x8_0, dup_min);
    quant_res_16x8_1 = vminq_s16(quant_res_16x8_1, dup_max);
    quant_res_16x8_1 = vmaxq_s16(quant_res_16x8_1, dup_min);
    quant_res_16x8_2 = vminq_s16(quant_res_16x8_2, dup_max);
    quant_res_16x8_2 = vmaxq_s16(quant_res_16x8_2, dup_min);
    quant_res_16x8_3 = vminq_s16(quant_res_16x8_3, dup_max);
    quant_res_16x8_3 = vmaxq_s16(quant_res_16x8_3, dup_min);
    quant_res_16x8_4 = vminq_s16(quant_res_16x8_4, dup_max);
    quant_res_16x8_4 = vmaxq_s16(quant_res_16x8_4, dup_min);
    quant_res_16x8_5 = vminq_s16(quant_res_16x8_5, dup_max);
    quant_res_16x8_5 = vmaxq_s16(quant_res_16x8_5, dup_min);
    quant_res_16x8_6 = vminq_s16(quant_res_16x8_6, dup_max);
    quant_res_16x8_6 = vmaxq_s16(quant_res_16x8_6, dup_min);
    quant_res_16x8_7 = vminq_s16(quant_res_16x8_7, dup_max);
    quant_res_16x8_7 = vmaxq_s16(quant_res_16x8_7, dup_min);

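    /* Regroup the eight result rows into the four 4x4 sub-blocks of the 8x8
     * block: each 64-bit half of a row holds one 4-sample run, so pairing the
     * low/high halves of adjacent rows yields b0 (top-left), b1 (top-right),
     * b2 (bottom-left) and b3 (bottom-right) for the nnz checks below. */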
    pred0_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_0);
    pred1_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_1);
    pred2_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_2);
    pred3_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_3);
    pred4_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_4);
    pred5_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_5);
    pred6_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_6);
    pred7_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_7);

    pred_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred0_in_64x2), vget_low_s64(pred1_in_64x2)));
    pred_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred2_in_64x2), vget_low_s64(pred3_in_64x2)));
    pred_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred0_in_64x2), vget_high_s64(pred1_in_64x2)));
    pred_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred2_in_64x2), vget_high_s64(pred3_in_64x2)));
    pred_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred4_in_64x2), vget_low_s64(pred5_in_64x2)));
    pred_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred6_in_64x2), vget_low_s64(pred7_in_64x2)));
    pred_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred4_in_64x2), vget_high_s64(pred5_in_64x2)));
    pred_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred6_in_64x2), vget_high_s64(pred7_in_64x2)));

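    /* Per-sub-block zero test: a saturating add of absolute values
     * (vqaddq_s16) cannot wrap around to zero, so the OR-reduction over the
     * eight lanes is nonzero iff the sub-block has a nonzero residual. */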
    dup_val_1 = vabsq_s16(pred_b0_r01_in);
    dup_val_2 = vabsq_s16(pred_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b1_r01_in);
    dup_val_2 = vabsq_s16(pred_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b2_r45_in);
    dup_val_2 = vabsq_s16(pred_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b3_r45_in);
    dup_val_2 = vabsq_s16(pred_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

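    /* Pack the four flags at the positions the 4x4 sub-blocks occupy in the
     * macroblock's 4x4 raster scan (0, 1, 4, 5 for this 8x8). E.g. a nonzero
     * top-right sub-block alone gives nnz = 2. */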
    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    vst1q_s16(pi2_out, quant_res_16x8_0);
    vst1q_s16(pi2_out + out_strd, quant_res_16x8_1);
    vst1q_s16(pi2_out + out_strd * 2, quant_res_16x8_2);
    vst1q_s16(pi2_out + out_strd * 3, quant_res_16x8_3);
    vst1q_s16(pi2_out + out_strd * 4, quant_res_16x8_4);
    vst1q_s16(pi2_out + out_strd * 5, quant_res_16x8_5);
    vst1q_s16(pi2_out + out_strd * 6, quant_res_16x8_6);
    vst1q_s16(pi2_out + out_strd * 7, quant_res_16x8_7);

    return nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_8x8_dc_neonintr             */
/*                                                                           */
/*  Description   : this function computes the residual output of a DC-only */
/*                  8x8 block from the IQ+IT                                 */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : pi2_out (reconstructed residual)                         */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

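/* For a DC-only 8x8 block, every inverse-transform output equals
 * (dc + 32) >> 6, so the function below reduces IQ+IT to one INV_QUANT of
 * pi2_src[0] followed by a broadcast add. Scalar sketch of the same
 * computation (illustrative only, not part of the build):
 *
 *     dc = pi2_src[0];
 *     INV_QUANT(dc, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
 *     res = (dc + 32) >> 6;
 *     for(i = 0; i < 8; i++)
 *         for(j = 0; j < 8; j++)
 *             pi2_out[i * out_strd + j] =
 *                 CLIP3(RSD_MIN, RSD_MAX, pi2_pred[i * pred_strd + j] + res);
 */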
WORD32 isvcd_iquant_itrans_residual_8x8_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred,
                                                    WORD16 *pi2_out, WORD32 pred_strd,
                                                    WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
                                                    const UWORD16 *pu2_weigh_mat, UWORD32 qp_div,
                                                    WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                                    WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0, dup_min, dup_max;
    int16x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred4_in, pred5_in, pred6_in, pred7_in;
    WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;

    int64x2_t pred0_in_64x2, pred1_in_64x2, pred2_in_64x2, pred3_in_64x2, pred4_in_64x2,
        pred5_in_64x2, pred6_in_64x2, pred7_in_64x2;

    int16x8_t pred_b0_r01_in;
    int16x8_t pred_b0_r23_in;
    int16x8_t pred_b1_r01_in;
    int16x8_t pred_b1_r23_in;
    int16x8_t pred_b2_r45_in;
    int16x8_t pred_b2_r67_in;
    int16x8_t pred_b3_r45_in;
    int16x8_t pred_b3_r67_in;

    int16x8_t dup_val_1, dup_val_2, dup_abs;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    i4_iq_out_temp = pi2_src[0];
    INV_QUANT(i4_iq_out_temp, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
    /* inv transform and residual comp */
    pred0_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred1_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred2_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred3_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred4_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred5_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred6_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred7_in = vld1q_s16((int16_t *) pi2_pred);

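    /* Broadcast the single inverse-transformed DC value, (dc + 32) >> 6, to
     * all lanes; it is the residual contribution of every sample. */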
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0_in = vaddq_s16(pred0_in, temp_0);
    pred1_in = vaddq_s16(pred1_in, temp_0);
    pred2_in = vaddq_s16(pred2_in, temp_0);
    pred3_in = vaddq_s16(pred3_in, temp_0);
    pred4_in = vaddq_s16(pred4_in, temp_0);
    pred5_in = vaddq_s16(pred5_in, temp_0);
    pred6_in = vaddq_s16(pred6_in, temp_0);
    pred7_in = vaddq_s16(pred7_in, temp_0);

    pred0_in = vminq_s16(pred0_in, dup_max);
    pred0_in = vmaxq_s16(pred0_in, dup_min);
    pred1_in = vminq_s16(pred1_in, dup_max);
    pred1_in = vmaxq_s16(pred1_in, dup_min);
    pred2_in = vminq_s16(pred2_in, dup_max);
    pred2_in = vmaxq_s16(pred2_in, dup_min);
    pred3_in = vminq_s16(pred3_in, dup_max);
    pred3_in = vmaxq_s16(pred3_in, dup_min);
    pred4_in = vminq_s16(pred4_in, dup_max);
    pred4_in = vmaxq_s16(pred4_in, dup_min);
    pred5_in = vminq_s16(pred5_in, dup_max);
    pred5_in = vmaxq_s16(pred5_in, dup_min);
    pred6_in = vminq_s16(pred6_in, dup_max);
    pred6_in = vmaxq_s16(pred6_in, dup_min);
    pred7_in = vminq_s16(pred7_in, dup_max);
    pred7_in = vmaxq_s16(pred7_in, dup_min);

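    /* Regroup the rows into the four 4x4 sub-blocks and run the same
     * saturating-abs zero test as in
     * isvcd_iquant_itrans_residual_8x8_neonintr() above. */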
    pred0_in_64x2 = vreinterpretq_s64_s16(pred0_in);
    pred1_in_64x2 = vreinterpretq_s64_s16(pred1_in);
    pred2_in_64x2 = vreinterpretq_s64_s16(pred2_in);
    pred3_in_64x2 = vreinterpretq_s64_s16(pred3_in);
    pred4_in_64x2 = vreinterpretq_s64_s16(pred4_in);
    pred5_in_64x2 = vreinterpretq_s64_s16(pred5_in);
    pred6_in_64x2 = vreinterpretq_s64_s16(pred6_in);
    pred7_in_64x2 = vreinterpretq_s64_s16(pred7_in);

    pred_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred0_in_64x2), vget_low_s64(pred1_in_64x2)));
    pred_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred2_in_64x2), vget_low_s64(pred3_in_64x2)));
    pred_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred0_in_64x2), vget_high_s64(pred1_in_64x2)));
    pred_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred2_in_64x2), vget_high_s64(pred3_in_64x2)));
    pred_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred4_in_64x2), vget_low_s64(pred5_in_64x2)));
    pred_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred6_in_64x2), vget_low_s64(pred7_in_64x2)));
    pred_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred4_in_64x2), vget_high_s64(pred5_in_64x2)));
    pred_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred6_in_64x2), vget_high_s64(pred7_in_64x2)));

    dup_val_1 = vabsq_s16(pred_b0_r01_in);
    dup_val_2 = vabsq_s16(pred_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b1_r01_in);
    dup_val_2 = vabsq_s16(pred_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b2_r45_in);
    dup_val_2 = vabsq_s16(pred_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b3_r45_in);
    dup_val_2 = vabsq_s16(pred_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

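    /* Same nnz bit layout as the full 8x8 path: sub-block flags at bits
     * 0, 1, 4 and 5. */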
    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    vst1q_s16((int16_t *) (pi2_out), pred0_in);
    vst1q_s16((int16_t *) (pi2_out + out_strd), pred1_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 2)), pred2_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 3)), pred3_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 4)), pred4_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 5)), pred5_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 6)), pred6_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 7)), pred7_in);

    return nnz;
}
