/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_iquant_itrans_residual_recon_neon.c
 *
 * @brief
 *  Contains definitions of functions for H.264 inverse quantization, inverse
 *  transformation and reconstruction
 *
 * @author
 *  Kishore
 *
 *  @par List of Functions:
 *  - isvcd_iquant_itrans_residual_recon_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_8x8_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_4x4_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_8x8_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_chroma_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_chroma_4x4_dc_neonintr()
 *
 * @remarks
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_iquant_itrans_residual_recon.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_4x4_dc_neonintr       */
/*                                                                           */
/*  Description   : this function computes the recon output from the         */
/*                  IQ+IT+RESD                                               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : recon output in pu1_out                                  */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_recon_4x4_dc_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t dup_val_1, dup_val_2, dup_abs;
    int16x8_t resd01, resd23, dup_max, dup_min;
    WORD32 i4_nnz;

    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    UNUSED(pi2_tmp);
    if(iq_start_idx == 0)
    {
        i4_iq_out_temp = pi2_src[0];

        INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        i4_iq_out_temp = pi2_dc_ld_addr[0];
    }

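    /*
     * DC-only path: with every AC coefficient zero, the 4x4 inverse transform
     * collapses to a single value, (dc + 32) >> 6, added to all 16 samples.
     * Roughly equivalent scalar form (illustrative sketch only):
     *
     *     val = (i4_iq_out_temp + 32) >> 6;
     *     for each (x, y): rsd = CLIP(pi2_rsd[y * rsd_strd + x] + val);
     *                      out = saturate_u8(pred + rsd);
     */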
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred1_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred2_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred3_in = vld1_u8((uint8_t *) pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    resd0_in = vaddq_s16(resd0_in, temp_0);
    resd1_in = vaddq_s16(resd1_in, temp_0);
    resd2_in = vaddq_s16(resd2_in, temp_0);
    resd3_in = vaddq_s16(resd3_in, temp_0);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);

    resd01 = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd0_in)),
                                                vget_low_s64(vreinterpretq_s64_s16(resd1_in))));

    resd23 = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd2_in)),
                                                vget_low_s64(vreinterpretq_s64_s16(resd3_in))));

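    /*
     * nnz detection: resd01/resd23 pack the four 4-sample rows of the block;
     * a saturating add of their absolute values leaves a nonzero lane wherever
     * some clipped residual is nonzero, so i4_nnz is simply the logical OR of
     * the eight lanes.
     */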
    dup_val_1 = vabsq_s16(resd01);
    dup_val_2 = vabsq_s16(resd23);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd), vreinterpret_u32_u8(pred1_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 2), vreinterpret_u32_u8(pred2_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 3), vreinterpret_u32_u8(pred3_in), 0);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_chroma_4x4_dc_neonintr*/
/*                                                                           */
/*  Description   : this function computes the recon output for the chroma   */
/*                  from IQ+IT+RESD                                          */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : recon output in pu1_out                                  */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_residual_recon_chroma_4x4_dc_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in, dup_min, dup_max;

    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
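    /*
     * The chroma planes are stored interleaved (Cb/Cr byte pairs), so each
     * 8-byte output row holds 4 samples of this plane in its even bytes. The
     * 0x00ff mask used with vbsl_u8 below writes only those bytes and leaves
     * the other plane's samples untouched.
     */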

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

    i4_iq_out_temp = pi2_dc_src[0];
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred1_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred2_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred3_in = vld1_u8((uint8_t *) pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    resd0_in = vaddq_s16(resd0_in, temp_0);
    resd1_in = vaddq_s16(resd1_in, temp_0);
    resd2_in = vaddq_s16(resd2_in, temp_0);
    resd3_in = vaddq_s16(resd3_in, temp_0);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);

    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_4x4_neonintr          */
/*                                                                           */
/*  Description   : this function computes the recon output from the         */
/*                  IQ+IT+RESD                                               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : recon output in pu1_out                                  */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_recon_4x4_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd01_in, resd23_in, dup_min, dup_max;
    int16x8_t pred01_in, pred23_in;
    uint8x8_t pred01_un, pred23_un;
    WORD32 i4_nnz;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

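    /*
     * vld4_s16 below de-interleaves the 16 coefficients with a stride of 4,
     * so val[k] holds coefficients k, k+4, k+8, k+12, i.e. column k of the
     * row-major 4x4 block; the scaling matrices are loaded the same way.
     */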
    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

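    /*
     * Inverse quantization, a vector form of the scalar INV_QUANT step used
     * in the DC-only path above:
     *     q = sat16((((coef * iscal * weigh) + rnd_factor) << qp_div_6) >> 4)
     */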
    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    if(iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_ld_addr[0], q0_16x4, 0);
    }

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);      // q1 >>1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);      // q3 >>1

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 =  q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 =  q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 =  q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0-x3

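    /*
     * Transpose the 4x4 block with vtrn_s16/vtrn_s32 so the second butterfly
     * pass below operates on what were columns (the vertical transform).
     */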
    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);       // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 =  q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 =  q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 =  q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    resd0_in = vld1_s16((int16_t *) pi2_rsd);
    resd1_in = vld1_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    resd0_in = vadd_s16(resd0_in, x0_16x4);
    resd1_in = vadd_s16(resd1_in, x1_16x4);
    resd2_in = vadd_s16(resd2_in, x2_16x4);
    resd3_in = vadd_s16(resd3_in, x3_16x4);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd01_in = vcombine_s16(resd0_in, resd1_in);
    resd23_in = vcombine_s16(resd2_in, resd3_in);

    resd01_in = vminq_s16(resd01_in, dup_max);
    resd01_in = vmaxq_s16(resd01_in, dup_min);
    resd23_in = vminq_s16(resd23_in, dup_max);
    resd23_in = vmaxq_s16(resd23_in, dup_min);

    dup_val_1 = vabsq_s16(resd01_in);
    dup_val_2 = vabsq_s16(resd23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    pred01_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(pred0)),
                                                   vget_low_s64(vreinterpretq_s64_s16(pred1))));

    pred23_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(pred2)),
                                                   vget_low_s64(vreinterpretq_s64_s16(pred3))));

    pred01_in = vaddq_s16(pred01_in, resd01_in);
    pred23_in = vaddq_s16(pred23_in, resd23_in);

    pred01_un = vqmovun_s16(pred01_in);
    pred23_un = vqmovun_s16(pred23_in);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd), vreinterpret_u32_u8(pred01_un), 1);
    vst1_lane_u32((uint32_t *) (pu1_out + (out_strd << 1)), vreinterpret_u32_u8(pred23_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + ((out_strd << 1) + out_strd)),
                  vreinterpret_u32_u8(pred23_un), 1);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_chroma_4x4_neonintr   */
/*                                                                           */
/*  Description   : this function computes the recon output for the chroma   */
/*                  from IQ+IT+RESD                                          */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : recon output in pu1_out                                  */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_residual_recon_chroma_4x4_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t rec0, rec1, rec2, rec3;
    uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in, dup_min, dup_max;
    uint8x8_t out0, out1, out2, out3;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2;
    int32x2x2_t x0_32x2_2, x1_32x2_2;

    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

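    /*
     * The chroma DC term is handled separately (in H.264, chroma DC goes
     * through its own 2x2 transform) and arrives already dequantized via
     * pi2_dc_src, so lane 0 is overwritten with that value.
     */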
    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);      // q1 >>1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);      // q3 >>1

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 =  q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 =  q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 =  q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0-x3

    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);       // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 =  q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 =  q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 =  q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

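    /*
     * Widen each 4-sample row to 8 lanes: vmovl_s16 sign-extends to 32 bits
     * and the s16 reinterpretation leaves the transform outputs in the even
     * lanes, matching the interleaved CbCr residual rows loaded above. The
     * odd lanes carry only sign-extension bits and are masked off at the
     * store by chroma_mask_8x8.
     */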
    x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
    x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
    x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
    x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));

    resd0_in = vaddq_s16(resd0_in, x0_16x8);
    resd1_in = vaddq_s16(resd1_in, x1_16x8);
    resd2_in = vaddq_s16(resd2_in, x2_16x8);
    resd3_in = vaddq_s16(resd3_in, x3_16x8);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    rec0 = vaddq_s16(pred0, resd0_in);
    rec1 = vaddq_s16(pred1, resd1_in);
    rec2 = vaddq_s16(pred2, resd2_in);
    rec3 = vaddq_s16(pred3, resd3_in);

    out0 = vld1_u8(pu1_out);
    out1 = vld1_u8(pu1_out + out_strd);
    out2 = vld1_u8(pu1_out + out_strd * 2);
    out3 = vld1_u8(pu1_out + out_strd * 3);

    rec0_un = vqmovun_s16(rec0);
    rec1_un = vqmovun_s16(rec1);
    rec2_un = vqmovun_s16(rec2);
    rec3_un = vqmovun_s16(rec3);

    out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
    out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
    out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
    out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);

    vst1_u8((pu1_out), out0);
    vst1_u8((pu1_out + out_strd), out1);
    vst1_u8((pu1_out + (out_strd << 1)), out2);
    vst1_u8((pu1_out + ((out_strd << 1) + out_strd)), out3);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_8x8_neonintr          */
/*                                                                           */
/*  Description   : this function computes the recon output from the         */
/*                  IQ+IT+RESD                                               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : recon output in pu1_out                                  */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_recon_8x8_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd_ptr, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    int32x4_t qp_div_32x4 = vdupq_n_s32(qp_div);
    int16x8_t iscal_16x8_0, iscal_16x8_1, iscal_16x8_2, iscal_16x8_3, iscal_16x8_4, iscal_16x8_5,
        iscal_16x8_6, iscal_16x8_7;
    int16x8_t weigh_16x8_0, weigh_16x8_1, weigh_16x8_2, weigh_16x8_3, weigh_16x8_4, weigh_16x8_5,
        weigh_16x8_6, weigh_16x8_7;

    int16x8_t src_16x8_0, src_16x8_1, src_16x8_2, src_16x8_3, src_16x8_4, src_16x8_5, src_16x8_6,
        src_16x8_7;
    int16x8_t coeff_mul_16x8_0, coeff_mul_16x8_1, coeff_mul_16x8_2, coeff_mul_16x8_3,
        coeff_mul_16x8_4, coeff_mul_16x8_5, coeff_mul_16x8_6, coeff_mul_16x8_7;

    int32x4_t quant_res_32x4_l_0, quant_res_32x4_l_1, quant_res_32x4_l_2, quant_res_32x4_l_3,
        quant_res_32x4_l_4, quant_res_32x4_l_5, quant_res_32x4_l_6, quant_res_32x4_l_7;
    int32x4_t quant_res_32x4_h_0, quant_res_32x4_h_1, quant_res_32x4_h_2, quant_res_32x4_h_3,
        quant_res_32x4_h_4, quant_res_32x4_h_5, quant_res_32x4_h_6, quant_res_32x4_h_7;
    int16x4_t quant_res_16x4_l_0, quant_res_16x4_l_1, quant_res_16x4_l_2, quant_res_16x4_l_3,
        quant_res_16x4_l_4, quant_res_16x4_l_5, quant_res_16x4_l_6, quant_res_16x4_l_7;
    int16x4_t quant_res_16x4_h_0, quant_res_16x4_h_1, quant_res_16x4_h_2, quant_res_16x4_h_3,
        quant_res_16x4_h_4, quant_res_16x4_h_5, quant_res_16x4_h_6, quant_res_16x4_h_7;

    int16x8_t quant_res_16x8_0, quant_res_16x8_1, quant_res_16x8_2, quant_res_16x8_3,
        quant_res_16x8_4, quant_res_16x8_5, quant_res_16x8_6, quant_res_16x8_7;

    int16x8_t trans_16x8_0, trans_16x8_1, trans_16x8_2, trans_16x8_3, trans_16x8_4, trans_16x8_5,
        trans_16x8_6, trans_16x8_7;
    int32x4_t trans_32x4_0, trans_32x4_1, trans_32x4_2, trans_32x4_3, trans_32x4_4, trans_32x4_5,
        trans_32x4_6, trans_32x4_7;
    int64x2_t trans_64x2_0, trans_64x2_1, trans_64x2_2, trans_64x2_3, trans_64x2_4, trans_64x2_5,
        trans_64x2_6, trans_64x2_7;
    int16x4_t trans_16x4_1_l, trans_16x4_3_l, trans_16x4_5_l, trans_16x4_7_l;
    int16x8_t rs_trans_16x8_1, rs_trans_16x8_2, rs_trans_16x8_3, rs_trans_16x8_5, rs_trans_16x8_6,
        rs_trans_16x8_7;
    int32x4_t sub_3_5_l, sub_3_5_h;
    int32x4_t add_3_5_l, add_3_5_h;
    int32x4_t sub_1_7_l, sub_1_7_h;
    int32x4_t add_1_7_l, add_1_7_h;
    int32x4_t sub_357_l, sub_357_h;
    int32x4_t add_351_l, add_351_h;
    int32x4_t add_175_l, add_175_h;
    int32x4_t sub_173_l, sub_173_h;
    int32x4_t y1_32x4_l, y1_32x4_h;
    int32x4_t y3_32x4_l, y3_32x4_h;
    int32x4_t y5_32x4_l, y5_32x4_h;
    int32x4_t y7_32x4_l, y7_32x4_h;
    int16x4_t y1_16x4_l, y3_16x4_l, y5_16x4_l, y7_16x4_l;
    int16x4_t y1_16x4_h, y3_16x4_h, y5_16x4_h, y7_16x4_h;

    int16x8_t y0_16x8, y1_16x8, y2_16x8, y3_16x8, y4_16x8, y5_16x8, y6_16x8, y7_16x8;
    int16x8_t rs_y1_16x8, rs_y3_16x8, rs_y5_16x8, rs_y7_16x8;
    int16x8_t z0_16x8, z1_16x8, z2_16x8, z3_16x8, z4_16x8, z5_16x8, z6_16x8, z7_16x8;

    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in, resd4_in, resd5_in, resd6_in, resd7_in;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in, pred4_in, pred5_in, pred6_in, pred7_in;
    int16x8_t pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
    int16x8_t rec0, rec1, rec2, rec3, rec4, rec5, rec6, rec7;
    uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un, rec4_un, rec5_un, rec6_un, rec7_un;

    int64x2_t resd0_64x2, resd1_64x2, resd2_64x2, resd3_64x2, resd4_64x2, resd5_64x2, resd6_64x2,
        resd7_64x2;
    int16x8x2_t trans_16x8_0_1, trans_16x8_2_3, trans_16x8_4_5, trans_16x8_6_7;
    int32x4x2_t trans_32x4_0_2, trans_32x4_1_3, trans_32x4_4_6, trans_32x4_5_7;

    int16x8_t resd_b0_r01;
    int16x8_t resd_b0_r23;
    int16x8_t resd_b1_r01;
    int16x8_t resd_b1_r23;
    int16x8_t resd_b2_r45;
    int16x8_t resd_b2_r67;
    int16x8_t resd_b3_r45;
    int16x8_t resd_b3_r67;

    int16x8_t dup_min, dup_max;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
    WORD32 i;
    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    iscal_16x8_0 = vld1q_s16((const int16_t *) pu2_iscale_mat);
    iscal_16x8_1 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 8));
    iscal_16x8_2 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 16));
    iscal_16x8_3 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 24));
    iscal_16x8_4 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 32));
    iscal_16x8_5 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 40));
    iscal_16x8_6 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 48));
    iscal_16x8_7 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 56));

    weigh_16x8_0 = vld1q_s16((const int16_t *) pu2_weigh_mat);
    weigh_16x8_1 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 8));
    weigh_16x8_2 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 16));
    weigh_16x8_3 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 24));
    weigh_16x8_4 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 32));
    weigh_16x8_5 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 40));
    weigh_16x8_6 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 48));
    weigh_16x8_7 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 56));

    src_16x8_0 = vld1q_s16((const int16_t *) pi2_src);        // a0 a1 a2 a3 a4 a5 a6 a7
    src_16x8_1 = vld1q_s16((const int16_t *) (pi2_src + 8));  // b0 b1 b2 b3 b4 b5 b6 b7
    src_16x8_2 = vld1q_s16((const int16_t *) (pi2_src + 16));
    src_16x8_3 = vld1q_s16((const int16_t *) (pi2_src + 24));
    src_16x8_4 = vld1q_s16((const int16_t *) (pi2_src + 32));
    src_16x8_5 = vld1q_s16((const int16_t *) (pi2_src + 40));
    src_16x8_6 = vld1q_s16((const int16_t *) (pi2_src + 48));
    src_16x8_7 = vld1q_s16((const int16_t *) (pi2_src + 56));

    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    coeff_mul_16x8_0 = vmulq_s16(iscal_16x8_0, weigh_16x8_0);
    coeff_mul_16x8_1 = vmulq_s16(iscal_16x8_1, weigh_16x8_1);
    coeff_mul_16x8_2 = vmulq_s16(iscal_16x8_2, weigh_16x8_2);
    coeff_mul_16x8_3 = vmulq_s16(iscal_16x8_3, weigh_16x8_3);
    coeff_mul_16x8_4 = vmulq_s16(iscal_16x8_4, weigh_16x8_4);
    coeff_mul_16x8_5 = vmulq_s16(iscal_16x8_5, weigh_16x8_5);
    coeff_mul_16x8_6 = vmulq_s16(iscal_16x8_6, weigh_16x8_6);
    coeff_mul_16x8_7 = vmulq_s16(iscal_16x8_7, weigh_16x8_7);

    quant_res_32x4_l_0 = vmull_s16(vget_low_s16(coeff_mul_16x8_0), vget_low_s16(src_16x8_0));
    quant_res_32x4_l_1 = vmull_s16(vget_low_s16(coeff_mul_16x8_1), vget_low_s16(src_16x8_1));
    quant_res_32x4_l_2 = vmull_s16(vget_low_s16(coeff_mul_16x8_2), vget_low_s16(src_16x8_2));
    quant_res_32x4_l_3 = vmull_s16(vget_low_s16(coeff_mul_16x8_3), vget_low_s16(src_16x8_3));
    quant_res_32x4_l_4 = vmull_s16(vget_low_s16(coeff_mul_16x8_4), vget_low_s16(src_16x8_4));
    quant_res_32x4_l_5 = vmull_s16(vget_low_s16(coeff_mul_16x8_5), vget_low_s16(src_16x8_5));
    quant_res_32x4_l_6 = vmull_s16(vget_low_s16(coeff_mul_16x8_6), vget_low_s16(src_16x8_6));
    quant_res_32x4_l_7 = vmull_s16(vget_low_s16(coeff_mul_16x8_7), vget_low_s16(src_16x8_7));

    quant_res_32x4_h_0 = vmull_s16(vget_high_s16(coeff_mul_16x8_0), vget_high_s16(src_16x8_0));
    quant_res_32x4_h_1 = vmull_s16(vget_high_s16(coeff_mul_16x8_1), vget_high_s16(src_16x8_1));
    quant_res_32x4_h_2 = vmull_s16(vget_high_s16(coeff_mul_16x8_2), vget_high_s16(src_16x8_2));
    quant_res_32x4_h_3 = vmull_s16(vget_high_s16(coeff_mul_16x8_3), vget_high_s16(src_16x8_3));
    quant_res_32x4_h_4 = vmull_s16(vget_high_s16(coeff_mul_16x8_4), vget_high_s16(src_16x8_4));
    quant_res_32x4_h_5 = vmull_s16(vget_high_s16(coeff_mul_16x8_5), vget_high_s16(src_16x8_5));
    quant_res_32x4_h_6 = vmull_s16(vget_high_s16(coeff_mul_16x8_6), vget_high_s16(src_16x8_6));
    quant_res_32x4_h_7 = vmull_s16(vget_high_s16(coeff_mul_16x8_7), vget_high_s16(src_16x8_7));

    quant_res_32x4_l_0 = vshlq_s32(quant_res_32x4_l_0, qp_div_32x4);
    quant_res_32x4_l_1 = vshlq_s32(quant_res_32x4_l_1, qp_div_32x4);
    quant_res_32x4_l_2 = vshlq_s32(quant_res_32x4_l_2, qp_div_32x4);
    quant_res_32x4_l_3 = vshlq_s32(quant_res_32x4_l_3, qp_div_32x4);
    quant_res_32x4_l_4 = vshlq_s32(quant_res_32x4_l_4, qp_div_32x4);
    quant_res_32x4_l_5 = vshlq_s32(quant_res_32x4_l_5, qp_div_32x4);
    quant_res_32x4_l_6 = vshlq_s32(quant_res_32x4_l_6, qp_div_32x4);
    quant_res_32x4_l_7 = vshlq_s32(quant_res_32x4_l_7, qp_div_32x4);

    quant_res_32x4_h_0 = vshlq_s32(quant_res_32x4_h_0, qp_div_32x4);
    quant_res_32x4_h_1 = vshlq_s32(quant_res_32x4_h_1, qp_div_32x4);
    quant_res_32x4_h_2 = vshlq_s32(quant_res_32x4_h_2, qp_div_32x4);
    quant_res_32x4_h_3 = vshlq_s32(quant_res_32x4_h_3, qp_div_32x4);
    quant_res_32x4_h_4 = vshlq_s32(quant_res_32x4_h_4, qp_div_32x4);
    quant_res_32x4_h_5 = vshlq_s32(quant_res_32x4_h_5, qp_div_32x4);
    quant_res_32x4_h_6 = vshlq_s32(quant_res_32x4_h_6, qp_div_32x4);
    quant_res_32x4_h_7 = vshlq_s32(quant_res_32x4_h_7, qp_div_32x4);

    quant_res_16x4_l_0 = vqrshrn_n_s32(quant_res_32x4_l_0, 6);
    quant_res_16x4_l_1 = vqrshrn_n_s32(quant_res_32x4_l_1, 6);
    quant_res_16x4_l_2 = vqrshrn_n_s32(quant_res_32x4_l_2, 6);
    quant_res_16x4_l_3 = vqrshrn_n_s32(quant_res_32x4_l_3, 6);
    quant_res_16x4_l_4 = vqrshrn_n_s32(quant_res_32x4_l_4, 6);
    quant_res_16x4_l_5 = vqrshrn_n_s32(quant_res_32x4_l_5, 6);
    quant_res_16x4_l_6 = vqrshrn_n_s32(quant_res_32x4_l_6, 6);
    quant_res_16x4_l_7 = vqrshrn_n_s32(quant_res_32x4_l_7, 6);

    quant_res_16x4_h_0 = vqrshrn_n_s32(quant_res_32x4_h_0, 6);
    quant_res_16x4_h_1 = vqrshrn_n_s32(quant_res_32x4_h_1, 6);
    quant_res_16x4_h_2 = vqrshrn_n_s32(quant_res_32x4_h_2, 6);
    quant_res_16x4_h_3 = vqrshrn_n_s32(quant_res_32x4_h_3, 6);
    quant_res_16x4_h_4 = vqrshrn_n_s32(quant_res_32x4_h_4, 6);
    quant_res_16x4_h_5 = vqrshrn_n_s32(quant_res_32x4_h_5, 6);
    quant_res_16x4_h_6 = vqrshrn_n_s32(quant_res_32x4_h_6, 6);
    quant_res_16x4_h_7 = vqrshrn_n_s32(quant_res_32x4_h_7, 6);

    quant_res_16x8_0 = vcombine_s16(quant_res_16x4_l_0, quant_res_16x4_h_0);
    quant_res_16x8_1 = vcombine_s16(quant_res_16x4_l_1, quant_res_16x4_h_1);
    quant_res_16x8_2 = vcombine_s16(quant_res_16x4_l_2, quant_res_16x4_h_2);
    quant_res_16x8_3 = vcombine_s16(quant_res_16x4_l_3, quant_res_16x4_h_3);
    quant_res_16x8_4 = vcombine_s16(quant_res_16x4_l_4, quant_res_16x4_h_4);
    quant_res_16x8_5 = vcombine_s16(quant_res_16x4_l_5, quant_res_16x4_h_5);
    quant_res_16x8_6 = vcombine_s16(quant_res_16x4_l_6, quant_res_16x4_h_6);
    quant_res_16x8_7 = vcombine_s16(quant_res_16x4_l_7, quant_res_16x4_h_7);

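    /*
     * Two 1-D passes make up the 2-D inverse transform: each iteration first
     * transposes the 8x8 block (vtrnq_s16 / vtrnq_s32 / vcombine_s64) and
     * then applies the 1-D 8x8 inverse-transform butterfly, so after the two
     * iterations both rows and columns have been transformed.
     */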
    for(i = 0; i < 2; i++)
    {
        trans_16x8_0_1 = vtrnq_s16(quant_res_16x8_0, quant_res_16x8_1);
        trans_16x8_0 = trans_16x8_0_1.val[0];
        trans_16x8_1 = trans_16x8_0_1.val[1];

        trans_16x8_2_3 = vtrnq_s16(quant_res_16x8_2, quant_res_16x8_3);
        trans_16x8_2 = trans_16x8_2_3.val[0];
        trans_16x8_3 = trans_16x8_2_3.val[1];

        trans_16x8_4_5 = vtrnq_s16(quant_res_16x8_4, quant_res_16x8_5);
        trans_16x8_4 = trans_16x8_4_5.val[0];
        trans_16x8_5 = trans_16x8_4_5.val[1];

        trans_16x8_6_7 = vtrnq_s16(quant_res_16x8_6, quant_res_16x8_7);
        trans_16x8_6 = trans_16x8_6_7.val[0];
        trans_16x8_7 = trans_16x8_6_7.val[1];

        trans_32x4_0_2 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_0), vreinterpretq_s32_s16(trans_16x8_2));
        trans_32x4_0 = trans_32x4_0_2.val[0];
        trans_32x4_2 = trans_32x4_0_2.val[1];

        trans_32x4_1_3 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_1), vreinterpretq_s32_s16(trans_16x8_3));
        trans_32x4_1 = trans_32x4_1_3.val[0];
        trans_32x4_3 = trans_32x4_1_3.val[1];

        trans_32x4_4_6 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_4), vreinterpretq_s32_s16(trans_16x8_6));
        trans_32x4_4 = trans_32x4_4_6.val[0];
        trans_32x4_6 = trans_32x4_4_6.val[1];

        trans_32x4_5_7 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_5), vreinterpretq_s32_s16(trans_16x8_7));
        trans_32x4_5 = trans_32x4_5_7.val[0];
        trans_32x4_7 = trans_32x4_5_7.val[1];

        trans_64x2_0 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_4)));
        trans_64x2_4 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_4)));

        trans_64x2_1 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_5)));
        trans_64x2_5 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_5)));

        trans_64x2_2 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_6)));
        trans_64x2_6 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_6)));

        trans_64x2_3 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_7)));
        trans_64x2_7 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_7)));

        trans_16x8_0 = vreinterpretq_s16_s64(trans_64x2_0);
        trans_16x8_1 = vreinterpretq_s16_s64(trans_64x2_1);
        trans_16x8_2 = vreinterpretq_s16_s64(trans_64x2_2);
        trans_16x8_3 = vreinterpretq_s16_s64(trans_64x2_3);
        trans_16x8_4 = vreinterpretq_s16_s64(trans_64x2_4);
        trans_16x8_5 = vreinterpretq_s16_s64(trans_64x2_5);
        trans_16x8_6 = vreinterpretq_s16_s64(trans_64x2_6);
        trans_16x8_7 = vreinterpretq_s16_s64(trans_64x2_7);

        rs_trans_16x8_1 = vshrq_n_s16(trans_16x8_1, 1);
        rs_trans_16x8_2 = vshrq_n_s16(trans_16x8_2, 1);
        rs_trans_16x8_3 = vshrq_n_s16(trans_16x8_3, 1);
        rs_trans_16x8_5 = vshrq_n_s16(trans_16x8_5, 1);
        rs_trans_16x8_6 = vshrq_n_s16(trans_16x8_6, 1);
        rs_trans_16x8_7 = vshrq_n_s16(trans_16x8_7, 1);

        y0_16x8 = vaddq_s16(trans_16x8_0,
                            trans_16x8_4);     // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
        y2_16x8 = vsubq_s16(trans_16x8_0,
                            trans_16x8_4);     // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
        y4_16x8 = vsubq_s16(rs_trans_16x8_2,
                            trans_16x8_6);     // i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
        y6_16x8 = vaddq_s16(trans_16x8_2,
                            rs_trans_16x8_6);  // i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));

        trans_16x4_3_l = vget_low_s16(trans_16x8_3);
        trans_16x4_5_l = vget_low_s16(trans_16x8_5);

        //-w3 + w5
        sub_3_5_l = vsubl_s16(vget_low_s16(trans_16x8_5), vget_low_s16(trans_16x8_3));
        sub_3_5_h = vsubl_s16(vget_high_s16(trans_16x8_5), vget_high_s16(trans_16x8_3));

        // w3 + w5
        add_3_5_l = vaddl_s16(trans_16x4_3_l, trans_16x4_5_l);
        add_3_5_h = vaddl_s16(vget_high_s16(trans_16x8_3), vget_high_s16(trans_16x8_5));

        trans_16x4_1_l = vget_low_s16(trans_16x8_1);
        trans_16x4_7_l = vget_low_s16(trans_16x8_7);

        //-w1 + w7
        sub_1_7_l = vsubl_s16(trans_16x4_7_l, trans_16x4_1_l);
        sub_1_7_h = vsubl_s16(vget_high_s16(trans_16x8_7), vget_high_s16(trans_16x8_1));

        // w1 + w7
        add_1_7_l = vaddl_s16(trans_16x4_1_l, trans_16x4_7_l);
        add_1_7_h = vaddl_s16(vget_high_s16(trans_16x8_1), vget_high_s16(trans_16x8_7));

        //-w3 + w5 - w7
        sub_357_l = vsubw_s16(sub_3_5_l, trans_16x4_7_l);
        sub_357_h = vsubw_s16(sub_3_5_h, vget_high_s16(trans_16x8_7));

        // w3 + w5 + w1
        add_351_l = vaddw_s16(add_3_5_l, trans_16x4_1_l);
        add_351_h = vaddw_s16(add_3_5_h, vget_high_s16(trans_16x8_1));

        //-w1 + w7 + w5
        add_175_l = vaddw_s16(sub_1_7_l, trans_16x4_5_l);
        add_175_h = vaddw_s16(sub_1_7_h, vget_high_s16(trans_16x8_5));

        // w1 + w7 - w3
        sub_173_l = vsubw_s16(add_1_7_l, trans_16x4_3_l);
        sub_173_h = vsubw_s16(add_1_7_h, vget_high_s16(trans_16x8_3));

        //-w3 + w5 - w7 - (w7 >> 1)
        y1_32x4_l = vsubw_s16(sub_357_l, vget_low_s16(rs_trans_16x8_7));
        y1_32x4_h = vsubw_s16(sub_357_h, vget_high_s16(rs_trans_16x8_7));

        // w1 + w7 - w3 - (w3 >> 1)
        y3_32x4_l = vsubw_s16(sub_173_l, vget_low_s16(rs_trans_16x8_3));
        y3_32x4_h = vsubw_s16(sub_173_h, vget_high_s16(rs_trans_16x8_3));

        //-w1 + w7 + w5 + (w5 >> 1)
        y5_32x4_l = vaddw_s16(add_175_l, vget_low_s16(rs_trans_16x8_5));
        y5_32x4_h = vaddw_s16(add_175_h, vget_high_s16(rs_trans_16x8_5));

        // w3 + w5 + w1 + (w1 >> 1)
        y7_32x4_l = vaddw_s16(add_351_l, vget_low_s16(rs_trans_16x8_1));
        y7_32x4_h = vaddw_s16(add_351_h, vget_high_s16(rs_trans_16x8_1));

        y1_16x4_l = vmovn_s32(y1_32x4_l);
        y1_16x4_h = vmovn_s32(y1_32x4_h);
        y1_16x8 = vcombine_s16(y1_16x4_l, y1_16x4_h);
        y3_16x4_l = vmovn_s32(y3_32x4_l);
        y3_16x4_h = vmovn_s32(y3_32x4_h);
        y3_16x8 = vcombine_s16(y3_16x4_l, y3_16x4_h);
        y5_16x4_l = vmovn_s32(y5_32x4_l);
        y5_16x4_h = vmovn_s32(y5_32x4_h);
        y5_16x8 = vcombine_s16(y5_16x4_l, y5_16x4_h);
        y7_16x4_l = vmovn_s32(y7_32x4_l);
        y7_16x4_h = vmovn_s32(y7_32x4_h);
        y7_16x8 = vcombine_s16(y7_16x4_l, y7_16x4_h);

        rs_y1_16x8 = vshrq_n_s16(y1_16x8, 2);
        rs_y3_16x8 = vshrq_n_s16(y3_16x8, 2);
        rs_y5_16x8 = vshrq_n_s16(y5_16x8, 2);
        rs_y7_16x8 = vshrq_n_s16(y7_16x8, 2);

        z0_16x8 = vaddq_s16(y0_16x8, y6_16x8);           // z0 = y0 + y6
        z1_16x8 = vaddq_s16(y1_16x8, rs_y7_16x8);        // z1 = y1 + (y7 >> 2)
        z2_16x8 = vaddq_s16(y2_16x8, y4_16x8);           // z2 = y2 + y4
        z3_16x8 = vaddq_s16(y3_16x8, rs_y5_16x8);        // z3 = y3 + (y5 >> 2)
        z4_16x8 = vsubq_s16(y2_16x8, y4_16x8);           // z4 = y2 - y4
        z5_16x8 = vsubq_s16(rs_y3_16x8, y5_16x8);        // z5 = (y3 >> 2) - y5
        z6_16x8 = vsubq_s16(y0_16x8, y6_16x8);           // z6 = y0 - y6
        z7_16x8 = vsubq_s16(y7_16x8, rs_y1_16x8);        // z7 = y7 - (y1 >> 2)

        quant_res_16x8_0 = vaddq_s16(z0_16x8, z7_16x8);  // x0 = z0 + z7
        quant_res_16x8_1 = vaddq_s16(z2_16x8, z5_16x8);  // x1 = z2 + z5
        quant_res_16x8_2 = vaddq_s16(z4_16x8, z3_16x8);  // x2 = z4 + z3
        quant_res_16x8_3 = vaddq_s16(z6_16x8, z1_16x8);  // x3 = z6 + z1
        quant_res_16x8_4 = vsubq_s16(z6_16x8, z1_16x8);  // x4 = z6 - z1
        quant_res_16x8_5 = vsubq_s16(z4_16x8, z3_16x8);  // x5 = z4 - z3
        quant_res_16x8_6 = vsubq_s16(z2_16x8, z5_16x8);  // x6 = z2 - z5
        quant_res_16x8_7 = vsubq_s16(z0_16x8, z7_16x8);  // x7 = z0 - z7
    }
1066 
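    /*
     * Residual accumulation: load the co-located 8x8 residual block
     * (stride rsd_strd, in WORD16 units), round the transform output with
     * vrshrq_n_s16(x, 6) == (x + 32) >> 6, add it to the residual, and
     * clamp the sums to [RSD_MIN, RSD_MAX] before they feed both the nnz
     * flags and the final reconstruction.
     */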
1067     resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
1068     resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
1069     resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
1070     resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
1071     resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
1072     resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
1073     resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
1074     resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));
1075 
1076     quant_res_16x8_0 = vrshrq_n_s16(quant_res_16x8_0, 6);
1077     quant_res_16x8_1 = vrshrq_n_s16(quant_res_16x8_1, 6);
1078     quant_res_16x8_2 = vrshrq_n_s16(quant_res_16x8_2, 6);
1079     quant_res_16x8_3 = vrshrq_n_s16(quant_res_16x8_3, 6);
1080     quant_res_16x8_4 = vrshrq_n_s16(quant_res_16x8_4, 6);
1081     quant_res_16x8_5 = vrshrq_n_s16(quant_res_16x8_5, 6);
1082     quant_res_16x8_6 = vrshrq_n_s16(quant_res_16x8_6, 6);
1083     quant_res_16x8_7 = vrshrq_n_s16(quant_res_16x8_7, 6);
1084 
1085     resd0_in = vaddq_s16(quant_res_16x8_0, resd0_in);
1086     resd1_in = vaddq_s16(quant_res_16x8_1, resd1_in);
1087     resd2_in = vaddq_s16(quant_res_16x8_2, resd2_in);
1088     resd3_in = vaddq_s16(quant_res_16x8_3, resd3_in);
1089     resd4_in = vaddq_s16(quant_res_16x8_4, resd4_in);
1090     resd5_in = vaddq_s16(quant_res_16x8_5, resd5_in);
1091     resd6_in = vaddq_s16(quant_res_16x8_6, resd6_in);
1092     resd7_in = vaddq_s16(quant_res_16x8_7, resd7_in);
1093 
1094     resd0_in = vminq_s16(resd0_in, dup_max);
1095     resd0_in = vmaxq_s16(resd0_in, dup_min);
1096     resd1_in = vminq_s16(resd1_in, dup_max);
1097     resd1_in = vmaxq_s16(resd1_in, dup_min);
1098     resd2_in = vminq_s16(resd2_in, dup_max);
1099     resd2_in = vmaxq_s16(resd2_in, dup_min);
1100     resd3_in = vminq_s16(resd3_in, dup_max);
1101     resd3_in = vmaxq_s16(resd3_in, dup_min);
1102     resd4_in = vminq_s16(resd4_in, dup_max);
1103     resd4_in = vmaxq_s16(resd4_in, dup_min);
1104     resd5_in = vminq_s16(resd5_in, dup_max);
1105     resd5_in = vmaxq_s16(resd5_in, dup_min);
1106     resd6_in = vminq_s16(resd6_in, dup_max);
1107     resd6_in = vmaxq_s16(resd6_in, dup_min);
1108     resd7_in = vminq_s16(resd7_in, dup_max);
1109     resd7_in = vmaxq_s16(resd7_in, dup_min);
1110 
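    /*
     * Regroup the eight 8-sample rows into the four 4x4 sub-blocks of the
     * 8x8: the low 64 bits of a row hold its left 4x4 half and the high
     * 64 bits its right half, so pairing the low/high halves of row pairs
     * yields two vectors (16 samples) per 4x4 quadrant.
     */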
1111     resd0_64x2 = vreinterpretq_s64_s16(resd0_in);
1112     resd1_64x2 = vreinterpretq_s64_s16(resd1_in);
1113     resd2_64x2 = vreinterpretq_s64_s16(resd2_in);
1114     resd3_64x2 = vreinterpretq_s64_s16(resd3_in);
1115     resd4_64x2 = vreinterpretq_s64_s16(resd4_in);
1116     resd5_64x2 = vreinterpretq_s64_s16(resd5_in);
1117     resd6_64x2 = vreinterpretq_s64_s16(resd6_in);
1118     resd7_64x2 = vreinterpretq_s64_s16(resd7_in);
1119 
1120     resd_b0_r01 =
1121         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd0_64x2), vget_low_s64(resd1_64x2)));
1122     resd_b0_r23 =
1123         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd2_64x2), vget_low_s64(resd3_64x2)));
1124     resd_b1_r01 =
1125         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd0_64x2), vget_high_s64(resd1_64x2)));
1126     resd_b1_r23 =
1127         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd2_64x2), vget_high_s64(resd3_64x2)));
1128     resd_b2_r45 =
1129         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd4_64x2), vget_low_s64(resd5_64x2)));
1130     resd_b2_r67 =
1131         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd6_64x2), vget_low_s64(resd7_64x2)));
1132     resd_b3_r45 =
1133         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd4_64x2), vget_high_s64(resd5_64x2)));
1134     resd_b3_r67 =
1135         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd6_64x2), vget_high_s64(resd7_64x2)));
1136 
1137     dup_val_1 = vabsq_s16(resd_b0_r01);
1138     dup_val_2 = vabsq_s16(resd_b0_r23);
1139     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1140     nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1141              dup_abs[6] || dup_abs[7];
1142 
1143     dup_val_1 = vabsq_s16(resd_b1_r01);
1144     dup_val_2 = vabsq_s16(resd_b1_r23);
1145     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1146     nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1147              dup_abs[6] || dup_abs[7];
1148 
1149     dup_val_1 = vabsq_s16(resd_b2_r45);
1150     dup_val_2 = vabsq_s16(resd_b2_r67);
1151     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1152     nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1153              dup_abs[6] || dup_abs[7];
1154 
1155     dup_val_1 = vabsq_s16(resd_b3_r45);
1156     dup_val_2 = vabsq_s16(resd_b3_r67);
1157     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1158     nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1159              dup_abs[6] || dup_abs[7];
1160 
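    /*
     * Pack the per-sub-block flags in the 4x4-block raster order of the
     * macroblock (four 4x4 blocks per row): top-left -> bit 0, top-right
     * -> bit 1, bottom-left -> bit 4, bottom-right -> bit 5. Each flag is
     * set iff any of the saturated absolute sums in that 4x4 is nonzero.
     */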
1161     nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));
1162 
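    /*
     * Reconstruction: widen each 8-pixel prediction row from u8 to s16
     * (vmovl_u8), add the clamped residual, and narrow back with
     * vqmovun_s16, which saturates the result to the valid pixel range
     * [0, 255] before the rows are stored out.
     */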
1163     pred0_in = vld1_u8((uint8_t *) pu1_pred);
1164     pu1_pred = pu1_pred + pred_strd;
1165     pred1_in = vld1_u8((uint8_t *) pu1_pred);
1166     pu1_pred = pu1_pred + pred_strd;
1167     pred2_in = vld1_u8((uint8_t *) pu1_pred);
1168     pu1_pred = pu1_pred + pred_strd;
1169     pred3_in = vld1_u8((uint8_t *) pu1_pred);
1170     pu1_pred = pu1_pred + pred_strd;
1171     pred4_in = vld1_u8((uint8_t *) pu1_pred);
1172     pu1_pred = pu1_pred + pred_strd;
1173     pred5_in = vld1_u8((uint8_t *) pu1_pred);
1174     pu1_pred = pu1_pred + pred_strd;
1175     pred6_in = vld1_u8((uint8_t *) pu1_pred);
1176     pu1_pred = pu1_pred + pred_strd;
1177     pred7_in = vld1_u8((uint8_t *) pu1_pred);
1178 
1179     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1180     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1181     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1182     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1183     pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
1184     pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
1185     pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
1186     pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));
1187 
1188     rec0 = vaddq_s16(pred0, resd0_in);
1189     rec1 = vaddq_s16(pred1, resd1_in);
1190     rec2 = vaddq_s16(pred2, resd2_in);
1191     rec3 = vaddq_s16(pred3, resd3_in);
1192     rec4 = vaddq_s16(pred4, resd4_in);
1193     rec5 = vaddq_s16(pred5, resd5_in);
1194     rec6 = vaddq_s16(pred6, resd6_in);
1195     rec7 = vaddq_s16(pred7, resd7_in);
1196 
1197     rec0_un = vqmovun_s16(rec0);
1198     rec1_un = vqmovun_s16(rec1);
1199     rec2_un = vqmovun_s16(rec2);
1200     rec3_un = vqmovun_s16(rec3);
1201     rec4_un = vqmovun_s16(rec4);
1202     rec5_un = vqmovun_s16(rec5);
1203     rec6_un = vqmovun_s16(rec6);
1204     rec7_un = vqmovun_s16(rec7);
1205 
1206     vst1_u8(pu1_out, rec0_un);
1207     vst1_u8(pu1_out + out_strd, rec1_un);
1208     vst1_u8(pu1_out + out_strd * 2, rec2_un);
1209     vst1_u8(pu1_out + out_strd * 3, rec3_un);
1210     vst1_u8(pu1_out + out_strd * 4, rec4_un);
1211     vst1_u8(pu1_out + out_strd * 5, rec5_un);
1212     vst1_u8(pu1_out + out_strd * 6, rec6_un);
1213     vst1_u8(pu1_out + out_strd * 7, rec7_un);
1214 
1215     return nnz;
1216 }
1217 
1218 /*****************************************************************************/
1219 /*                                                                           */
1220 /*  Function Name : isvcd_iquant_itrans_residual_recon_8x8_dc_neonintr        */
1221 /*                                                                           */
1222 /*  Description   : This function computes the DC-only recon output from   */
1223 /*                  IQ+IT+RESD                                               */
1224 /*                                                                           */
1225 /*  Inputs        :                                                          */
1226 /*  Globals       : none                                                     */
1227 /*  Processing    :                                                          */
1228 /*                                                                           */
1229 /*  Outputs       : pu1_out, the reconstructed 8x8 block                    */
1230 /*  Returns       : i4_nnz                                                   */
1231 /*                                                                           */
1232 /*  Issues        : none                                                     */
1233 /*                                                                           */
1234 /*  Revision History:                                                        */
1235 /*                                                                           */
1236 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1237 /*         25 11 2021   Kishore               creation                       */
1238 /*                                                                           */
1239 /*****************************************************************************/
1240 
1241 WORD32 isvcd_iquant_itrans_residual_recon_8x8_dc_neonintr(
1242     WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
1243     WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
1244     UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
1245 {
1246     WORD32 i4_iq_out_temp;
1247     int16x8_t temp_0;
1248     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
1249     uint8x8_t pred4_in, pred5_in, pred6_in, pred7_in;
1250     int16x8_t pred0, pred1, pred2, pred3;
1251     int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
1252     int16x8_t pred4, pred5, pred6, pred7, dup_min, dup_max;
1253     int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;
1254 
1255     int64x2_t pred0_64x2, pred1_64x2, pred2_64x2, pred3_64x2, pred4_64x2, pred5_64x2, pred6_64x2,
1256         pred7_64x2;
1257 
1258     int16x8_t pred_b0_r01;
1259     int16x8_t pred_b0_r23;
1260     int16x8_t pred_b1_r01;
1261     int16x8_t pred_b1_r23;
1262     int16x8_t pred_b2_r45;
1263     int16x8_t pred_b2_r67;
1264     int16x8_t pred_b3_r45;
1265     int16x8_t pred_b3_r67;
1266     int16x8_t dup_val_1, dup_val_2, dup_abs;
1267 
1268     WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
1269     WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;
1270 
1271     UNUSED(pi2_tmp);
1272     UNUSED(iq_start_idx);
1273     UNUSED(pi2_dc_ld_addr);
1274     i4_iq_out_temp = pi2_src[0];
1275 
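    /*
     * DC-only path: the block has a single nonzero (DC) coefficient, so
     * the full 8x8 inverse transform collapses to one value. INV_QUANT
     * dequantizes pi2_src[0] with the scale/weight matrices and qp_div,
     * with rnd_fact supplying the rounding offset for the right-shift
     * case (qp_div < 6); (dc + 32) >> 6 below is the rounded transform
     * output, splat across all 64 samples via temp_0.
     */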
1276     INV_QUANT(i4_iq_out_temp, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
1277 
1278     temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
1279     dup_min = vdupq_n_s16(RSD_MIN);
1280     dup_max = vdupq_n_s16(RSD_MAX);
1281 
1282     pred0_in = vld1_u8((uint8_t *) pu1_pred);
1283     pu1_pred = pu1_pred + pred_strd;
1284     pred1_in = vld1_u8((uint8_t *) pu1_pred);
1285     pu1_pred = pu1_pred + pred_strd;
1286     pred2_in = vld1_u8((uint8_t *) pu1_pred);
1287     pu1_pred = pu1_pred + pred_strd;
1288     pred3_in = vld1_u8((uint8_t *) pu1_pred);
1289     pu1_pred = pu1_pred + pred_strd;
1290     pred4_in = vld1_u8((uint8_t *) pu1_pred);
1291     pu1_pred = pu1_pred + pred_strd;
1292     pred5_in = vld1_u8((uint8_t *) pu1_pred);
1293     pu1_pred = pu1_pred + pred_strd;
1294     pred6_in = vld1_u8((uint8_t *) pu1_pred);
1295     pu1_pred = pu1_pred + pred_strd;
1296     pred7_in = vld1_u8((uint8_t *) pu1_pred);
1297 
1298     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
1299     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
1300     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
1301     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
1302     pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
1303     pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
1304     pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
1305     pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));
1306 
1307     resd0_in = vld1q_s16((int16_t *) pi2_rsd);
1308     resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
1309     resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
1310     resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
1311     resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
1312     resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
1313     resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
1314     resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));
1315 
1316     resd0_in = vaddq_s16(resd0_in, temp_0);
1317     resd1_in = vaddq_s16(resd1_in, temp_0);
1318     resd2_in = vaddq_s16(resd2_in, temp_0);
1319     resd3_in = vaddq_s16(resd3_in, temp_0);
1320     resd4_in = vaddq_s16(resd4_in, temp_0);
1321     resd5_in = vaddq_s16(resd5_in, temp_0);
1322     resd6_in = vaddq_s16(resd6_in, temp_0);
1323     resd7_in = vaddq_s16(resd7_in, temp_0);
1324 
1325     resd0_in = vminq_s16(resd0_in, dup_max);
1326     resd0_in = vmaxq_s16(resd0_in, dup_min);
1327     resd1_in = vminq_s16(resd1_in, dup_max);
1328     resd1_in = vmaxq_s16(resd1_in, dup_min);
1329     resd2_in = vminq_s16(resd2_in, dup_max);
1330     resd2_in = vmaxq_s16(resd2_in, dup_min);
1331     resd3_in = vminq_s16(resd3_in, dup_max);
1332     resd3_in = vmaxq_s16(resd3_in, dup_min);
1333     resd4_in = vminq_s16(resd4_in, dup_max);
1334     resd4_in = vmaxq_s16(resd4_in, dup_min);
1335     resd5_in = vminq_s16(resd5_in, dup_max);
1336     resd5_in = vmaxq_s16(resd5_in, dup_min);
1337     resd6_in = vminq_s16(resd6_in, dup_max);
1338     resd6_in = vmaxq_s16(resd6_in, dup_min);
1339     resd7_in = vminq_s16(resd7_in, dup_max);
1340     resd7_in = vmaxq_s16(resd7_in, dup_min);
1341 
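    /*
     * As in the full 8x8 path: regroup the rows into the four 4x4
     * sub-blocks via their 64-bit halves so nnz can be derived per
     * sub-block (the pred*_64x2 registers are reused here to hold the
     * clamped residuals).
     */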
1342     pred0_64x2 = vreinterpretq_s64_s16(resd0_in);
1343     pred1_64x2 = vreinterpretq_s64_s16(resd1_in);
1344     pred2_64x2 = vreinterpretq_s64_s16(resd2_in);
1345     pred3_64x2 = vreinterpretq_s64_s16(resd3_in);
1346     pred4_64x2 = vreinterpretq_s64_s16(resd4_in);
1347     pred5_64x2 = vreinterpretq_s64_s16(resd5_in);
1348     pred6_64x2 = vreinterpretq_s64_s16(resd6_in);
1349     pred7_64x2 = vreinterpretq_s64_s16(resd7_in);
1350 
1351     pred_b0_r01 =
1352         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred0_64x2), vget_low_s64(pred1_64x2)));
1353     pred_b0_r23 =
1354         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred2_64x2), vget_low_s64(pred3_64x2)));
1355     pred_b1_r01 =
1356         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred0_64x2), vget_high_s64(pred1_64x2)));
1357     pred_b1_r23 =
1358         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred2_64x2), vget_high_s64(pred3_64x2)));
1359     pred_b2_r45 =
1360         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred4_64x2), vget_low_s64(pred5_64x2)));
1361     pred_b2_r67 =
1362         vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred6_64x2), vget_low_s64(pred7_64x2)));
1363     pred_b3_r45 =
1364         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred4_64x2), vget_high_s64(pred5_64x2)));
1365     pred_b3_r67 =
1366         vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred6_64x2), vget_high_s64(pred7_64x2)));
1367 
1368     dup_val_1 = vabsq_s16(pred_b0_r01);
1369     dup_val_2 = vabsq_s16(pred_b0_r23);
1370     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1371     nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1372              dup_abs[6] || dup_abs[7];
1373 
1374     dup_val_1 = vabsq_s16(pred_b1_r01);
1375     dup_val_2 = vabsq_s16(pred_b1_r23);
1376     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1377     nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1378              dup_abs[6] || dup_abs[7];
1379 
1380     dup_val_1 = vabsq_s16(pred_b2_r45);
1381     dup_val_2 = vabsq_s16(pred_b2_r67);
1382     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1383     nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1384              dup_abs[6] || dup_abs[7];
1385 
1386     dup_val_1 = vabsq_s16(pred_b3_r45);
1387     dup_val_2 = vabsq_s16(pred_b3_r67);
1388     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1389     nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
1390              dup_abs[6] || dup_abs[7];
1391 
1392     nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));
1393 
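    /* Reconstruct: pred + clamped (residual + DC), saturated to [0, 255]
       by vqmovun_s16 on the way to the output rows. */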
1394     pred0 = vaddq_s16(pred0, resd0_in);
1395     pred1 = vaddq_s16(pred1, resd1_in);
1396     pred2 = vaddq_s16(pred2, resd2_in);
1397     pred3 = vaddq_s16(pred3, resd3_in);
1398     pred4 = vaddq_s16(pred4, resd4_in);
1399     pred5 = vaddq_s16(pred5, resd5_in);
1400     pred6 = vaddq_s16(pred6, resd6_in);
1401     pred7 = vaddq_s16(pred7, resd7_in);
1402 
1403     pred0_in = vqmovun_s16(pred0);
1404     pred1_in = vqmovun_s16(pred1);
1405     pred2_in = vqmovun_s16(pred2);
1406     pred3_in = vqmovun_s16(pred3);
1407     pred4_in = vqmovun_s16(pred4);
1408     pred5_in = vqmovun_s16(pred5);
1409     pred6_in = vqmovun_s16(pred6);
1410     pred7_in = vqmovun_s16(pred7);
1411 
1412     vst1_u8((uint8_t *) (pu1_out), pred0_in);
1413     vst1_u8((uint8_t *) (pu1_out + out_strd), pred1_in);
1414     vst1_u8((uint8_t *) (pu1_out + out_strd * 2), pred2_in);
1415     vst1_u8((uint8_t *) (pu1_out + out_strd * 3), pred3_in);
1416     vst1_u8((uint8_t *) (pu1_out + out_strd * 4), pred4_in);
1417     vst1_u8((uint8_t *) (pu1_out + out_strd * 5), pred5_in);
1418     vst1_u8((uint8_t *) (pu1_out + out_strd * 6), pred6_in);
1419     vst1_u8((uint8_t *) (pu1_out + out_strd * 7), pred7_in);
1420 
1421     return nnz;
1422 }
1423