xref: /aosp_15_r20/external/libavc/decoder/arm/svc/isvcd_pred_residual_recon_neon.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21 *******************************************************************************
22 * @file
23 *  isvcd_pred_residual_recon_neonintr.c
24 *
25 * @brief
26 *  Contains definition of functions for h264 inverse quantization inverse
27 *    transformation and resd comp
28 *
29 * @author
30 *  Kishore
31 *
32 *  @par List of Functions:
33 *  - isvcd_pred_residual_recon_16x16_neonintr()
34 *  - isvcd_pred_residual_recon_8x8_neonintr()
35 *  - isvcd_pred_residual_recon_4x4_neonintr()
36 *  - isvcd_pred_residual_recon_chroma_4x4_neonintr()
37 *  - isvcd_pred_residual_recon_chroma_8x8_neonintr()
38 *  - isvcd_residual_luma_4x4_neonintr()
39 *  - isvcd_residual_luma_8x8_neonintr()
40 *  - isvcd_residual_luma_16x16_neonintr()
41 *  - isvcd_residual_chroma_cb_cr_8x8_neonintr()
42 *
43 * @remarks
44 *
45 *******************************************************************************
46 */
47 
48 /*****************************************************************************/
49 /* File Includes                                                             */
50 /*****************************************************************************/
51 #include <string.h>
52 #include <arm_neon.h>
53 
54 /* User include files */
55 #include "ih264_typedefs.h"
56 #include "ih264_defs.h"
57 #include "ih264_trans_macros.h"
58 #include "ih264_macros.h"
59 #include "ih264_platform_macros.h"
60 #include "ih264_trans_data.h"
61 #include "ih264_size_defs.h"
62 #include "ih264_structs.h"
63 #include "isvcd_pred_residual_recon.h"
64 
65 /*****************************************************************************/
66 /*                                                                           */
67 /*  Function Name : isvcd_pred_residual_recon_4x4_neonintr                    */
68 /*                                                                           */
69 /*  Description   : this function computes the recon data from the           */
70 /*                   pred and residual buffer                                */
71 /*                                                                           */
72 /*  Inputs        :                                                          */
73 /*  Globals       : none                                                     */
74 /*  Processing    :                                                          */
75 /*                                                                           */
76 /*  Outputs       : none                                                     */
77 /*  Returns       : nnz                                                      */
78 /*                                                                           */
79 /*  Issues        : none                                                     */
80 /*                                                                           */
81 /*  Revision History:                                                        */
82 /*                                                                           */
83 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
84 /*         25 11 2021   Kishore               creation                       */
85 /*                                                                           */
86 /*****************************************************************************/
87 
/**
 * Reconstructs a 4x4 block: pu1_out = clip_u8(pu1_pred + pi2_rsd), row by row.
 *
 * Only the first four coefficients of each residual row contribute to the
 * returned nnz flag (1 if any of those 16 coefficients is non-zero, else 0).
 */
WORD32 isvcd_pred_residual_recon_4x4_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                              WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    int16x8_t pred_rows[4]; /* predictor rows widened to 16 bit */
    int16x8_t resd_rows[4]; /* residual rows; lanes 0..3 are the 4x4 block */
    int16x8_t abs_sum;
    uint8x8_t recon_row;
    WORD32 i4_nnz;
    WORD32 i;

    for(i = 0; i < 4; i++)
    {
        pred_rows[i] =
            vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t *) pu1_pred + (pred_strd * i))));
        resd_rows[i] = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * i));
    }

    /* Pack the low (valid) halves of rows 0|1 and rows 2|3, saturating-add
       their absolute values, then OR-reduce the lanes into one 0/1 flag */
    abs_sum = vqaddq_s16(
        vabsq_s16(vcombine_s16(vget_low_s16(resd_rows[0]), vget_low_s16(resd_rows[1]))),
        vabsq_s16(vcombine_s16(vget_low_s16(resd_rows[2]), vget_low_s16(resd_rows[3]))));
    i4_nnz = vgetq_lane_s16(abs_sum, 0) || vgetq_lane_s16(abs_sum, 1) ||
             vgetq_lane_s16(abs_sum, 2) || vgetq_lane_s16(abs_sum, 3) ||
             vgetq_lane_s16(abs_sum, 4) || vgetq_lane_s16(abs_sum, 5) ||
             vgetq_lane_s16(abs_sum, 6) || vgetq_lane_s16(abs_sum, 7);

    /* pred + residual, saturate to [0, 255], store 4 bytes per output row */
    for(i = 0; i < 4; i++)
    {
        recon_row = vqmovun_s16(vaddq_s16(pred_rows[i], resd_rows[i]));
        vst1_lane_u32((uint32_t *) (pu1_out + (out_strd * i)), vreinterpret_u32_u8(recon_row), 0);
    }

    return i4_nnz;
}
142 
143 /*****************************************************************************/
144 /*                                                                           */
145 /*  Function Name : isvcd_pred_residual_recon_8x8_neonintr                    */
146 /*                                                                           */
147 /*  Description   : this function computes the recon data from the           */
148 /*                   pred and residual buffer                                */
149 /*                                                                           */
150 /*  Inputs        :                                                          */
151 /*  Globals       : none                                                     */
152 /*  Processing    :                                                          */
153 /*                                                                           */
154 /*  Outputs       : none                                                     */
155 /*  Returns       : nnz                                                      */
156 /*                                                                           */
157 /*  Issues        : none                                                     */
158 /*                                                                           */
159 /*  Revision History:                                                        */
160 /*                                                                           */
161 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
162 /*         25 11 2021   Kishore               creation                       */
163 /*                                                                           */
164 /*****************************************************************************/
165 
/**
 * Reconstructs an 8x8 block: pu1_out = clip_u8(pu1_pred + pi2_rsd), row by row.
 *
 * Returns the nnz flags of the four 4x4 sub-blocks packed as
 * bit0 = top-left, bit1 = top-right, bit4 = bottom-left, bit5 = bottom-right
 * (each bit is 1 if any residual coefficient of that sub-block is non-zero).
 */
WORD32 isvcd_pred_residual_recon_8x8_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                              WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    int16x8_t pred_rows[8]; /* predictor rows widened to 16 bit */
    int16x8_t resd_rows[8]; /* residual rows */
    int16x8_t abs_sum;
    uint8x8_t recon_row;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
    WORD32 i;

    for(i = 0; i < 8; i++)
    {
        pred_rows[i] =
            vreinterpretq_s16_u16(vmovl_u8(vld1_u8((uint8_t *) pu1_pred + (pred_strd * i))));
        resd_rows[i] = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * i));
    }

    /* Each 4x4 sub-block: pack its four half-rows into two vectors,
       saturating-add the absolute values, OR-reduce lanes to a 0/1 flag */

    /* b0: rows 0-3, columns 0-3 */
    abs_sum = vqaddq_s16(
        vabsq_s16(vcombine_s16(vget_low_s16(resd_rows[0]), vget_low_s16(resd_rows[1]))),
        vabsq_s16(vcombine_s16(vget_low_s16(resd_rows[2]), vget_low_s16(resd_rows[3]))));
    nnz_b0 = vgetq_lane_s16(abs_sum, 0) || vgetq_lane_s16(abs_sum, 1) ||
             vgetq_lane_s16(abs_sum, 2) || vgetq_lane_s16(abs_sum, 3) ||
             vgetq_lane_s16(abs_sum, 4) || vgetq_lane_s16(abs_sum, 5) ||
             vgetq_lane_s16(abs_sum, 6) || vgetq_lane_s16(abs_sum, 7);

    /* b1: rows 0-3, columns 4-7 */
    abs_sum = vqaddq_s16(
        vabsq_s16(vcombine_s16(vget_high_s16(resd_rows[0]), vget_high_s16(resd_rows[1]))),
        vabsq_s16(vcombine_s16(vget_high_s16(resd_rows[2]), vget_high_s16(resd_rows[3]))));
    nnz_b1 = vgetq_lane_s16(abs_sum, 0) || vgetq_lane_s16(abs_sum, 1) ||
             vgetq_lane_s16(abs_sum, 2) || vgetq_lane_s16(abs_sum, 3) ||
             vgetq_lane_s16(abs_sum, 4) || vgetq_lane_s16(abs_sum, 5) ||
             vgetq_lane_s16(abs_sum, 6) || vgetq_lane_s16(abs_sum, 7);

    /* b2: rows 4-7, columns 0-3 */
    abs_sum = vqaddq_s16(
        vabsq_s16(vcombine_s16(vget_low_s16(resd_rows[4]), vget_low_s16(resd_rows[5]))),
        vabsq_s16(vcombine_s16(vget_low_s16(resd_rows[6]), vget_low_s16(resd_rows[7]))));
    nnz_b2 = vgetq_lane_s16(abs_sum, 0) || vgetq_lane_s16(abs_sum, 1) ||
             vgetq_lane_s16(abs_sum, 2) || vgetq_lane_s16(abs_sum, 3) ||
             vgetq_lane_s16(abs_sum, 4) || vgetq_lane_s16(abs_sum, 5) ||
             vgetq_lane_s16(abs_sum, 6) || vgetq_lane_s16(abs_sum, 7);

    /* b3: rows 4-7, columns 4-7 */
    abs_sum = vqaddq_s16(
        vabsq_s16(vcombine_s16(vget_high_s16(resd_rows[4]), vget_high_s16(resd_rows[5]))),
        vabsq_s16(vcombine_s16(vget_high_s16(resd_rows[6]), vget_high_s16(resd_rows[7]))));
    nnz_b3 = vgetq_lane_s16(abs_sum, 0) || vgetq_lane_s16(abs_sum, 1) ||
             vgetq_lane_s16(abs_sum, 2) || vgetq_lane_s16(abs_sum, 3) ||
             vgetq_lane_s16(abs_sum, 4) || vgetq_lane_s16(abs_sum, 5) ||
             vgetq_lane_s16(abs_sum, 6) || vgetq_lane_s16(abs_sum, 7);

    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    /* pred + residual, saturate to [0, 255], store 8 bytes per output row */
    for(i = 0; i < 8; i++)
    {
        recon_row = vqmovun_s16(vaddq_s16(pred_rows[i], resd_rows[i]));
        vst1_u8((uint8_t *) (pu1_out + (out_strd * i)), recon_row);
    }

    return nnz;
}
298 
299 /*****************************************************************************/
300 /*                                                                           */
301 /*  Function Name : isvcd_pred_residual_recon_16x16_neonintr                  */
302 /*                                                                           */
303 /*  Description   : this function computes the recon data from the           */
304 /*                   pred and residual buffer                                */
305 /*                                                                           */
306 /*  Inputs        :                                                          */
307 /*  Globals       : none                                                     */
308 /*  Processing    :                                                          */
309 /*                                                                           */
310 /*  Outputs       : none                                                     */
311 /*  Returns       : nnz                                                      */
312 /*                                                                           */
313 /*  Issues        : none                                                     */
314 /*                                                                           */
315 /*  Revision History:                                                        */
316 /*                                                                           */
317 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
318 /*         25 11 2021   Kishore               creation                       */
319 /*                                                                           */
320 /*****************************************************************************/
321 
isvcd_pred_residual_recon_16x16_neonintr(UWORD8 * pu1_pred,WORD16 * pi2_rsd,UWORD8 * pu1_out,WORD32 pred_strd,WORD32 rsd_strd,WORD32 out_strd)322 WORD32 isvcd_pred_residual_recon_16x16_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
323                                                 WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
324 {
325     uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
326     uint8x8_t pred4_in, pred5_in, pred6_in, pred7_in;
327     int16x8_t pred0, pred1, pred2, pred3;
328     int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
329     int16x8_t pred4, pred5, pred6, pred7;
330     int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;
331     int16x8_t dup_val_1, dup_val_2, dup_abs;
332     UWORD8 *pu1_pred_ptr = pu1_pred;
333     WORD16 *pi2_rsd_ptr = pi2_rsd;
334     UWORD8 *pu1_out_ptr = pu1_out;
335 
336     int64x2_t resd0_in_64x2, resd1_in_64x2, resd2_in_64x2, resd3_in_64x2, resd4_in_64x2,
337         resd5_in_64x2, resd6_in_64x2, resd7_in_64x2;
338 
339     int16x8_t resd_b0_r01_in;
340     int16x8_t resd_b0_r23_in;
341     int16x8_t resd_b1_r01_in;
342     int16x8_t resd_b1_r23_in;
343     int16x8_t resd_b2_r45_in;
344     int16x8_t resd_b2_r67_in;
345     int16x8_t resd_b3_r45_in;
346     int16x8_t resd_b3_r67_in;
347 
348     WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
349 
350     /* First row of 8,  first 8x8 elements */
351     pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
352     pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
353     pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
354     pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
355     pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
356     pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
357     pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
358     pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));
359 
360     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
361     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
362     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
363     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
364     pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
365     pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
366     pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
367     pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));
368 
369     resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
370     resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
371     resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
372     resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
373     resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
374     resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
375     resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
376     resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));
377 
378     resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
379     resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
380     resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
381     resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
382     resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
383     resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
384     resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
385     resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);
386 
387     resd_b0_r01_in = vreinterpretq_s16_s64(
388         vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
389     resd_b0_r23_in = vreinterpretq_s16_s64(
390         vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
391     resd_b1_r01_in = vreinterpretq_s16_s64(
392         vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
393     resd_b1_r23_in = vreinterpretq_s16_s64(
394         vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
395     resd_b2_r45_in = vreinterpretq_s16_s64(
396         vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
397     resd_b2_r67_in = vreinterpretq_s16_s64(
398         vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
399     resd_b3_r45_in = vreinterpretq_s16_s64(
400         vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
401     resd_b3_r67_in = vreinterpretq_s16_s64(
402         vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
403 
404     dup_val_1 = vabsq_s16(resd_b0_r01_in);
405     dup_val_2 = vabsq_s16(resd_b0_r23_in);
406     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
407     nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
408              dup_abs[6] || dup_abs[7];
409 
410     dup_val_1 = vabsq_s16(resd_b1_r01_in);
411     dup_val_2 = vabsq_s16(resd_b1_r23_in);
412     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
413     nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
414              dup_abs[6] || dup_abs[7];
415 
416     dup_val_1 = vabsq_s16(resd_b2_r45_in);
417     dup_val_2 = vabsq_s16(resd_b2_r67_in);
418     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
419     nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
420              dup_abs[6] || dup_abs[7];
421 
422     dup_val_1 = vabsq_s16(resd_b3_r45_in);
423     dup_val_2 = vabsq_s16(resd_b3_r67_in);
424     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
425     nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
426              dup_abs[6] || dup_abs[7];
427 
428     nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));
429 
430     pred0 = vaddq_s16(pred0, resd0_in);
431     pred1 = vaddq_s16(pred1, resd1_in);
432     pred2 = vaddq_s16(pred2, resd2_in);
433     pred3 = vaddq_s16(pred3, resd3_in);
434     pred4 = vaddq_s16(pred4, resd4_in);
435     pred5 = vaddq_s16(pred5, resd5_in);
436     pred6 = vaddq_s16(pred6, resd6_in);
437     pred7 = vaddq_s16(pred7, resd7_in);
438 
439     pred0_in = vqmovun_s16(pred0);
440     pred1_in = vqmovun_s16(pred1);
441     pred2_in = vqmovun_s16(pred2);
442     pred3_in = vqmovun_s16(pred3);
443     pred4_in = vqmovun_s16(pred4);
444     pred5_in = vqmovun_s16(pred5);
445     pred6_in = vqmovun_s16(pred6);
446     pred7_in = vqmovun_s16(pred7);
447 
448     vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
449     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
450     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
451     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
452     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
453     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
454     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
455     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);
456 
457     /* first row of 8, sec 8x8 elements */
458     pu1_out_ptr = pu1_out_ptr + 8;
459     pi2_rsd_ptr = pi2_rsd_ptr + 8;
460     pu1_pred_ptr = pu1_pred_ptr + 8;
461 
462     pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
463     pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
464     pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
465     pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
466     pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
467     pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
468     pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
469     pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));
470 
471     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
472     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
473     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
474     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
475     pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
476     pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
477     pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
478     pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));
479 
480     resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
481     resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
482     resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
483     resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
484     resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
485     resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
486     resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
487     resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));
488 
489     resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
490     resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
491     resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
492     resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
493     resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
494     resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
495     resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
496     resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);
497 
498     resd_b0_r01_in = vreinterpretq_s16_s64(
499         vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
500     resd_b0_r23_in = vreinterpretq_s16_s64(
501         vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
502     resd_b1_r01_in = vreinterpretq_s16_s64(
503         vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
504     resd_b1_r23_in = vreinterpretq_s16_s64(
505         vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
506     resd_b2_r45_in = vreinterpretq_s16_s64(
507         vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
508     resd_b2_r67_in = vreinterpretq_s16_s64(
509         vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
510     resd_b3_r45_in = vreinterpretq_s16_s64(
511         vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
512     resd_b3_r67_in = vreinterpretq_s16_s64(
513         vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
514 
515     dup_val_1 = vabsq_s16(resd_b0_r01_in);
516     dup_val_2 = vabsq_s16(resd_b0_r23_in);
517     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
518     nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
519              dup_abs[6] || dup_abs[7];
520 
521     dup_val_1 = vabsq_s16(resd_b1_r01_in);
522     dup_val_2 = vabsq_s16(resd_b1_r23_in);
523     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
524     nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
525              dup_abs[6] || dup_abs[7];
526 
527     dup_val_1 = vabsq_s16(resd_b2_r45_in);
528     dup_val_2 = vabsq_s16(resd_b2_r67_in);
529     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
530     nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
531              dup_abs[6] || dup_abs[7];
532 
533     dup_val_1 = vabsq_s16(resd_b3_r45_in);
534     dup_val_2 = vabsq_s16(resd_b3_r67_in);
535     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
536     nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
537              dup_abs[6] || dup_abs[7];
538 
539     nnz |= (nnz_b0 << 2 | (nnz_b1 << 3) | (nnz_b2 << 6) | (nnz_b3 << 7));
540 
541     pred0 = vaddq_s16(pred0, resd0_in);
542     pred1 = vaddq_s16(pred1, resd1_in);
543     pred2 = vaddq_s16(pred2, resd2_in);
544     pred3 = vaddq_s16(pred3, resd3_in);
545     pred4 = vaddq_s16(pred4, resd4_in);
546     pred5 = vaddq_s16(pred5, resd5_in);
547     pred6 = vaddq_s16(pred6, resd6_in);
548     pred7 = vaddq_s16(pred7, resd7_in);
549 
550     pred0_in = vqmovun_s16(pred0);
551     pred1_in = vqmovun_s16(pred1);
552     pred2_in = vqmovun_s16(pred2);
553     pred3_in = vqmovun_s16(pred3);
554     pred4_in = vqmovun_s16(pred4);
555     pred5_in = vqmovun_s16(pred5);
556     pred6_in = vqmovun_s16(pred6);
557     pred7_in = vqmovun_s16(pred7);
558 
559     vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
560     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
561     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
562     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
563     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
564     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
565     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
566     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);
567 
568     pu1_out_ptr = pu1_out + (8 * out_strd);
569     pi2_rsd_ptr = pi2_rsd + (8 * rsd_strd);
570     pu1_pred_ptr = pu1_pred + (8 * pred_strd);
571 
572     /*Sec row of 8, first 8x8*/
573     pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
574     pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
575     pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
576     pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
577     pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
578     pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
579     pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
580     pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));
581 
582     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
583     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
584     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
585     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
586     pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
587     pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
588     pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
589     pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));
590 
591     resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
592     resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
593     resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
594     resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
595     resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
596     resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
597     resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
598     resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));
599 
600     resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
601     resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
602     resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
603     resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
604     resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
605     resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
606     resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
607     resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);
608 
609     resd_b0_r01_in = vreinterpretq_s16_s64(
610         vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
611     resd_b0_r23_in = vreinterpretq_s16_s64(
612         vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
613     resd_b1_r01_in = vreinterpretq_s16_s64(
614         vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
615     resd_b1_r23_in = vreinterpretq_s16_s64(
616         vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
617     resd_b2_r45_in = vreinterpretq_s16_s64(
618         vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
619     resd_b2_r67_in = vreinterpretq_s16_s64(
620         vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
621     resd_b3_r45_in = vreinterpretq_s16_s64(
622         vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
623     resd_b3_r67_in = vreinterpretq_s16_s64(
624         vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
625     dup_val_1 = vabsq_s16(resd_b0_r01_in);
626     dup_val_2 = vabsq_s16(resd_b0_r23_in);
627     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
628     nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
629              dup_abs[6] || dup_abs[7];
630 
631     dup_val_1 = vabsq_s16(resd_b1_r01_in);
632     dup_val_2 = vabsq_s16(resd_b1_r23_in);
633     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
634     nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
635              dup_abs[6] || dup_abs[7];
636 
637     dup_val_1 = vabsq_s16(resd_b2_r45_in);
638     dup_val_2 = vabsq_s16(resd_b2_r67_in);
639     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
640     nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
641              dup_abs[6] || dup_abs[7];
642 
643     dup_val_1 = vabsq_s16(resd_b3_r45_in);
644     dup_val_2 = vabsq_s16(resd_b3_r67_in);
645     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
646     nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
647              dup_abs[6] || dup_abs[7];
648 
649     nnz |= (nnz_b0 << 8 | (nnz_b1 << 9) | (nnz_b2 << 12) | (nnz_b3 << 13));
650 
651     pred0 = vaddq_s16(pred0, resd0_in);
652     pred1 = vaddq_s16(pred1, resd1_in);
653     pred2 = vaddq_s16(pred2, resd2_in);
654     pred3 = vaddq_s16(pred3, resd3_in);
655     pred4 = vaddq_s16(pred4, resd4_in);
656     pred5 = vaddq_s16(pred5, resd5_in);
657     pred6 = vaddq_s16(pred6, resd6_in);
658     pred7 = vaddq_s16(pred7, resd7_in);
659 
660     pred0_in = vqmovun_s16(pred0);
661     pred1_in = vqmovun_s16(pred1);
662     pred2_in = vqmovun_s16(pred2);
663     pred3_in = vqmovun_s16(pred3);
664     pred4_in = vqmovun_s16(pred4);
665     pred5_in = vqmovun_s16(pred5);
666     pred6_in = vqmovun_s16(pred6);
667     pred7_in = vqmovun_s16(pred7);
668 
669     vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
670     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
671     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
672     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
673     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
674     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
675     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
676     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);
677 
678     /*Sec row of 8, Sec 8x8*/
679     pu1_out_ptr = pu1_out_ptr + 8;
680     pi2_rsd_ptr = pi2_rsd_ptr + 8;
681     pu1_pred_ptr = pu1_pred_ptr + 8;
682 
683     pred0_in = vld1_u8((uint8_t *) pu1_pred_ptr);
684     pred1_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd));
685     pred2_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 2));
686     pred3_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 3));
687     pred4_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 4));
688     pred5_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 5));
689     pred6_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 6));
690     pred7_in = vld1_u8((uint8_t *) pu1_pred_ptr + (pred_strd * 7));
691 
692     pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
693     pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
694     pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
695     pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
696     pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
697     pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
698     pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
699     pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));
700 
701     resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
702     resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
703     resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
704     resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
705     resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
706     resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
707     resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
708     resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));
709 
710     resd0_in_64x2 = vreinterpretq_s64_s16(resd0_in);
711     resd1_in_64x2 = vreinterpretq_s64_s16(resd1_in);
712     resd2_in_64x2 = vreinterpretq_s64_s16(resd2_in);
713     resd3_in_64x2 = vreinterpretq_s64_s16(resd3_in);
714     resd4_in_64x2 = vreinterpretq_s64_s16(resd4_in);
715     resd5_in_64x2 = vreinterpretq_s64_s16(resd5_in);
716     resd6_in_64x2 = vreinterpretq_s64_s16(resd6_in);
717     resd7_in_64x2 = vreinterpretq_s64_s16(resd7_in);
718 
719     resd_b0_r01_in = vreinterpretq_s16_s64(
720         vcombine_s64(vget_low_s64(resd0_in_64x2), vget_low_s64(resd1_in_64x2)));
721     resd_b0_r23_in = vreinterpretq_s16_s64(
722         vcombine_s64(vget_low_s64(resd2_in_64x2), vget_low_s64(resd3_in_64x2)));
723     resd_b1_r01_in = vreinterpretq_s16_s64(
724         vcombine_s64(vget_high_s64(resd0_in_64x2), vget_high_s64(resd1_in_64x2)));
725     resd_b1_r23_in = vreinterpretq_s16_s64(
726         vcombine_s64(vget_high_s64(resd2_in_64x2), vget_high_s64(resd3_in_64x2)));
727     resd_b2_r45_in = vreinterpretq_s16_s64(
728         vcombine_s64(vget_low_s64(resd4_in_64x2), vget_low_s64(resd5_in_64x2)));
729     resd_b2_r67_in = vreinterpretq_s16_s64(
730         vcombine_s64(vget_low_s64(resd6_in_64x2), vget_low_s64(resd7_in_64x2)));
731     resd_b3_r45_in = vreinterpretq_s16_s64(
732         vcombine_s64(vget_high_s64(resd4_in_64x2), vget_high_s64(resd5_in_64x2)));
733     resd_b3_r67_in = vreinterpretq_s16_s64(
734         vcombine_s64(vget_high_s64(resd6_in_64x2), vget_high_s64(resd7_in_64x2)));
735     dup_val_1 = vabsq_s16(resd_b0_r01_in);
736     dup_val_2 = vabsq_s16(resd_b0_r23_in);
737     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
738     nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
739              dup_abs[6] || dup_abs[7];
740 
741     dup_val_1 = vabsq_s16(resd_b1_r01_in);
742     dup_val_2 = vabsq_s16(resd_b1_r23_in);
743     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
744     nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
745              dup_abs[6] || dup_abs[7];
746 
747     dup_val_1 = vabsq_s16(resd_b2_r45_in);
748     dup_val_2 = vabsq_s16(resd_b2_r67_in);
749     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
750     nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
751              dup_abs[6] || dup_abs[7];
752 
753     dup_val_1 = vabsq_s16(resd_b3_r45_in);
754     dup_val_2 = vabsq_s16(resd_b3_r67_in);
755     dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
756     nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
757              dup_abs[6] || dup_abs[7];
758 
759     nnz |= (nnz_b0 << 10 | (nnz_b1 << 11) | (nnz_b2 << 14) | (nnz_b3 << 15));
760 
761     pred0 = vaddq_s16(pred0, resd0_in);
762     pred1 = vaddq_s16(pred1, resd1_in);
763     pred2 = vaddq_s16(pred2, resd2_in);
764     pred3 = vaddq_s16(pred3, resd3_in);
765     pred4 = vaddq_s16(pred4, resd4_in);
766     pred5 = vaddq_s16(pred5, resd5_in);
767     pred6 = vaddq_s16(pred6, resd6_in);
768     pred7 = vaddq_s16(pred7, resd7_in);
769 
770     pred0_in = vqmovun_s16(pred0);
771     pred1_in = vqmovun_s16(pred1);
772     pred2_in = vqmovun_s16(pred2);
773     pred3_in = vqmovun_s16(pred3);
774     pred4_in = vqmovun_s16(pred4);
775     pred5_in = vqmovun_s16(pred5);
776     pred6_in = vqmovun_s16(pred6);
777     pred7_in = vqmovun_s16(pred7);
778 
779     vst1_u8((uint8_t *) (pu1_out_ptr), pred0_in);
780     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd), pred1_in);
781     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 2), pred2_in);
782     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 3), pred3_in);
783     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 4), pred4_in);
784     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 5), pred5_in);
785     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 6), pred6_in);
786     vst1_u8((uint8_t *) (pu1_out_ptr + out_strd * 7), pred7_in);
787 
788     return nnz;
789 }
790 
791 /*****************************************************************************/
792 /*                                                                           */
793 /*  Function Name : isvcd_pred_residual_recon_chroma_8x8_neonintr             */
794 /*                                                                           */
795 /*  Description   : this function computes the recon data from the           */
796 /*                   pred and residual buffer                                */
797 /*                                                                           */
798 /*  Inputs        :                                                          */
799 /*  Globals       : none                                                     */
800 /*  Processing    :                                                          */
801 /*                                                                           */
802 /*  Outputs       : none                                                     */
803 /*  Returns       : none                                                     */
804 /*                                                                           */
805 /*  Issues        : none                                                     */
806 /*                                                                           */
807 /*  Revision History:                                                        */
808 /*                                                                           */
809 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
810 /*         25 11 2021   Kishore               creation                       */
811 /*                                                                           */
812 /*****************************************************************************/
813 
/**
 * Reconstructs an 8x8 chroma block: recon = clip_u8(pred + residual).
 *
 * The buffers hold interleaved Cb/Cr samples (16 bytes of prediction per
 * row). The block is processed as two 8-byte column halves per row. For
 * each half, 8 prediction bytes are widened to 16 bits, the 16-bit
 * residual row is added, and the sum is saturated back to unsigned 8 bits
 * (vqmovun). The result is merged into the existing output row through a
 * per-16-bit-lane 0x00ff mask, so only the even-indexed output bytes (one
 * chroma component on little-endian ARM) are overwritten; the interleaved
 * other component already present in pu1_out is preserved. Presumably the
 * caller invokes this once per chroma plane with the pointers offset
 * accordingly -- NOTE(review): confirm against caller.
 *
 * @param pu1_pred   prediction buffer (interleaved UV, 8 bits per sample)
 * @param pi2_rsd    residual buffer (16 bits per sample)
 * @param pu1_out    output recon buffer (interleaved UV, read-modify-write)
 * @param pred_strd  prediction buffer stride (in bytes)
 * @param rsd_strd   residual buffer stride (in 16-bit elements)
 * @param out_strd   output buffer stride (in bytes)
 */
void isvcd_pred_residual_recon_chroma_8x8_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd,
                                                   UWORD8 *pu1_out, WORD32 pred_strd,
                                                   WORD32 rsd_strd, WORD32 out_strd)
{
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    uint8x8_t pred4_in, pred5_in, pred6_in, pred7_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t pred4, pred5, pred6, pred7;
    int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;

    uint8x16_t pred0_inp_full, pred1_inp_full, pred2_inp_full, pred3_inp_full;
    uint8x16_t pred4_inp_full, pred5_inp_full, pred6_inp_full, pred7_inp_full;
    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
    uint8x8_t i4_out_horz_8x8_r4, i4_out_horz_8x8_r5, i4_out_horz_8x8_r6, i4_out_horz_8x8_r7;
    uint8x8_t chroma_mask_8x8;

    /* Load the full 16-byte (interleaved) prediction row for all 8 rows */
    pred0_inp_full = vld1q_u8((uint8_t *) pu1_pred);
    pred1_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 2));
    pred3_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 3));
    pred4_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 4));
    pred5_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 5));
    pred6_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 6));
    pred7_inp_full = vld1q_u8((uint8_t *) pu1_pred + (pred_strd * 7));

    /* First column half: low 8 bytes of each prediction row */
    pred0_in = vget_low_u8(pred0_inp_full);
    pred1_in = vget_low_u8(pred1_inp_full);
    pred2_in = vget_low_u8(pred2_inp_full);
    pred3_in = vget_low_u8(pred3_inp_full);
    pred4_in = vget_low_u8(pred4_inp_full);
    pred5_in = vget_low_u8(pred5_inp_full);
    pred6_in = vget_low_u8(pred6_inp_full);
    pred7_in = vget_low_u8(pred7_inp_full);

    /* Widen prediction to signed 16 bits so the residual can be added */
    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    /* Saturate pred + residual back to the unsigned 8-bit pixel range */
    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    /* 0x00ff per 16-bit lane -> selects even-indexed bytes (little endian) */
    chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));

    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);
    i4_out_horz_8x8_r4 = vld1_u8(pu1_out + out_strd * 4);
    i4_out_horz_8x8_r5 = vld1_u8(pu1_out + out_strd * 5);
    i4_out_horz_8x8_r6 = vld1_u8(pu1_out + out_strd * 6);
    i4_out_horz_8x8_r7 = vld1_u8(pu1_out + out_strd * 7);

    /* Merge: recon bytes where the mask is set, existing output elsewhere */
    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
    i4_out_horz_8x8_r4 = vbsl_u8(chroma_mask_8x8, pred4_in, i4_out_horz_8x8_r4);
    i4_out_horz_8x8_r5 = vbsl_u8(chroma_mask_8x8, pred5_in, i4_out_horz_8x8_r5);
    i4_out_horz_8x8_r6 = vbsl_u8(chroma_mask_8x8, pred6_in, i4_out_horz_8x8_r6);
    i4_out_horz_8x8_r7 = vbsl_u8(chroma_mask_8x8, pred7_in, i4_out_horz_8x8_r7);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 4), i4_out_horz_8x8_r4);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 5), i4_out_horz_8x8_r5);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 6), i4_out_horz_8x8_r6);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 7), i4_out_horz_8x8_r7);

    /* Second column half: high 8 bytes of each prediction row */
    pred0_in = vget_high_u8(pred0_inp_full);
    pred1_in = vget_high_u8(pred1_inp_full);
    pred2_in = vget_high_u8(pred2_inp_full);
    pred3_in = vget_high_u8(pred3_inp_full);
    pred4_in = vget_high_u8(pred4_inp_full);
    pred5_in = vget_high_u8(pred5_inp_full);
    pred6_in = vget_high_u8(pred6_inp_full);
    pred7_in = vget_high_u8(pred7_inp_full);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    pi2_rsd = pi2_rsd + 8;
    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    pu1_out = pu1_out + 8;
    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);
    i4_out_horz_8x8_r4 = vld1_u8(pu1_out + out_strd * 4);
    i4_out_horz_8x8_r5 = vld1_u8(pu1_out + out_strd * 5);
    i4_out_horz_8x8_r6 = vld1_u8(pu1_out + out_strd * 6);
    i4_out_horz_8x8_r7 = vld1_u8(pu1_out + out_strd * 7);

    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);
    i4_out_horz_8x8_r4 = vbsl_u8(chroma_mask_8x8, pred4_in, i4_out_horz_8x8_r4);
    i4_out_horz_8x8_r5 = vbsl_u8(chroma_mask_8x8, pred5_in, i4_out_horz_8x8_r5);
    i4_out_horz_8x8_r6 = vbsl_u8(chroma_mask_8x8, pred6_in, i4_out_horz_8x8_r6);
    i4_out_horz_8x8_r7 = vbsl_u8(chroma_mask_8x8, pred7_in, i4_out_horz_8x8_r7);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 4), i4_out_horz_8x8_r4);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 5), i4_out_horz_8x8_r5);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 6), i4_out_horz_8x8_r6);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 7), i4_out_horz_8x8_r7);
}
996 
997 /*****************************************************************************/
998 /*                                                                           */
999 /*  Function Name : isvcd_pred_residual_recon_chroma_4x4_neonintr             */
1000 /*                                                                           */
1001 /*  Description   : this function computes the recon data from the           */
1002 /*                   pred and residual buffer                                */
1003 /*                                                                           */
1004 /*  Inputs        :                                                          */
1005 /*  Globals       : none                                                     */
1006 /*  Processing    :                                                          */
1007 /*                                                                           */
1008 /*  Outputs       : none                                                     */
1009 /*  Returns       : none                                                     */
1010 /*                                                                           */
1011 /*  Issues        : none                                                     */
1012 /*                                                                           */
1013 /*  Revision History:                                                        */
1014 /*                                                                           */
1015 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1016 /*         25 11 2021   Kishore               creation                       */
1017 /*                                                                           */
1018 /*****************************************************************************/
1019 
/**
 * Reconstructs a 4x4 chroma block: recon = clip_u8(pred + residual).
 *
 * Each row is 8 bytes of interleaved Cb/Cr prediction. The prediction is
 * widened to 16 bits, the 16-bit residual row is added, and the sum is
 * saturated back to unsigned 8 bits (vqmovun). The result is merged into
 * the existing output row through a per-16-bit-lane 0x00ff mask, so only
 * the even-indexed output bytes (one chroma component on little-endian
 * ARM) are overwritten; the interleaved other component already present
 * in pu1_out is preserved. Presumably the caller invokes this once per
 * chroma plane with the pointers offset accordingly -- NOTE(review):
 * confirm against caller.
 *
 * @param pu1_pred   prediction buffer (interleaved UV, 8 bits per sample)
 * @param pi2_rsd    residual buffer (16 bits per sample)
 * @param pu1_out    output recon buffer (interleaved UV, read-modify-write)
 * @param pred_strd  prediction buffer stride (in bytes)
 * @param rsd_strd   residual buffer stride (in 16-bit elements)
 * @param out_strd   output buffer stride (in bytes)
 */
void isvcd_pred_residual_recon_chroma_4x4_neonintr(UWORD8 *pu1_pred, WORD16 *pi2_rsd,
                                                   UWORD8 *pu1_out, WORD32 pred_strd,
                                                   WORD32 rsd_strd, WORD32 out_strd)
{
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;

    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
    /* 0x00ff per 16-bit lane -> selects even-indexed bytes (little endian) */
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 2));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));

    /* Widen prediction to signed 16 bits so the residual can be added */
    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);

    /* Saturate pred + residual back to the unsigned 8-bit pixel range */
    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);

    /* Merge: recon bytes where the mask is set, existing output elsewhere */
    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);
}
1079 
1080 /*****************************************************************************/
1081 /*                                                                           */
1082 /*  Function Name : isvcd_residual_luma_4x4_neonintr                          */
1083 /*                                                                           */
1084 /*  Description   : this function computes the nnz from resd                 */
1085 /*                                                                           */
1086 /*  Inputs        :                                                          */
1087 /*  Globals       : none                                                     */
1088 /*  Processing    :                                                          */
1089 /*                                                                           */
1090 /*  Outputs       : none                                                     */
1091 /*  Returns       : nnz                                                      */
1092 /*                                                                           */
1093 /*  Issues        : none                                                     */
1094 /*                                                                           */
1095 /*  Revision History:                                                        */
1096 /*                                                                           */
1097 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1098 /*         25 11 2021   Kishore               creation                       */
1099 /*                                                                           */
1100 /*****************************************************************************/
1101 
/**
 * Returns 1 if any coefficient of a 4x4 luma residual block is non-zero,
 * 0 otherwise (the per-block nnz flag).
 *
 * Only the first four 16-bit coefficients of each of the four rows belong
 * to the block, so each row is loaded as a 64-bit vector. Absolute values
 * are combined with saturating adds (so large magnitudes cannot wrap back
 * to zero) and the four accumulated lanes are OR-reduced to a flag.
 *
 * @param pi2_rsd   residual buffer (16 bits per coefficient)
 * @param rsd_strd  residual buffer stride (in 16-bit elements)
 * @return 1 if the block has at least one non-zero coefficient, else 0
 */
WORD32 isvcd_residual_luma_4x4_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    int16x4_t row0, row1, row2, row3;
    int16x4_t sum01, sum23, acc;
    WORD32 i4_nnz;

    row0 = vld1_s16((int16_t *) pi2_rsd);
    row1 = vld1_s16((int16_t *) pi2_rsd + rsd_strd);
    row2 = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    row3 = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    /* Saturating adds of |coeff| keep every lane non-zero iff any source
       coefficient in that column position was non-zero */
    sum01 = vqadd_s16(vabs_s16(row0), vabs_s16(row1));
    sum23 = vqadd_s16(vabs_s16(row2), vabs_s16(row3));
    acc = vqadd_s16(sum01, sum23);

    i4_nnz = acc[0] || acc[1] || acc[2] || acc[3];

    return i4_nnz;
}
1127 
1128 /*****************************************************************************/
1129 /*                                                                           */
1130 /*  Function Name : isvcd_residual_luma_8x8_neonintr                          */
1131 /*                                                                           */
1132 /*  Description   : this function computes the nnz from resd                 */
1133 /*                                                                           */
1134 /*  Inputs        :                                                          */
1135 /*  Globals       : none                                                     */
1136 /*  Processing    :                                                          */
1137 /*                                                                           */
1138 /*  Outputs       : none                                                     */
1139 /*  Returns       : nnz                                                      */
1140 /*                                                                           */
1141 /*  Issues        : none                                                     */
1142 /*                                                                           */
1143 /*  Revision History:                                                        */
1144 /*                                                                           */
1145 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1146 /*         25 11 2021   Kishore               creation                       */
1147 /*                                                                           */
1148 /*****************************************************************************/
1149 
WORD32 isvcd_residual_luma_8x8_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    /* Scans an 8x8 luma residual block and reports, per 4x4 sub-block,   */
    /* whether any residual sample is non-zero.                           */
    /* Returned bitmask (as in the original layout): bit0 = top-left 4x4, */
    /* bit1 = top-right, bit4 = bottom-left, bit5 = bottom-right.         */
    int16x8_t row[8];
    int16x4_t acc_tl, acc_tr, acc_bl, acc_br;
    WORD32 nnz_tl, nnz_tr, nnz_bl, nnz_br;
    WORD32 i;

    /* Load the eight residual rows; rsd_strd is in 16-bit units */
    for(i = 0; i < 8; i++)
    {
        row[i] = vld1q_s16((int16_t *) (pi2_rsd + i * rsd_strd));
    }

    /* Bitwise-OR accumulation over the four rows of each 4x4 sub-block:  */
    /* an accumulator lane is non-zero iff any contributing sample is     */
    /* non-zero, so the OR-tree is equivalent to the abs/sat-add scheme   */
    acc_tl = vorr_s16(vorr_s16(vget_low_s16(row[0]), vget_low_s16(row[1])),
                      vorr_s16(vget_low_s16(row[2]), vget_low_s16(row[3])));
    acc_tr = vorr_s16(vorr_s16(vget_high_s16(row[0]), vget_high_s16(row[1])),
                      vorr_s16(vget_high_s16(row[2]), vget_high_s16(row[3])));
    acc_bl = vorr_s16(vorr_s16(vget_low_s16(row[4]), vget_low_s16(row[5])),
                      vorr_s16(vget_low_s16(row[6]), vget_low_s16(row[7])));
    acc_br = vorr_s16(vorr_s16(vget_high_s16(row[4]), vget_high_s16(row[5])),
                      vorr_s16(vget_high_s16(row[6]), vget_high_s16(row[7])));

    /* The 64-bit view of a 4-lane accumulator is non-zero iff any of its */
    /* 16-bit lanes is non-zero                                           */
    nnz_tl = (vget_lane_s64(vreinterpret_s64_s16(acc_tl), 0) != 0);
    nnz_tr = (vget_lane_s64(vreinterpret_s64_s16(acc_tr), 0) != 0);
    nnz_bl = (vget_lane_s64(vreinterpret_s64_s16(acc_bl), 0) != 0);
    nnz_br = (vget_lane_s64(vreinterpret_s64_s16(acc_br), 0) != 0);

    return (nnz_tl | (nnz_tr << 1) | (nnz_bl << 4) | (nnz_br << 5));
}
1232 
1233 /*****************************************************************************/
1234 /*                                                                           */
1235 /*  Function Name : isvcd_residual_luma_16x16_neonintr                        */
1236 /*                                                                           */
1237 /*  Description   : this function computes the nnz from resd                 */
1238 /*                                                                           */
1239 /*  Inputs        :                                                          */
1240 /*  Globals       : none                                                     */
1241 /*  Processing    :                                                          */
1242 /*                                                                           */
1243 /*  Outputs       : none                                                     */
1244 /*  Returns       : nnz                                                      */
1245 /*                                                                           */
1246 /*  Issues        : none                                                     */
1247 /*                                                                           */
1248 /*  Revision History:                                                        */
1249 /*                                                                           */
1250 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1251 /*         25 11 2021   Kishore               creation                       */
1252 /*                                                                           */
1253 /*****************************************************************************/
1254 
/* Computes the nnz bitmask of one 8x8 quadrant of the residual block.     */
/* Bit layout of the returned value: bit0 = top-left 4x4, bit1 =           */
/* top-right, bit4 = bottom-left, bit5 = bottom-right — the same shape     */
/* each quadrant had in the unrolled original, so the 16x16 caller only    */
/* shifts it into position.                                                */
static WORD32 isvcd_nnz_luma_8x8_quad_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    int16x8_t row[8];
    int16x4_t acc_tl, acc_tr, acc_bl, acc_br;
    WORD32 nnz_tl, nnz_tr, nnz_bl, nnz_br;
    WORD32 i;

    /* Load the eight residual rows of this quadrant */
    for(i = 0; i < 8; i++)
    {
        row[i] = vld1q_s16((int16_t *) (pi2_rsd + i * rsd_strd));
    }

    /* OR-accumulate each 4x4 sub-block: a lane is non-zero iff any of the */
    /* four source samples in that column is non-zero (equivalent to the   */
    /* abs + saturating-add scheme used previously)                        */
    acc_tl = vorr_s16(vorr_s16(vget_low_s16(row[0]), vget_low_s16(row[1])),
                      vorr_s16(vget_low_s16(row[2]), vget_low_s16(row[3])));
    acc_tr = vorr_s16(vorr_s16(vget_high_s16(row[0]), vget_high_s16(row[1])),
                      vorr_s16(vget_high_s16(row[2]), vget_high_s16(row[3])));
    acc_bl = vorr_s16(vorr_s16(vget_low_s16(row[4]), vget_low_s16(row[5])),
                      vorr_s16(vget_low_s16(row[6]), vget_low_s16(row[7])));
    acc_br = vorr_s16(vorr_s16(vget_high_s16(row[4]), vget_high_s16(row[5])),
                      vorr_s16(vget_high_s16(row[6]), vget_high_s16(row[7])));

    /* 64-bit view of a 4-lane accumulator is non-zero iff any lane is */
    nnz_tl = (vget_lane_s64(vreinterpret_s64_s16(acc_tl), 0) != 0);
    nnz_tr = (vget_lane_s64(vreinterpret_s64_s16(acc_tr), 0) != 0);
    nnz_bl = (vget_lane_s64(vreinterpret_s64_s16(acc_bl), 0) != 0);
    nnz_br = (vget_lane_s64(vreinterpret_s64_s16(acc_br), 0) != 0);

    return (nnz_tl | (nnz_tr << 1) | (nnz_bl << 4) | (nnz_br << 5));
}

/* Computes the 16-bit nnz mask for a 16x16 luma residual block, one bit   */
/* per 4x4 sub-block. The previous implementation unrolled the identical   */
/* 8x8 scan four times (~270 lines); it is now one helper called per       */
/* quadrant. Bit positions are unchanged:                                  */
/*   top-left  8x8 -> bits {0,1,4,5}      (quadrant mask << 0)             */
/*   top-right 8x8 -> bits {2,3,6,7}      (quadrant mask << 2)             */
/*   bottom-left   -> bits {8,9,12,13}    (quadrant mask << 8)             */
/*   bottom-right  -> bits {10,11,14,15}  (quadrant mask << 10)            */
WORD32 isvcd_residual_luma_16x16_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 nnz;

    nnz = isvcd_nnz_luma_8x8_quad_neonintr(pi2_rsd, rsd_strd);
    nnz |= isvcd_nnz_luma_8x8_quad_neonintr(pi2_rsd + 8, rsd_strd) << 2;
    nnz |= isvcd_nnz_luma_8x8_quad_neonintr(pi2_rsd + 8 * rsd_strd, rsd_strd) << 8;
    nnz |= isvcd_nnz_luma_8x8_quad_neonintr(pi2_rsd + 8 * rsd_strd + 8, rsd_strd) << 10;

    return nnz;
}
1526 
1527 /*****************************************************************************/
1528 /*                                                                           */
1529 /*  Function Name : isvcd_residual_chroma_cb_cr_8x8_neonintr                  */
1530 /*                                                                           */
1531 /*  Description   : this function computes the nnz from resd                 */
1532 /*                                                                           */
1533 /*  Inputs        :                                                          */
1534 /*  Globals       : none                                                     */
1535 /*  Processing    :                                                          */
1536 /*                                                                           */
1537 /*  Outputs       : none                                                     */
1538 /*  Returns       : nnz                                                      */
1539 /*                                                                           */
1540 /*  Issues        : none                                                     */
1541 /*                                                                           */
1542 /*  Revision History:                                                        */
1543 /*                                                                           */
1544 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1545 /*         25 11 2021   Kishore               creation                       */
1546 /*                                                                           */
1547 /*****************************************************************************/
1548 
WORD32 isvcd_residual_chroma_cb_cr_8x8_neonintr(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    /* Scans an 8x8 interleaved CbCr residual block and reports, per 4x4   */
    /* sub-block of each plane, whether any residual sample is non-zero.   */
    /* Returned bitmask (as in the original layout): Cb sub-blocks in bits */
    /* 0..3, Cr sub-blocks in bits 4..7.                                   */
    int16x8x2_t row[8]; /* val[0] = even-indexed (Cb) lanes, val[1] = odd (Cr) */
    int16x4_t cb_acc[4], cr_acc[4];
    WORD32 nnz_cb, nnz_cr;
    WORD32 i;

    /* De-interleave each row: 16 interleaved samples -> 8 Cb + 8 Cr */
    for(i = 0; i < 8; i++)
    {
        row[i] = vld2q_s16((int16_t *) (pi2_rsd + i * rsd_strd));
    }

    /* OR-accumulate the four 4x4 sub-blocks of each plane; an accumulator */
    /* lane is non-zero iff any contributing sample is non-zero, which is  */
    /* equivalent to the abs + saturating-add scheme used previously       */
    cb_acc[0] = vorr_s16(vorr_s16(vget_low_s16(row[0].val[0]), vget_low_s16(row[1].val[0])),
                         vorr_s16(vget_low_s16(row[2].val[0]), vget_low_s16(row[3].val[0])));
    cb_acc[1] = vorr_s16(vorr_s16(vget_high_s16(row[0].val[0]), vget_high_s16(row[1].val[0])),
                         vorr_s16(vget_high_s16(row[2].val[0]), vget_high_s16(row[3].val[0])));
    cb_acc[2] = vorr_s16(vorr_s16(vget_low_s16(row[4].val[0]), vget_low_s16(row[5].val[0])),
                         vorr_s16(vget_low_s16(row[6].val[0]), vget_low_s16(row[7].val[0])));
    cb_acc[3] = vorr_s16(vorr_s16(vget_high_s16(row[4].val[0]), vget_high_s16(row[5].val[0])),
                         vorr_s16(vget_high_s16(row[6].val[0]), vget_high_s16(row[7].val[0])));

    cr_acc[0] = vorr_s16(vorr_s16(vget_low_s16(row[0].val[1]), vget_low_s16(row[1].val[1])),
                         vorr_s16(vget_low_s16(row[2].val[1]), vget_low_s16(row[3].val[1])));
    cr_acc[1] = vorr_s16(vorr_s16(vget_high_s16(row[0].val[1]), vget_high_s16(row[1].val[1])),
                         vorr_s16(vget_high_s16(row[2].val[1]), vget_high_s16(row[3].val[1])));
    cr_acc[2] = vorr_s16(vorr_s16(vget_low_s16(row[4].val[1]), vget_low_s16(row[5].val[1])),
                         vorr_s16(vget_low_s16(row[6].val[1]), vget_low_s16(row[7].val[1])));
    cr_acc[3] = vorr_s16(vorr_s16(vget_high_s16(row[4].val[1]), vget_high_s16(row[5].val[1])),
                         vorr_s16(vget_high_s16(row[6].val[1]), vget_high_s16(row[7].val[1])));

    /* Fold each accumulator into one nnz bit: the 64-bit view of a 4-lane */
    /* vector is non-zero iff any 16-bit lane is non-zero.                 */
    /* Sub-block order per plane: bit0 = rows 0-3 low half, bit1 = rows    */
    /* 0-3 high half, bit2 = rows 4-7 low half, bit3 = rows 4-7 high half  */
    nnz_cb = 0;
    nnz_cr = 0;
    for(i = 0; i < 4; i++)
    {
        nnz_cb |= (vget_lane_s64(vreinterpret_s64_s16(cb_acc[i]), 0) != 0) << i;
        nnz_cr |= (vget_lane_s64(vreinterpret_s64_s16(cr_acc[i]), 0) != 0) << i;
    }

    return ((nnz_cr << 4) | nnz_cb);
}
1695