/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
*******************************************************************************
* @file
*  isvcd_residual_resamp_neonintr.c
*
* @brief
*  Contains routines that perform residual resampling for SVC
*
* @author
*  Kishore
*
*  @par List of Functions:
*  - isvcd_pred_residual_recon_4x4_neonintr()
*  - isvcd_pred_residual_recon_8x8_neonintr()
*  - isvcd_pred_residual_recon_16x16_neonintr()
*  - isvcd_pred_residual_recon_chroma_4x4_neonintr()
*  - isvcd_pred_residual_recon_chroma_8x8_neonintr()
*  - isvcd_residual_luma_4x4_neonintr()
*  - isvcd_residual_luma_8x8_neonintr()
*  - isvcd_residual_luma_16x16_neonintr()
*  - isvcd_residual_chroma_cb_cr_8x8_neonintr()
*
* @remarks
*
*******************************************************************************
*/

/*!
 **************************************************************************
 * \file isvcd_residual_resamp_neonintr.c
 *
 * \brief
 *    Contains routines that perform residual resampling for SVC
 *
 * Detailed_description
 *
 * \date
 *
 *
 * \author : kishore
 **************************************************************************
 */
#include <assert.h>
#include <string.h>
#include <arm_neon.h>

#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "isvcd_structs.h"
#include "ih264_debug.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_dyadic_neonintr                      */
/*                                                                           */
/*  Description   : this function does the upsampling of luma residuals for  */
/*                  Dyadic cases                                             */
/*                                                                           */
/*  Inputs        : pv_residual_samp_ctxt : Residual upsampling context      */
/*                  pi2_inp_data : input 16 bit residual data pointer        */
/*                  i4_inp_data_stride : input buffer stride                 */
/*                  pi2_out_res : output 16 bit buffer pointer               */
/*                  i4_out_res_stride : Output buffer stride                 */
/*                  ps_ref_mb_mode : reference mb mode pointer of base layer */
/*                  u2_mb_x, u2_mb_y : current mb co-ordinates               */
/*                  i4_ref_nnz : reference layer nnz map of the current mb   */
/*                  i4_ref_tx_size : reference layer transform size          */
/*  Globals       : none                                                     */
/*  Processing    : it does the upsampling with fixed phase values and       */
/*                  reference layer transform size                           */
/*  Outputs       : Upsampled residuals for luma                             */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         26 05 2021   Dolan          creation                              */
/*                                                                           */
/*****************************************************************************/

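/* Reference scalar view of the dyadic (2x) upsampling implemented below    */
/* (an illustrative sketch of the arithmetic, not decoder code). With input */
/* samples a = in[x] and b = in[x + 1], the horizontal pass produces the    */
/* interleaved interior samples                                             */
/*      horz[2*x + 1] = 3*a + b,    horz[2*x + 2] = a + 3*b,                */
/* with edge samples 4*in[0] and 4*in[7] (all at 4x scale). The vertical    */
/* pass then blends adjacent rows (r0, r1) of that buffer as                */
/*      out_top = (3*r0 + r1 + 8) >> 4,  out_bot = (r0 + 3*r1 + 8) >> 4,    */
/* while the first and last output rows are simply (r + 2) >> 2.            */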
void isvcd_residual_luma_dyadic_neonintr(void *pv_residual_samp_ctxt, WORD16 *pi2_inp_data,
                                         WORD32 i4_inp_data_stride, WORD16 *pi2_out_res,
                                         WORD32 i4_out_res_stride, mem_element_t *ps_ref_mb_mode,
                                         UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_ref_nnz,
                                         WORD32 i4_ref_tx_size)
{
    WORD16 *pi2_refarray_buffer;
    WORD32 i4_blk_ctr;
    residual_sampling_ctxt_t *ps_ctxt;
    UNUSED(ps_ref_mb_mode);
    UNUSED(u2_mb_x);
    UNUSED(u2_mb_y);

    ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
    pi2_refarray_buffer = ps_ctxt->pi2_refarray_buffer;

    /* based on transform size the counter and interpolation width and */
    /* height are initialised as follows                               */
    if((i4_ref_tx_size) && (0 != i4_ref_nnz))
    {
        WORD16 *pi2_ref_data_byte;
        WORD32 *pi4_ref_array;
        WORD32 i4_i, i4_j;
        /* ----------- Horizontal Interpolation ---------------- */
        int16x8_t i2_coeff_add_16x8_r0;
        int16x8_t i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1;
        int16x8_t i2_coeff_16x8_sl_r0_0, i2_coeff_16x8_sl_r0_1;
        int16x8_t result_16x8_r0_0, result_16x8_r0_1;
        int16x8_t final_result_16x8_r0_0, final_result_16x8_r0_1;

        int16x8_t i2_coeff_add_16x8_r1;
        int16x8_t i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1;
        int16x8_t i2_coeff_16x8_sl_r1_0, i2_coeff_16x8_sl_r1_1;
        int16x8_t result_16x8_r1_0, result_16x8_r1_1;
        int16x8_t final_result_16x8_r1_0, final_result_16x8_r1_1;
        int16x8x2_t result_16x8x2_t_0;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

        for(i4_i = 0; i4_i < BLOCK_HEIGHT; i4_i += 2)
        {
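            /* two input rows are filtered per iteration; each neighbour    */
            /* pair (a, b) yields 3*a + b and a + 3*b, interleaved below    */
            /* via vzipq                                                    */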
            i2_coeff_16x8_r0_0 = vld1q_s16(pi2_ref_data_byte);
            i2_coeff_16x8_r0_1 = vld1q_s16((pi2_ref_data_byte + 1));

            i2_coeff_16x8_r1_0 = vld1q_s16(pi2_ref_data_byte + i4_inp_data_stride);
            i2_coeff_16x8_r1_1 = vld1q_s16((pi2_ref_data_byte + i4_inp_data_stride + 1));

            i2_coeff_add_16x8_r0 = vaddq_s16(i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1);
            i2_coeff_16x8_sl_r0_0 = vshlq_n_s16(i2_coeff_16x8_r0_0, 1);
            i2_coeff_16x8_sl_r0_1 = vshlq_n_s16(i2_coeff_16x8_r0_1, 1);

            i2_coeff_add_16x8_r1 = vaddq_s16(i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1);
            i2_coeff_16x8_sl_r1_0 = vshlq_n_s16(i2_coeff_16x8_r1_0, 1);
            i2_coeff_16x8_sl_r1_1 = vshlq_n_s16(i2_coeff_16x8_r1_1, 1);

            result_16x8_r0_0 = vaddq_s16(i2_coeff_16x8_sl_r0_0, i2_coeff_add_16x8_r0);
            result_16x8_r0_1 = vaddq_s16(i2_coeff_16x8_sl_r0_1, i2_coeff_add_16x8_r0);

            result_16x8_r1_0 = vaddq_s16(i2_coeff_16x8_sl_r1_0, i2_coeff_add_16x8_r1);
            result_16x8_r1_1 = vaddq_s16(i2_coeff_16x8_sl_r1_1, i2_coeff_add_16x8_r1);

            result_16x8x2_t_0 = vzipq_s16(result_16x8_r0_0, result_16x8_r0_1);
            final_result_16x8_r0_0 = result_16x8x2_t_0.val[0];
            final_result_16x8_r0_1 = result_16x8x2_t_0.val[1];

            result_16x8x2_t_0 = vzipq_s16(result_16x8_r1_0, result_16x8_r1_1);
            final_result_16x8_r1_0 = result_16x8x2_t_0.val[0];
            final_result_16x8_r1_1 = result_16x8x2_t_0.val[1];

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8_r0_0)));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8_r0_0)));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8_r0_1)));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8_r0_1)));
            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            pi2_ref_data_byte += i4_inp_data_stride;

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8_r1_0)));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8_r1_0)));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8_r1_1)));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8_r1_1)));

            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
        }
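        /* pi2_refarray_buffer (viewed as WORD32) now holds the 16x16       */
        /* horizontally upsampled residual at 4x scale; the vertical pass   */
        /* below blends adjacent rows and rounds by >> 4 (interior rows)    */
        /* or >> 2 (first and last rows)                                    */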
        /* ----------- Vertical Interpolation ---------------- */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

        {
            WORD32 *pi4_ref_array_temp;
            WORD16 *pi2_out;
            int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r1_3,
                i4_horz_samp_32x4_r1_4;
            int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2, i4_horz_samp_32x4_r2_3,
                i4_horz_samp_32x4_r2_4;

            int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2, i4_horz_res_32x4_r1_3,
                i4_horz_res_32x4_r1_4;
            int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2, i4_horz_res_32x4_r2_3,
                i4_horz_res_32x4_r2_4;
            int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2, i4_horz_res_32x4_r3_3,
                i4_horz_res_32x4_r3_4;
            int32x4_t horz_add_32x4_r2_1, horz_add_32x4_r2_2, horz_add_32x4_r2_3,
                horz_add_32x4_r2_4;

            int16x8_t comb_horz_16x8_1, comb_horz_16x8_2, comb_horz_16x8_3, comb_horz_16x8_4;
            pi4_ref_array_temp = pi4_ref_array;
            pi2_out = pi2_out_res;

            i4_horz_samp_32x4_r1_1 = vld1q_s32(pi4_ref_array_temp);
            i4_horz_samp_32x4_r1_2 = vld1q_s32(pi4_ref_array_temp + 4);
            i4_horz_samp_32x4_r1_3 = vld1q_s32(pi4_ref_array_temp + 8);
            i4_horz_samp_32x4_r1_4 = vld1q_s32(pi4_ref_array_temp + 12);

            /* populate the first inter sample */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            pi2_out += i4_out_res_stride;

            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
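                /* blend row pair (r1, r2) into two output rows:            */
                /* (3*r1 + r2 + 8) >> 4 and (r1 + 3*r2 + 8) >> 4            */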
                pi4_ref_array_temp += MB_WIDTH;
                i4_horz_samp_32x4_r2_1 = vld1q_s32(pi4_ref_array_temp);
                i4_horz_samp_32x4_r2_2 = vld1q_s32(pi4_ref_array_temp + 4);
                i4_horz_samp_32x4_r2_3 = vld1q_s32(pi4_ref_array_temp + 8);
                i4_horz_samp_32x4_r2_4 = vld1q_s32(pi4_ref_array_temp + 12);

                horz_add_32x4_r2_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r2_1);
                horz_add_32x4_r2_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r2_2);
                horz_add_32x4_r2_3 = vaddq_s32(i4_horz_samp_32x4_r1_3, i4_horz_samp_32x4_r2_3);
                horz_add_32x4_r2_4 = vaddq_s32(i4_horz_samp_32x4_r1_4, i4_horz_samp_32x4_r2_4);

                i4_horz_res_32x4_r2_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r2_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r2_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r2_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_4, 1), horz_add_32x4_r2_4);

                i4_horz_res_32x4_r3_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r3_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r3_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r3_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_4, 1), horz_add_32x4_r2_4);

                i4_horz_res_32x4_r2_1 = vrshrq_n_s32(i4_horz_res_32x4_r2_1, 4);
                i4_horz_res_32x4_r2_2 = vrshrq_n_s32(i4_horz_res_32x4_r2_2, 4);
                i4_horz_res_32x4_r2_3 = vrshrq_n_s32(i4_horz_res_32x4_r2_3, 4);
                i4_horz_res_32x4_r2_4 = vrshrq_n_s32(i4_horz_res_32x4_r2_4, 4);

                i4_horz_res_32x4_r3_1 = vrshrq_n_s32(i4_horz_res_32x4_r3_1, 4);
                i4_horz_res_32x4_r3_2 = vrshrq_n_s32(i4_horz_res_32x4_r3_2, 4);
                i4_horz_res_32x4_r3_3 = vrshrq_n_s32(i4_horz_res_32x4_r3_3, 4);
                i4_horz_res_32x4_r3_4 = vrshrq_n_s32(i4_horz_res_32x4_r3_4, 4);

                comb_horz_16x8_1 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_1),
                                                vmovn_s32(i4_horz_res_32x4_r2_2));
                comb_horz_16x8_2 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_3),
                                                vmovn_s32(i4_horz_res_32x4_r2_4));

                comb_horz_16x8_3 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_1),
                                                vmovn_s32(i4_horz_res_32x4_r3_2));
                comb_horz_16x8_4 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_3),
                                                vmovn_s32(i4_horz_res_32x4_r3_4));

                /* populate 2 samples based on current coeffs */
                vst1q_s16(pi2_out, comb_horz_16x8_1);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_2);
                pi2_out += i4_out_res_stride;

                vst1q_s16(pi2_out, comb_horz_16x8_3);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_4);
                pi2_out += i4_out_res_stride;

                /* store the coeff 2 to coeff 1 */
                /* (used in next iteration)     */
                i4_horz_samp_32x4_r1_1 = i4_horz_samp_32x4_r2_1;
                i4_horz_samp_32x4_r1_2 = i4_horz_samp_32x4_r2_2;
                i4_horz_samp_32x4_r1_3 = i4_horz_samp_32x4_r2_3;
                i4_horz_samp_32x4_r1_4 = i4_horz_samp_32x4_r2_4;
            }

            /* populate the last inter sample */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            /* horizontal loop updates */
            pi4_ref_array++;
            pi2_out_res++;
        }
    }
    else
    {
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                        */
        /* ----------------------------------------------------------------- */
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
            /* if reference layer is not coded then no processing */
            if(0 != (i4_ref_nnz & 0x1))
            {
                int16x8_t i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_1;
                int16x8_t i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_1;
                int16x8_t i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_1;
                int16x8_t i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_1;
                int16x8_t i2_add_16x8_r0_0;
                int16x8_t i2_add_16x8_r1_0;
                int16x8_t i2_add_16x8_r2_0;
                int16x8_t i2_add_16x8_r3_0;
                int16x8_t i2_res_16x8_r0_0, i2_res_16x8_r0_1;
                int16x8_t i2_res_16x8_r1_0, i2_res_16x8_r1_1;
                int16x8_t i2_res_16x8_r2_0, i2_res_16x8_r2_1;
                int16x8_t i2_res_16x8_r3_0, i2_res_16x8_r3_1;

                i2_coeff1_16x8_r0_0 = vld1q_s16(pi2_inp_data);
                i2_coeff1_16x8_r1_0 = vld1q_s16(pi2_inp_data + i4_inp_data_stride);
                i2_coeff1_16x8_r2_0 = vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1));
                i2_coeff1_16x8_r3_0 =
                    vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1) + i4_inp_data_stride);

                i2_coeff1_16x8_r0_1 = vextq_s16(i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_1 = vextq_s16(i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_1 = vextq_s16(i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_1 = vextq_s16(i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_0, 1);

                i2_add_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_coeff1_16x8_r0_0);
                i2_add_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_coeff1_16x8_r1_0);
                i2_add_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_coeff1_16x8_r2_0);
                i2_add_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_coeff1_16x8_r3_0);

                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                i2_coeff1_16x8_r0_1 = vshlq_n_s16(i2_coeff1_16x8_r0_1, 1);
                i2_coeff1_16x8_r1_1 = vshlq_n_s16(i2_coeff1_16x8_r1_1, 1);
                i2_coeff1_16x8_r2_1 = vshlq_n_s16(i2_coeff1_16x8_r2_1, 1);
                i2_coeff1_16x8_r3_1 = vshlq_n_s16(i2_coeff1_16x8_r3_1, 1);

                i2_res_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_0, i2_add_16x8_r0_0);
                i2_res_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_0, i2_add_16x8_r1_0);
                i2_res_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_0, i2_add_16x8_r2_0);
                i2_res_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_0, i2_add_16x8_r3_0);

                i2_res_16x8_r0_1 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_add_16x8_r0_0);
                i2_res_16x8_r1_1 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_add_16x8_r1_0);
                i2_res_16x8_r2_1 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_add_16x8_r2_0);
                i2_res_16x8_r3_1 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_add_16x8_r3_0);

                i2_res_16x8_r0_0 = vzipq_s16(i2_res_16x8_r0_0, i2_res_16x8_r0_1).val[0];
                i2_res_16x8_r1_0 = vzipq_s16(i2_res_16x8_r1_0, i2_res_16x8_r1_1).val[0];
                i2_res_16x8_r2_0 = vzipq_s16(i2_res_16x8_r2_0, i2_res_16x8_r2_1).val[0];
                i2_res_16x8_r3_0 = vzipq_s16(i2_res_16x8_r3_0, i2_res_16x8_r3_1).val[0];

                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                vst1q_s16(pi2_refarray_buffer + 1, i2_res_16x8_r0_0);
                vst1q_lane_s16(pi2_refarray_buffer, i2_coeff1_16x8_r0_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 7, i2_coeff1_16x8_r0_0, 3);

                vst1q_s16(pi2_refarray_buffer + 9, i2_res_16x8_r1_0);
                vst1q_lane_s16(pi2_refarray_buffer + 8, i2_coeff1_16x8_r1_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 15, i2_coeff1_16x8_r1_0, 3);

                vst1q_s16(pi2_refarray_buffer + 17, i2_res_16x8_r2_0);
                vst1q_lane_s16(pi2_refarray_buffer + 16, i2_coeff1_16x8_r2_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 23, i2_coeff1_16x8_r2_0, 3);

                vst1q_s16(pi2_refarray_buffer + 25, i2_res_16x8_r3_0);
                vst1q_lane_s16(pi2_refarray_buffer + 24, i2_coeff1_16x8_r3_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 31, i2_coeff1_16x8_r3_0, 3);

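                /* pi2_refarray_buffer now holds the 8x4 horizontally       */
                /* upsampled block at 4x scale (edge samples are 4*input);  */
                /* the block below applies the matching vertical blend      */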
                {
                    int16x4_t i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r0_2;
                    int16x4_t i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r1_2;
                    int16x4_t i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r2_2;
                    int16x4_t i4_horz_samp_16x4_r3_1, i4_horz_samp_16x4_r3_2;

                    int32x4_t i4_horz_samp_32x4_r0_1, i4_horz_samp_32x4_r0_2;
                    int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2;
                    int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2;
                    int32x4_t i4_horz_samp_32x4_r3_1, i4_horz_samp_32x4_r3_2;

                    int32x4_t i4_horz_add_32x4_r1_1, i4_horz_add_32x4_r1_2;
                    int32x4_t i4_horz_add_32x4_r2_1, i4_horz_add_32x4_r2_2;
                    int32x4_t i4_horz_add_32x4_r3_1, i4_horz_add_32x4_r3_2;

                    int16x4_t i4_horz_res_16x4_r0_1, i4_horz_res_16x4_r0_2;
                    int16x4_t i4_horz_res_16x4_r1_1, i4_horz_res_16x4_r1_2;
                    int16x4_t i4_horz_res_16x4_r2_1, i4_horz_res_16x4_r2_2;
                    int16x4_t i4_horz_res_16x4_r3_1, i4_horz_res_16x4_r3_2;
                    int16x4_t i4_horz_res_16x4_r4_1, i4_horz_res_16x4_r4_2;
                    int16x4_t i4_horz_res_16x4_r5_1, i4_horz_res_16x4_r5_2;
                    int16x4_t i4_horz_res_16x4_r6_1, i4_horz_res_16x4_r6_2;
                    int16x4_t i4_horz_res_16x4_r7_1, i4_horz_res_16x4_r7_2;

                    int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2;
                    int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2;
                    int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2;
                    int32x4_t i4_horz_res_32x4_r4_1, i4_horz_res_32x4_r4_2;
                    int32x4_t i4_horz_res_32x4_r5_1, i4_horz_res_32x4_r5_2;
                    int32x4_t i4_horz_res_32x4_r6_1, i4_horz_res_32x4_r6_2;

                    i4_horz_samp_16x4_r0_1 = vld1_s16(pi2_refarray_buffer);
                    i4_horz_samp_16x4_r0_2 = vld1_s16(pi2_refarray_buffer + 4);

                    i4_horz_samp_16x4_r1_1 = vld1_s16(pi2_refarray_buffer + 8);
                    i4_horz_samp_16x4_r1_2 = vld1_s16(pi2_refarray_buffer + 12);

                    i4_horz_samp_16x4_r2_1 = vld1_s16(pi2_refarray_buffer + 16);
                    i4_horz_samp_16x4_r2_2 = vld1_s16(pi2_refarray_buffer + 20);

                    i4_horz_samp_16x4_r3_1 = vld1_s16(pi2_refarray_buffer + 24);
                    i4_horz_samp_16x4_r3_2 = vld1_s16(pi2_refarray_buffer + 28);

                    i4_horz_res_16x4_r0_1 = vrshr_n_s16(i4_horz_samp_16x4_r0_1, 2);
                    i4_horz_res_16x4_r0_2 = vrshr_n_s16(i4_horz_samp_16x4_r0_2, 2);

                    i4_horz_add_32x4_r1_1 =
                        vaddl_s16(i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r1_1);
                    i4_horz_add_32x4_r1_2 =
                        vaddl_s16(i4_horz_samp_16x4_r0_2, i4_horz_samp_16x4_r1_2);

                    i4_horz_add_32x4_r2_1 =
                        vaddl_s16(i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r2_1);
                    i4_horz_add_32x4_r2_2 =
                        vaddl_s16(i4_horz_samp_16x4_r1_2, i4_horz_samp_16x4_r2_2);

                    i4_horz_add_32x4_r3_1 =
                        vaddl_s16(i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r3_1);
                    i4_horz_add_32x4_r3_2 =
                        vaddl_s16(i4_horz_samp_16x4_r2_2, i4_horz_samp_16x4_r3_2);

                    i4_horz_samp_32x4_r0_1 = vshll_n_s16(i4_horz_samp_16x4_r0_1, 1);
                    i4_horz_samp_32x4_r0_2 = vshll_n_s16(i4_horz_samp_16x4_r0_2, 1);

                    i4_horz_samp_32x4_r1_1 = vshll_n_s16(i4_horz_samp_16x4_r1_1, 1);
                    i4_horz_samp_32x4_r1_2 = vshll_n_s16(i4_horz_samp_16x4_r1_2, 1);

                    i4_horz_samp_32x4_r2_1 = vshll_n_s16(i4_horz_samp_16x4_r2_1, 1);
                    i4_horz_samp_32x4_r2_2 = vshll_n_s16(i4_horz_samp_16x4_r2_2, 1);

                    i4_horz_samp_32x4_r3_1 = vshll_n_s16(i4_horz_samp_16x4_r3_1, 1);
                    i4_horz_samp_32x4_r3_2 = vshll_n_s16(i4_horz_samp_16x4_r3_2, 1);

                    i4_horz_res_32x4_r1_1 =
                        vaddq_s32(i4_horz_samp_32x4_r0_1, i4_horz_add_32x4_r1_1);
                    i4_horz_res_32x4_r1_2 =
                        vaddq_s32(i4_horz_samp_32x4_r0_2, i4_horz_add_32x4_r1_2);

                    i4_horz_res_32x4_r2_1 =
                        vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r1_1);
                    i4_horz_res_32x4_r2_2 =
                        vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r1_2);

                    i4_horz_res_32x4_r3_1 =
                        vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r2_1);
                    i4_horz_res_32x4_r3_2 =
                        vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r2_2);

                    i4_horz_res_32x4_r4_1 =
                        vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r2_1);
                    i4_horz_res_32x4_r4_2 =
                        vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r2_2);

                    i4_horz_res_32x4_r5_1 =
                        vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r3_1);
                    i4_horz_res_32x4_r5_2 =
                        vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r3_2);

                    i4_horz_res_32x4_r6_1 =
                        vaddq_s32(i4_horz_samp_32x4_r3_1, i4_horz_add_32x4_r3_1);
                    i4_horz_res_32x4_r6_2 =
                        vaddq_s32(i4_horz_samp_32x4_r3_2, i4_horz_add_32x4_r3_2);

                    i4_horz_res_16x4_r1_1 = vqrshrn_n_s32(i4_horz_res_32x4_r1_1, 4);
                    i4_horz_res_16x4_r1_2 = vqrshrn_n_s32(i4_horz_res_32x4_r1_2, 4);

                    i4_horz_res_16x4_r2_1 = vqrshrn_n_s32(i4_horz_res_32x4_r2_1, 4);
                    i4_horz_res_16x4_r2_2 = vqrshrn_n_s32(i4_horz_res_32x4_r2_2, 4);

                    i4_horz_res_16x4_r3_1 = vqrshrn_n_s32(i4_horz_res_32x4_r3_1, 4);
                    i4_horz_res_16x4_r3_2 = vqrshrn_n_s32(i4_horz_res_32x4_r3_2, 4);

                    i4_horz_res_16x4_r4_1 = vqrshrn_n_s32(i4_horz_res_32x4_r4_1, 4);
                    i4_horz_res_16x4_r4_2 = vqrshrn_n_s32(i4_horz_res_32x4_r4_2, 4);

                    i4_horz_res_16x4_r5_1 = vqrshrn_n_s32(i4_horz_res_32x4_r5_1, 4);
                    i4_horz_res_16x4_r5_2 = vqrshrn_n_s32(i4_horz_res_32x4_r5_2, 4);

                    i4_horz_res_16x4_r6_1 = vqrshrn_n_s32(i4_horz_res_32x4_r6_1, 4);
                    i4_horz_res_16x4_r6_2 = vqrshrn_n_s32(i4_horz_res_32x4_r6_2, 4);

                    i4_horz_res_16x4_r7_1 = vrshr_n_s16(i4_horz_samp_16x4_r3_1, 2);
                    i4_horz_res_16x4_r7_2 = vrshr_n_s16(i4_horz_samp_16x4_r3_2, 2);

                    vst1_s16(pi2_out_res, i4_horz_res_16x4_r0_1);
                    vst1_s16(pi2_out_res + 4, i4_horz_res_16x4_r0_2);

                    vst1_s16(pi2_out_res + i4_out_res_stride, i4_horz_res_16x4_r1_1);
                    vst1_s16(pi2_out_res + i4_out_res_stride + 4, i4_horz_res_16x4_r1_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride << 1), i4_horz_res_16x4_r2_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride << 1) + 4, i4_horz_res_16x4_r2_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 3), i4_horz_res_16x4_r3_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 3) + 4, i4_horz_res_16x4_r3_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride << 2), i4_horz_res_16x4_r4_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride << 2) + 4, i4_horz_res_16x4_r4_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 5), i4_horz_res_16x4_r5_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 5) + 4, i4_horz_res_16x4_r5_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 6), i4_horz_res_16x4_r6_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 6) + 4, i4_horz_res_16x4_r6_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 7), i4_horz_res_16x4_r7_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 7) + 4, i4_horz_res_16x4_r7_2);

                    pi2_out_res += BLOCK_WIDTH;
                }
            }
            else
            {
                pi2_out_res += BLOCK_WIDTH;
            }

            /* Block level loop updates */
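            /* the base layer nnz map appears to pack four bits per 4x4     */
            /* row, so after the two top blocks the >>= 2 here (plus the    */
            /* per-block >>= 1) steps over the unused pair of bits to the   */
            /* bottom row's bits                                            */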
            if(1 == i4_blk_ctr)
            {
                pi2_inp_data -= SUB_BLOCK_WIDTH;
                pi2_inp_data += (i4_inp_data_stride * SUB_BLOCK_HEIGHT);
                pi2_out_res -= MB_WIDTH;
                pi2_out_res += (i4_out_res_stride * BLOCK_HEIGHT);
                i4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += SUB_BLOCK_WIDTH;
            }

            i4_ref_nnz >>= 1;
        } /* end of loop over all the blocks */
    }

    return;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_residual_neonintr                      */
/*                                                                           */
/*  Description   : this function does the upsampling of residuals.          */
/*                                                                           */
/*  Inputs        : pv_residual_samp_ctxt : Residual upsampling context      */
/*                  pi2_out : output 16 bit residual buffer pointer          */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_refarray_wd : reference array width                   */
/*                  u2_mb_x, u2_mb_y : current mb co-ordinates               */
/*                  i4_chroma_flag : luma(0) / chroma(1) flag                */
/*  Globals       : none                                                     */
/*  Processing    : it does the upsampling with generic phase values         */
/*                  derived from the layer resampling ratio                  */
/*  Outputs       : Upsampled residuals.                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         26 05 2021   Dolan          creation                              */
/*                                                                           */
/*****************************************************************************/

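/* Reference scalar view of the general-ratio bilinear resampling below     */
/* (an illustrative sketch of the arithmetic, not decoder code). For an     */
/* output sample with horizontal phase px and vertical phase py (0..15)     */
/* and the four reference taps s00, s01, s10, s11:                          */
/*      h0  = (16 - px) * s00 + px * s01;                                   */
/*      h1  = (16 - px) * s10 + px * s11;                                   */
/*      out = ((16 - py) * h0 + py * h1 + 128) >> 8;                        */
/* the x/y ptr-incr bitmaps additionally steer the tap selection so that    */
/* interpolation does not cross transform-block boundaries.                 */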
void isvcd_interpolate_residual_neonintr(void *pv_residual_samp_ctxt, WORD16 *pi2_out,
                                         WORD32 i4_out_stride, WORD32 i4_refarray_wd,
                                         UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_chroma_flag)
{
    residual_sampling_ctxt_t *ps_ctxt;
    residual_samp_map_ctxt_t *ps_map_ctxt;
    res_lyr_ctxt *ps_lyr_ctxt;
    ref_pixel_map_t *ps_x_pos_phase;
    ref_pixel_map_t *ps_y_pos_phase;

    WORD32 i4_x, i4_y;
    WORD32 i4_frm_mb_x, i4_frm_mb_y;
    WORD32 i4_temp_array_ht;
    WORD32 i4_mb_wd;
    WORD32 i4_mb_ht;
    WORD16 *pi2_ref_array;
    UWORD8 *pu1_ref_x_ptr_incr, *pu1_ref_y_ptr_incr;

    UWORD8 arr_y_ref_pos[16] = {0};
    UWORD8 arr_x_ref_pos[16] = {0};
    UWORD8 arr_x_ref_pos_low[16] = {0};
    UWORD8 arr_x_phase[16] = {0};
    UWORD8 arr_y_phase[16] = {0};
    UWORD8 *pi1_y_ref_pos;
    UWORD8 *pi1_x_ref_pos;
    UWORD8 *pi1_x_ref_pos_low;
    UWORD8 *pi1_y_phase;
    UWORD8 *pi1_x_phase;

    ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
    ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
    pi2_ref_array = ps_ctxt->pi2_refarray_buffer;
    pu1_ref_x_ptr_incr = ps_ctxt->pu1_ref_x_ptr_incr;
    pu1_ref_y_ptr_incr = ps_ctxt->pu1_ref_y_ptr_incr;

    if(1 == i4_chroma_flag)
        ps_map_ctxt = &ps_lyr_ctxt->s_chroma_map_ctxt;
    else
        ps_map_ctxt = &ps_lyr_ctxt->s_luma_map_ctxt;

    i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
    i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;

    ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
    ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;

    i4_temp_array_ht = i4_mb_ht;
    i4_frm_mb_y = u2_mb_y * i4_mb_ht;
    i4_frm_mb_x = u2_mb_x * i4_mb_wd;

    /* --------------------------------------------------------------------- */
    /* Loop for interpolation                                                */
    /* --------------------------------------------------------------------- */
    if(i4_chroma_flag == 0)
    {
        int16x8_t ref_arr_16x8_r0_0, ref_arr_16x8_r0_1;
        int16x8_t ref_arr_16x8_r1_0, ref_arr_16x8_r1_1;
        uint8x16_t x_ref_pos_mask_r0_0, x_ref_rnd_mask_r0_0;
        uint8x16_t u1_incr_8x16_r0_0, x_ref_pos_mask_temp_r0_0, u1_incr_not_8x16_r0_0,
            u1_y_incr_8x16_r0_0, phs_mask_8x16_0;
        uint8x16_t u1_incr_8x16_r1_0, x_ref_pos_mask_temp_r1_0, u1_incr_not_8x16_r1_0;
        int16x8_t ref_arr_temp0_16x8_r0_0, res_16x8_r0_0, vert_res_16x8_r0_0;
        int16x8_t ref_arr_temp0_16x8_r1_0, res_16x8_r1_0, vert_res_16x8_r1_0;
        int16x8_t ref_arr_temp1_16x8_r0_0;
        int16x8_t ref_arr_temp1_16x8_r1_0;

        uint8x16_t x_ref_pos_mask_temp_r0_1, u1_incr_not_8x16_r0_1;
        uint8x16_t x_ref_pos_mask_temp_r1_1, u1_incr_not_8x16_r1_1;
        int16x8_t ref_arr_temp0_16x8_r0_1, res_16x8_r0_1, vert_res_16x8_r0_1;
        int16x8_t ref_arr_temp0_16x8_r1_1, res_16x8_r1_1, vert_res_16x8_r1_1;
        int16x8_t ref_arr_temp1_16x8_r0_1;
        int16x8_t ref_arr_temp1_16x8_r1_1;

        uint16x8_t u1_y_incr_16x8_r0_0, u1_y_incr_16x8_r0_1;

        uint8x16_t u1_incr_not_8x16_r0_0_even, u1_incr_not_8x16_r1_0_even,
            x_ref_pos_mask_temp_r0_0_even, x_ref_pos_mask_temp_r1_0_even;
        uint8x16_t u1_incr_not_8x16_r0_0_odd, u1_incr_not_8x16_r1_0_odd,
            x_ref_pos_mask_temp_r0_0_odd, x_ref_pos_mask_temp_r1_0_odd;
        uint8x16x2_t u1_incr_not_8x16_2, x_ref_pos_mask_temp;
        int16x8_t prev_res_16x8_r0_0;
        int16x8_t prev_res_16x8_r1_0;
        int16x8_t prev_res_16x8_r0_1;
        int16x8_t prev_res_16x8_r1_1;
        uint8x8x2_t u1_incr_8x8x2_t;
        uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1;
        uint16x8_t u1_prev_y_incr_16x8_r0_0;
        uint16x8_t u1_prev_y_incr_16x8_r0_1;

        WORD32 zero_r0_r1 = 0;

        int32x4_t res_32x4_l_0, res_32x4_h_0;
        int32x4_t res_32x4_l_1, res_32x4_h_1;
        int16x8_t res_16x8_l, res_16x8_h;
        uint16x8_t phs_mask_16x8_0, phs_mask_16x8_1;
        int16x8_t const_16_16x8, phs_mask_16min_16x8_0;
        int16x8_t dup_val_1, dup_val_2, dup_val_3, dup_val_4, dup_val_5, dup_abs;
        uint8x16_t phs_mask_div8_8x16_0, mid_indx_8x16;
        int16x8_t phs_mask_16min_16x8_1;
        uint16x8_t ones = vdupq_n_u16(0xFFFF);
        uint8x16_t const_ones = vdupq_n_u8(1);
        uint8x8x2_t u1_temp_8x8x2_t;
        uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;

        WORD16 *pi2_ref_array_temp;
        UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
        WORD32 i4_y_phase;
        WORD32 out_stride_temp;
        WORD32 strt_indx_h = 0;

        for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
        {
            arr_y_phase[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
            arr_y_ref_pos[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
        }
        pi1_y_ref_pos = arr_y_ref_pos;
        pi1_y_phase = arr_y_phase;

        strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos);
        for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
        {
            arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
            arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
        }

        pi1_x_ref_pos = arr_x_ref_pos;
        pi1_x_phase = arr_x_phase;

        phs_mask_8x16_0 = vld1q_u8((pi1_x_phase));
        phs_mask_16x8_0 = vmovl_u8(vget_low_u8(phs_mask_8x16_0));
        phs_mask_16x8_1 = vmovl_u8(vld1_u8((pi1_x_phase + 8)));
        x_ref_pos_mask_r0_0 = vld1q_u8((pi1_x_ref_pos));
        const_16_16x8 = vdupq_n_s16(16);
        phs_mask_div8_8x16_0 = vshrq_n_u8(phs_mask_8x16_0, 3);

        phs_mask_16min_16x8_0 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_0));
        phs_mask_16min_16x8_1 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_1));

        x_ref_rnd_mask_r0_0 = vaddq_u8(x_ref_pos_mask_r0_0, phs_mask_div8_8x16_0);
        mid_indx_8x16 = vdupq_n_u8((strt_indx_h << 1));

        for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
        {
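            /* if this output row maps to the same reference row pair as    */
            /* the previous one, reuse the previously computed horizontal   */
            /* results instead of recomputing them                          */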
            if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
            {
                if(!zero_r0_r1)
                {
                    res_16x8_l = vdupq_n_s16(0);
                    res_16x8_h = vdupq_n_s16(0);

                    out_stride_temp = (i4_y * i4_out_stride);
                    vst1q_s16((pi2_out + out_stride_temp), res_16x8_l);
                    vst1q_s16((pi2_out + out_stride_temp + 8), res_16x8_h);
                    continue;
                }

                res_16x8_r0_0 = prev_res_16x8_r0_0;
                res_16x8_r1_0 = prev_res_16x8_r1_0;
                res_16x8_r0_1 = prev_res_16x8_r0_1;
                res_16x8_r1_1 = prev_res_16x8_r1_1;

                u1_y_incr_16x8_r0_0 = u1_prev_y_incr_16x8_r0_0;
                u1_y_incr_16x8_r0_1 = u1_prev_y_incr_16x8_r0_1;
            }
            else
            {
                pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
                pu1_ref_x_ptr_incr_temp =
                    pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
                ref_arr_16x8_r0_0 = vld1q_s16((pi2_ref_array_temp));
                ref_arr_16x8_r1_0 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd));
                ref_arr_16x8_r0_1 = vld1q_s16((pi2_ref_array_temp + strt_indx_h));
                ref_arr_16x8_r1_1 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd + strt_indx_h));

                dup_val_1 = vabsq_s16(ref_arr_16x8_r0_0);
                dup_val_2 = vabsq_s16(ref_arr_16x8_r1_0);
                dup_val_3 = vabsq_s16(ref_arr_16x8_r0_1);
                dup_val_4 = vabsq_s16(ref_arr_16x8_r1_1);
                dup_val_5 = vqaddq_s16(dup_val_1, dup_val_2);
                dup_val_1 = vqaddq_s16(dup_val_3, dup_val_4);
                dup_abs = vqaddq_s16(dup_val_1, dup_val_5);
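                /* zero_r0_r1 is non-zero iff any residual in the two       */
                /* source rows is non-zero; all-zero rows short-circuit     */
                /* to a zero output row                                     */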
798                 zero_r0_r1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] ||
799                              dup_abs[5] || dup_abs[6] || dup_abs[7];
800                 if(zero_r0_r1)
801                 {
802                     u1_incr_8x16_r0_0 = vld1q_u8((pu1_ref_x_ptr_incr_temp));
803                     u1_incr_8x16_r1_0 = vld1q_u8((pu1_ref_x_ptr_incr_temp + i4_refarray_wd));
804                     u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_incr_8x16_r0_0);
805                     u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_incr_8x16_r0_0);
806                     u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
807                     u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
808                     u1_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
809 
810                     u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_incr_8x16_r1_0);
811                     u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_incr_8x16_r1_0);
812                     u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
813                     u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
814                     u1_incr_8x16_r1_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
815                     u1_incr_not_8x16_r0_0 = vbicq_u8(phs_mask_div8_8x16_0, u1_incr_8x16_r0_0);
816                     u1_incr_not_8x16_r1_0 = vbicq_u8(phs_mask_div8_8x16_0, u1_incr_8x16_r1_0);
817 
818                     u1_incr_not_8x16_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, x_ref_pos_mask_r0_0);
819                     u1_incr_not_8x16_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, x_ref_pos_mask_r0_0);
820 
821                     x_ref_pos_mask_temp_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, u1_incr_8x16_r0_0);
822                     x_ref_pos_mask_temp_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, u1_incr_8x16_r1_0);
823 
824                     u1_incr_not_8x16_r0_0_even = vshlq_n_u8(u1_incr_not_8x16_r0_0, 1);
825                     u1_incr_not_8x16_r1_0_even = vshlq_n_u8(u1_incr_not_8x16_r1_0, 1);
826                     x_ref_pos_mask_temp_r0_0_even = vshlq_n_u8(x_ref_pos_mask_temp_r0_0, 1);
827                     x_ref_pos_mask_temp_r1_0_even = vshlq_n_u8(x_ref_pos_mask_temp_r1_0, 1);
828 
829                     u1_incr_not_8x16_r0_0_odd = vaddq_u8(u1_incr_not_8x16_r0_0_even, const_ones);
830                     u1_incr_not_8x16_r1_0_odd = vaddq_u8(u1_incr_not_8x16_r1_0_even, const_ones);
831                     x_ref_pos_mask_temp_r0_0_odd =
832                         vaddq_u8(x_ref_pos_mask_temp_r0_0_even, const_ones);
833                     x_ref_pos_mask_temp_r1_0_odd =
834                         vaddq_u8(x_ref_pos_mask_temp_r1_0_even, const_ones);
835 
836                     u1_incr_not_8x16_2 =
837                         vzipq_u8(u1_incr_not_8x16_r0_0_even, u1_incr_not_8x16_r0_0_odd);
838 
839                     u1_incr_not_8x16_r0_0 = u1_incr_not_8x16_2.val[0];
840                     u1_incr_not_8x16_r0_1 = u1_incr_not_8x16_2.val[1];
841 
842                     u1_incr_not_8x16_2 =
843                         vzipq_u8(u1_incr_not_8x16_r1_0_even, u1_incr_not_8x16_r1_0_odd);
844                     u1_incr_not_8x16_r1_0 = u1_incr_not_8x16_2.val[0];
845                     u1_incr_not_8x16_r1_1 = u1_incr_not_8x16_2.val[1];
846 
847                     x_ref_pos_mask_temp =
848                         vzipq_u8(x_ref_pos_mask_temp_r0_0_even, x_ref_pos_mask_temp_r0_0_odd);
849                     x_ref_pos_mask_temp_r0_0 = x_ref_pos_mask_temp.val[0];
850                     x_ref_pos_mask_temp_r0_1 = x_ref_pos_mask_temp.val[1];
851 
852                     x_ref_pos_mask_temp =
853                         vzipq_u8(x_ref_pos_mask_temp_r1_0_even, x_ref_pos_mask_temp_r1_0_odd);
854                     x_ref_pos_mask_temp_r1_0 = x_ref_pos_mask_temp.val[0];
855                     x_ref_pos_mask_temp_r1_1 = x_ref_pos_mask_temp.val[1];
856                     u1_incr_not_8x16_r0_1 = vsubq_u8(u1_incr_not_8x16_r0_1, mid_indx_8x16);
857                     u1_incr_not_8x16_r1_1 = vsubq_u8(u1_incr_not_8x16_r1_1, mid_indx_8x16);
858                     x_ref_pos_mask_temp_r0_1 = vsubq_u8(x_ref_pos_mask_temp_r0_1, mid_indx_8x16);
859                     x_ref_pos_mask_temp_r1_1 = vsubq_u8(x_ref_pos_mask_temp_r1_1, mid_indx_8x16);
860 
861                     u1_temp_8x8x2_t.val[0] =
862                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
863                     u1_temp_8x8x2_t.val[1] =
864                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
865                     u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_0));
866                     u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_0));
867                     ref_arr_temp0_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
868                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
869 
870                     u1_temp_8x8x2_t.val[0] =
871                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
872                     u1_temp_8x8x2_t.val[1] =
873                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
874                     u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_0));
875                     u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_0));
876                     ref_arr_temp0_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
877                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
878 
879                     u1_temp_8x8x2_t.val[0] =
880                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
881                     u1_temp_8x8x2_t.val[1] =
882                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
883                     u1_temp_8x8_t0 =
884                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_0));
885                     u1_temp_8x8_t1 =
886                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_0));
887                     ref_arr_temp1_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
888                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
889 
890                     u1_temp_8x8x2_t.val[0] =
891                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
892                     u1_temp_8x8x2_t.val[1] =
893                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
894                     u1_temp_8x8_t0 =
895                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_0));
896                     u1_temp_8x8_t1 =
897                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_0));
898                     ref_arr_temp1_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
899                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
900 
901                     u1_temp_8x8x2_t.val[0] =
902                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
903                     u1_temp_8x8x2_t.val[1] =
904                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
905                     u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_1));
906                     u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_1));
907                     ref_arr_temp0_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8(
908                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
909 
910                     u1_temp_8x8x2_t.val[0] =
911                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
912                     u1_temp_8x8x2_t.val[1] =
913                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
914                     u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_1));
915                     u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_1));
916                     ref_arr_temp0_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8(
917                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
918 
919                     u1_temp_8x8x2_t.val[0] =
920                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
921                     u1_temp_8x8x2_t.val[1] =
922                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
923                     u1_temp_8x8_t0 =
924                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_1));
925                     u1_temp_8x8_t1 =
926                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_1));
927                     ref_arr_temp1_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8(
928                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
929 
930                     u1_temp_8x8x2_t.val[0] =
931                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
932                     u1_temp_8x8x2_t.val[1] =
933                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
934                     u1_temp_8x8_t0 =
935                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_1));
936                     u1_temp_8x8_t1 =
937                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_1));
938                     ref_arr_temp1_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8(
939                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
940 
941                     res_16x8_r0_0 = vmulq_s16(ref_arr_temp0_16x8_r0_0, phs_mask_16min_16x8_0);
942                     res_16x8_r1_0 = vmulq_s16(ref_arr_temp0_16x8_r1_0, phs_mask_16min_16x8_0);
943                     res_16x8_r0_0 = vmlaq_s16(res_16x8_r0_0, ref_arr_temp1_16x8_r0_0,
944                                               vreinterpretq_s16_u16(phs_mask_16x8_0));
945                     res_16x8_r1_0 = vmlaq_s16(res_16x8_r1_0, ref_arr_temp1_16x8_r1_0,
946                                               vreinterpretq_s16_u16(phs_mask_16x8_0));
947                     res_16x8_r0_1 = vmulq_s16(ref_arr_temp0_16x8_r0_1, phs_mask_16min_16x8_1);
948                     res_16x8_r1_1 = vmulq_s16(ref_arr_temp0_16x8_r1_1, phs_mask_16min_16x8_1);
949                     res_16x8_r0_1 = vmlaq_s16(res_16x8_r0_1, ref_arr_temp1_16x8_r0_1,
950                                               vreinterpretq_s16_u16(phs_mask_16x8_1));
951                     res_16x8_r1_1 = vmlaq_s16(res_16x8_r1_1, ref_arr_temp1_16x8_r1_1,
952                                               vreinterpretq_s16_u16(phs_mask_16x8_1));
953 
954                     pu1_ref_y_ptr_incr_temp =
955                         pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
956                     u1_y_incr_8x16_r0_0 = vld1q_u8((pu1_ref_y_ptr_incr_temp));
957 
958                     u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_y_incr_8x16_r0_0);
959                     u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_y_incr_8x16_r0_0);
960                     u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_rnd_mask_r0_0));
961                     u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_rnd_mask_r0_0));
962                     u1_y_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
963                     u1_y_incr_16x8_r0_0 = vmovl_u8(vget_low_u8(u1_y_incr_8x16_r0_0));
964                     u1_y_incr_16x8_r0_1 = vmovl_u8(vget_high_u8(u1_y_incr_8x16_r0_0));
965                     u1_y_incr_16x8_r0_0 = vtstq_u16(u1_y_incr_16x8_r0_0, ones);
966                     u1_y_incr_16x8_r0_1 = vtstq_u16(u1_y_incr_16x8_r0_1, ones);
967 
968                     prev_res_16x8_r0_0 = res_16x8_r0_0;
969                     prev_res_16x8_r1_0 = res_16x8_r1_0;
970                     prev_res_16x8_r0_1 = res_16x8_r0_1;
971                     prev_res_16x8_r1_1 = res_16x8_r1_1;
972 
973                     u1_prev_y_incr_16x8_r0_0 = u1_y_incr_16x8_r0_0;
974                     u1_prev_y_incr_16x8_r0_1 = u1_y_incr_16x8_r0_1;
975                 }
976             }
977 
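            /* zero_r0_r1 == 0 means both reference rows were entirely zero,
               so a zero residual row is stored directly */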
978             if(!zero_r0_r1)
979             {
980                 res_16x8_l = vdupq_n_s16(0);
981                 res_16x8_h = vdupq_n_s16(0);
982             }
983             else
984             {
985                 i4_y_phase = pi1_y_phase[i4_y];
986 
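                /* y_phase >= 8: the lower row (r1) is nearer, so lanes whose
                   y-increment flag is clear replicate r1; the second select is
                   a no-op (both inputs are r1) kept for symmetry */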
987                 if((i4_y_phase) >> 3)
988                 {
989                     vert_res_16x8_r0_0 =
990                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r1_0);
991                     vert_res_16x8_r1_0 =
992                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r1_0);
993                     vert_res_16x8_r0_1 =
994                         vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r0_1, res_16x8_r1_1);
995                     vert_res_16x8_r1_1 =
996                         vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r1_1, res_16x8_r1_1);
997                 }
998                 else
999                 {
1000                     vert_res_16x8_r0_0 =
1001                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r0_0);
1002                     vert_res_16x8_r1_0 =
1003                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r0_0);
1004                     vert_res_16x8_r0_1 =
1005                         vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r0_1, res_16x8_r0_1);
1006                     vert_res_16x8_r1_1 =
1007                         vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r1_1, res_16x8_r0_1);
1008                 }
1009 
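                /* vertical stage plus rounding, i.e. the overall bilinear filter:
                   out = ((16 - ph_y) * t0 + ph_y * t1 + 128) >> 8,
                   where t0/t1 already carry the horizontal weights */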
1010                 res_32x4_l_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1011                 res_32x4_l_0 =
1012                     vmlal_n_s16(res_32x4_l_0, vget_low_s16(vert_res_16x8_r1_0), i4_y_phase);
1013 
1014                 res_32x4_l_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1015                 res_32x4_l_1 =
1016                     vmlal_n_s16(res_32x4_l_1, vget_high_s16(vert_res_16x8_r1_0), i4_y_phase);
1017                 res_32x4_h_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_1), 16 - i4_y_phase);
1018                 res_32x4_h_0 =
1019                     vmlal_n_s16(res_32x4_h_0, vget_low_s16(vert_res_16x8_r1_1), i4_y_phase);
1020                 res_32x4_h_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_1), 16 - i4_y_phase);
1021                 res_32x4_h_1 =
1022                     vmlal_n_s16(res_32x4_h_1, vget_high_s16(vert_res_16x8_r1_1), i4_y_phase);
1023 
1024                 res_32x4_l_0 = vrshrq_n_s32(res_32x4_l_0, 8);
1025                 res_32x4_l_1 = vrshrq_n_s32(res_32x4_l_1, 8);
1026                 res_32x4_h_0 = vrshrq_n_s32(res_32x4_h_0, 8);
1027                 res_32x4_h_1 = vrshrq_n_s32(res_32x4_h_1, 8);
1028 
1029                 res_16x8_l = vcombine_s16(vmovn_s32(res_32x4_l_0), vmovn_s32(res_32x4_l_1));
1030                 res_16x8_h = vcombine_s16(vmovn_s32(res_32x4_h_0), vmovn_s32(res_32x4_h_1));
1031             }
1032 
1033             out_stride_temp = (i4_y * i4_out_stride);
1034             vst1q_s16((pi2_out + out_stride_temp), res_16x8_l);
1035             vst1q_s16((pi2_out + out_stride_temp + 8), res_16x8_h);
1036         }
1037     }
1038     else
1039     {
1040         int16x8_t ref_arr_16x8_r0_0;
1041         int16x8_t ref_arr_16x8_r1_0;
1042         uint8x16_t x_ref_pos_mask_r0_0, x_ref_rnd_mask_r0_0;
1043         uint16x8_t u1_incr_16x8_r0_0, u1_incr_mask_16x8_r0_0, phs_mask_16x8_0,
1044             u1_incr_not_16x8_r0_0, u1_y_incr_16x8_r0_0;
1045         uint16x8_t u1_incr_16x8_r1_0, u1_incr_mask_16x8_r1_0, u1_incr_not_16x8_r1_0;
1046         uint8x16_t u1_incr_8x16_r0_0, x_ref_pos_mask_temp_r0_0, u1_incr_mask_8x16_r0_0,
1047             u1_incr_not_8x16_r0_0, u1_y_incr_8x16_r0_0;
1048         uint8x16_t u1_incr_8x16_r1_0, x_ref_pos_mask_temp_r1_0, u1_incr_mask_8x16_r1_0,
1049             u1_incr_not_8x16_r1_0;
1050         int16x8_t ref_arr_temp0_16x8_r0_0, res_16x8_r0_0, vert_res_16x8_r0_0;
1051         int16x8_t ref_arr_temp0_16x8_r1_0, res_16x8_r1_0, vert_res_16x8_r1_0;
1052         int16x8_t ref_arr_temp1_16x8_r0_0;
1053         int16x8_t ref_arr_temp1_16x8_r1_0;
1054 
1055         int32x4_t res_32x4_l_0;
1056         int32x4_t res_32x4_l_1;
1057         int16x8_t out_16x8_0, out_16x8_1;
1058         uint16x8_t phs_mask_div8_16x8_0, phs_mask_div8_msb_16x8_0;
1059         int16x8_t const_16_16x8, phs_mask_16min_16x8_0;
1060         uint8x8x2_t u1_temp_8x8x2_t;
1061         uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
1062 
1063         uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));
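        /* chroma_mask_16x8: all-ones in the low 16-bit half of each 32-bit
           lane, so vbslq_s16 below updates only every other output sample
           (one component of the interleaved chroma buffer) */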
1064         uint16x8_t ones = vdupq_n_u16(0xFFFF);
1065 
1066         WORD16 *pi2_ref_array_temp;
1067         UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
1068         WORD32 i4_y_phase;
1069         uint8x8x2_t u1_incr_8x8x2_t;
1070         uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1;
1071         int16x8_t prev_res_16x8_r0_0;
1072         int16x8_t prev_res_16x8_r1_0;
1073         int16x8_t dup_val_1, dup_val_2, dup_abs;
1074         uint16x8_t u1_prev_y_incr_16x8_r0_0;
1075 
1076         WORD32 out_stride_temp;
1077         WORD32 zero_r0_r1 = 0;
1078         WORD32 i4_x2 = 0;
1079         for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1080         {
1081             arr_y_phase[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
1082             arr_y_ref_pos[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
1083         }
1084         pi1_y_ref_pos = arr_y_ref_pos;
1085         pi1_y_phase = arr_y_phase;
1086 
1087         for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
1088         {
1089             arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
1090             arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
1091             i4_x2 = i4_x << 1;
1092             arr_x_ref_pos_low[i4_x2] = (arr_x_ref_pos[i4_x]) << 1;
1093             arr_x_ref_pos_low[i4_x2 + 1] = arr_x_ref_pos_low[i4_x2] + 1;
1094         }
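        /* each x reference position is expanded to a byte-index pair
           (2*pos, 2*pos + 1) so vtbl2_u8 can gather 16-bit samples byte-wise */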
1095 
1096         pi1_x_ref_pos_low = arr_x_ref_pos_low;
1097         pi1_x_phase = arr_x_phase;
1098 
1099         phs_mask_16x8_0 = vmovl_u8(vld1_u8((pi1_x_phase)));
1100         x_ref_pos_mask_r0_0 = vld1q_u8((pi1_x_ref_pos_low));
1101         const_16_16x8 = vdupq_n_s16(16);
1102         phs_mask_div8_16x8_0 = vshrq_n_u16(phs_mask_16x8_0, 3);
1103         phs_mask_div8_msb_16x8_0 = vsliq_n_u16(phs_mask_div8_16x8_0, phs_mask_div8_16x8_0, 8);
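        /* vsliq_n_u16(x, x, 8) copies the (phase >= 8) flag into both bytes of
           each lane; shifted left by 1 it becomes a per-byte offset of one
           16-bit element for the rounded x reference position */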
1104 
1105         phs_mask_16min_16x8_0 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_0));
1106 
1107         x_ref_rnd_mask_r0_0 = vaddq_u8(
1108             x_ref_pos_mask_r0_0, vreinterpretq_u8_u16(vshlq_n_u16(phs_mask_div8_msb_16x8_0, 1)));
1109         for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1110         {
1111             if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
1112             {
1113                 if(!zero_r0_r1)
1114                 {
1115                     res_32x4_l_0 = vdupq_n_s32(0);
1116                     res_32x4_l_1 = vdupq_n_s32(0);
1117 
1118                     out_stride_temp = (i4_y * i4_out_stride);
1119 
1120                     out_16x8_0 = vld1q_s16(pi2_out + out_stride_temp);
1121                     out_16x8_1 = vld1q_s16(pi2_out + out_stride_temp + 8);
1122                     out_16x8_0 = vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_0),
1123                                            out_16x8_0);
1124                     out_16x8_1 = vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_1),
1125                                            out_16x8_1);
1126                     vst1q_s16((pi2_out + out_stride_temp), out_16x8_0);
1127                     vst1q_s16((pi2_out + out_stride_temp + 8), out_16x8_1);
1128                     continue;
1129                 }
1130 
1131                 res_16x8_r0_0 = prev_res_16x8_r0_0;
1132                 res_16x8_r1_0 = prev_res_16x8_r1_0;
1133 
1134                 u1_y_incr_16x8_r0_0 = u1_prev_y_incr_16x8_r0_0;
1135             }
1136             else
1137             {
1138                 pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1139                 pu1_ref_x_ptr_incr_temp =
1140                     pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1141                 ref_arr_16x8_r0_0 = vld1q_s16((pi2_ref_array_temp));
1142                 ref_arr_16x8_r1_0 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd));
1143 
1144                 dup_val_1 = vabsq_s16(ref_arr_16x8_r0_0);
1145                 dup_val_2 = vabsq_s16(ref_arr_16x8_r1_0);
1146                 dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1147                 zero_r0_r1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] ||
1148                              dup_abs[5] || dup_abs[6] || dup_abs[7];
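                    /* despite its name, zero_r0_r1 is non-zero when any sample
                       of the two reference rows is non-zero */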
1149                 if(zero_r0_r1)
1150                 {
1151                     u1_incr_16x8_r0_0 = (vmovl_u8(vld1_u8((pu1_ref_x_ptr_incr_temp))));
1152                     u1_incr_16x8_r1_0 =
1153                         (vmovl_u8(vld1_u8((pu1_ref_x_ptr_incr_temp + i4_refarray_wd))));
1154                     u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_incr_16x8_r0_0));
1155                     u1_incr_8x8x2_t.val[1] = vget_high_u8(vreinterpretq_u8_u16(u1_incr_16x8_r0_0));
1156                     u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
1157                     u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
1158                     u1_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
1159 
1160                     u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_incr_16x8_r1_0));
1161                     u1_incr_8x8x2_t.val[1] = vget_high_u8(vreinterpretq_u8_u16(u1_incr_16x8_r1_0));
1162                     u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
1163                     u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
1164                     u1_incr_8x16_r1_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
1165 
1166                     u1_incr_16x8_r0_0 = vreinterpretq_u16_u8(u1_incr_8x16_r0_0);
1167                     u1_incr_16x8_r1_0 = vreinterpretq_u16_u8(u1_incr_8x16_r1_0);
1168                     u1_incr_mask_16x8_r0_0 = vsliq_n_u16(u1_incr_16x8_r0_0, u1_incr_16x8_r0_0, 8);
1169                     u1_incr_mask_16x8_r1_0 = vsliq_n_u16(u1_incr_16x8_r1_0, u1_incr_16x8_r1_0, 8);
1170 
1171                     u1_incr_not_16x8_r0_0 =
1172                         vbicq_u16(phs_mask_div8_msb_16x8_0, u1_incr_mask_16x8_r0_0);
1173                     u1_incr_not_16x8_r1_0 =
1174                         vbicq_u16(phs_mask_div8_msb_16x8_0, u1_incr_mask_16x8_r1_0);
1175 
1176                     u1_incr_mask_8x16_r0_0 =
1177                         vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_mask_16x8_r0_0, 1));
1178                     u1_incr_mask_8x16_r1_0 =
1179                         vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_mask_16x8_r1_0, 1));
1180                     u1_incr_not_8x16_r0_0 =
1181                         vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_not_16x8_r0_0, 1));
1182                     u1_incr_not_8x16_r1_0 =
1183                         vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_not_16x8_r1_0, 1));
1184 
1185                     u1_incr_not_8x16_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, x_ref_pos_mask_r0_0);
1186                     u1_incr_not_8x16_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, x_ref_pos_mask_r0_0);
1187 
1188                     x_ref_pos_mask_temp_r0_0 =
1189                         vaddq_u8(u1_incr_not_8x16_r0_0, u1_incr_mask_8x16_r0_0);
1190                     x_ref_pos_mask_temp_r1_0 =
1191                         vaddq_u8(u1_incr_not_8x16_r1_0, u1_incr_mask_8x16_r1_0);
1192 
1193                     u1_temp_8x8x2_t.val[0] =
1194                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1195                     u1_temp_8x8x2_t.val[1] =
1196                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1197                     u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_0));
1198                     u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_0));
1199                     ref_arr_temp0_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
1200                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1201 
1202                     u1_temp_8x8x2_t.val[0] =
1203                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1204                     u1_temp_8x8x2_t.val[1] =
1205                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1206                     u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_0));
1207                     u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_0));
1208                     ref_arr_temp0_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
1209                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1210 
1211                     u1_temp_8x8x2_t.val[0] =
1212                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1213                     u1_temp_8x8x2_t.val[1] =
1214                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1215                     u1_temp_8x8_t0 =
1216                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_0));
1217                     u1_temp_8x8_t1 =
1218                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_0));
1219                     ref_arr_temp1_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
1220                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1221 
1222                     u1_temp_8x8x2_t.val[0] =
1223                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1224                     u1_temp_8x8x2_t.val[1] =
1225                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1226                     u1_temp_8x8_t0 =
1227                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_0));
1228                     u1_temp_8x8_t1 =
1229                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_0));
1230                     ref_arr_temp1_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
1231                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1232                     res_16x8_r0_0 = vmulq_s16(ref_arr_temp0_16x8_r0_0, phs_mask_16min_16x8_0);
1233                     res_16x8_r1_0 = vmulq_s16(ref_arr_temp0_16x8_r1_0, phs_mask_16min_16x8_0);
1234                     res_16x8_r0_0 = vmlaq_s16(res_16x8_r0_0, ref_arr_temp1_16x8_r0_0,
1235                                               vreinterpretq_s16_u16(phs_mask_16x8_0));
1236                     res_16x8_r1_0 = vmlaq_s16(res_16x8_r1_0, ref_arr_temp1_16x8_r1_0,
1237                                               vreinterpretq_s16_u16(phs_mask_16x8_0));
1238 
1239                     pu1_ref_y_ptr_incr_temp =
1240                         pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
1241                     u1_y_incr_16x8_r0_0 = vmovl_u8(vld1_u8((pu1_ref_y_ptr_incr_temp)));
1242                     u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_y_incr_16x8_r0_0));
1243                     u1_incr_8x8x2_t.val[1] =
1244                         vget_high_u8(vreinterpretq_u8_u16(u1_y_incr_16x8_r0_0));
1245                     u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_rnd_mask_r0_0));
1246                     u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_rnd_mask_r0_0));
1247                     u1_y_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
1248 
1249                     u1_y_incr_16x8_r0_0 = vreinterpretq_u16_u8(u1_y_incr_8x16_r0_0);
1250 
1251                     u1_y_incr_16x8_r0_0 = vtstq_u16(u1_y_incr_16x8_r0_0, ones);
1252 
1253                     prev_res_16x8_r0_0 = res_16x8_r0_0;
1254                     prev_res_16x8_r1_0 = res_16x8_r1_0;
1255 
1256                     u1_prev_y_incr_16x8_r0_0 = u1_y_incr_16x8_r0_0;
1257                 }
1258             }
1259 
1260             if(!zero_r0_r1)
1261             {
1262                 res_32x4_l_0 = vdupq_n_s32(0);
1263                 res_32x4_l_1 = vdupq_n_s32(0);
1264             }
1265             else
1266             {
1267                 i4_y_phase = pi1_y_phase[i4_y];
1268 
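                /* y_phase >= 8 picks the row pair exactly as in the 16-lane
                   path above */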
1269                 if((i4_y_phase) >> 3)
1270                 {
1271                     vert_res_16x8_r0_0 =
1272                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r1_0);
1273                     vert_res_16x8_r1_0 =
1274                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r1_0);
1275                 }
1276                 else
1277                 {
1278                     vert_res_16x8_r0_0 =
1279                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r0_0);
1280                     vert_res_16x8_r1_0 =
1281                         vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r0_0);
1282                 }
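                /* same vertical stage as the 16-lane path:
                   out = ((16 - ph_y) * t0 + ph_y * t1 + 128) >> 8 */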
1283                 res_32x4_l_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1284                 res_32x4_l_0 =
1285                     vmlal_n_s16(res_32x4_l_0, vget_low_s16(vert_res_16x8_r1_0), i4_y_phase);
1286 
1287                 res_32x4_l_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1288                 res_32x4_l_1 =
1289                     vmlal_n_s16(res_32x4_l_1, vget_high_s16(vert_res_16x8_r1_0), i4_y_phase);
1290 
1291                 res_32x4_l_0 = vrshrq_n_s32(res_32x4_l_0, 8);
1292                 res_32x4_l_1 = vrshrq_n_s32(res_32x4_l_1, 8);
1293             }
1294             out_stride_temp = (i4_y * i4_out_stride);
1295 
1296             out_16x8_0 = vld1q_s16(pi2_out + out_stride_temp);
1297             out_16x8_1 = vld1q_s16(pi2_out + out_stride_temp + 8);
1298             out_16x8_0 =
1299                 vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_0), out_16x8_0);
1300             out_16x8_1 =
1301                 vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_1), out_16x8_1);
1302             vst1q_s16((pi2_out + out_stride_temp), out_16x8_0);
1303             vst1q_s16((pi2_out + out_stride_temp + 8), out_16x8_1);
1304         }
1305     }
1306     return;
1307 } /* End of Interpolation Function */
1308 
1309 /*****************************************************************************/
1310 /*                                                                           */
1311 /*  Function Name : isvcd_residual_reflayer_const_non_boundary_mb_neonintr    */
1312 /*                                                                           */
1313 /*  Description   : constructs the reference layer residual sample array   */
/*                  for a non-boundary MB; lanes whose quadrant MB type is   */
/*                  not 1 (inter) are zeroed                                 */
1314 /*                                                                           */
1315 /*  Inputs        : pi2_inp_data / i4_inp_data_stride : input residuals     */
/*                  pi2_ref_array : output reference array pointer           */
/*                  i4_refarray_wd, i4_refarray_ht : reference array size    */
/*                  i4_ref_mb_type_q0..q3 : MB types of the four quadrants   */
/*                  i4_mb_quard1_part_x/_y : quadrant split position         */
/*                  i4_chroma_flag : 0 for luma, 1 for interleaved chroma    */
1316 /*  Globals       : none                                                     */
1317 /*  Processing    : builds per-lane MB-type masks with vceqq_s16 and        */
/*                  stores vbslq_s16-masked residuals row by row             */
1318 /*                                                                           */
1319 /*  Outputs       : none                                                     */
1320 /*  Returns       : none                                                     */
1321 /*                                                                           */
1322 /*  Issues        : none                                                     */
1323 /*                                                                           */
1324 /*  Revision History:                                                        */
1325 /*                                                                           */
1326 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1327 /*         25 11 2021   Dolan           creation                             */
1328 /*                                                                           */
1329 /*****************************************************************************/
1330 
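/* A minimal scalar sketch (editor's illustration, not part of the original
 * source) of the quadrant masking that the NEON code below vectorises with
 * vceqq_s16/vbslq_s16. The helper name is hypothetical, and an MB type value
 * of 1 is assumed to mark inter MBs, matching the comparisons against
 * one_16x8 below. Kept under #if 0 so it does not affect the build. */
#if 0
static void residual_reflayer_const_scalar_sketch(
    WORD16 *pi2_inp, WORD32 i4_stride, WORD16 *pi2_ref, WORD32 i4_wd,
    WORD32 i4_ht, WORD32 i4_type_q0, WORD32 i4_type_q1, WORD32 i4_type_q2,
    WORD32 i4_type_q3, WORD32 i4_part_x, WORD32 i4_part_y)
{
    WORD32 i4_x, i4_y;
    for(i4_y = 0; i4_y < i4_ht; i4_y++)
    {
        for(i4_x = 0; i4_x < i4_wd; i4_x++)
        {
            /* pick the MB type of the quadrant containing (x, y) */
            WORD32 i4_type =
                (i4_y < i4_part_y) ? ((i4_x < i4_part_x) ? i4_type_q0 : i4_type_q1)
                                   : ((i4_x < i4_part_x) ? i4_type_q2 : i4_type_q3);
            /* residuals are carried over only from inter MBs (type == 1) */
            pi2_ref[i4_y * i4_wd + i4_x] =
                (i4_type == 1) ? pi2_inp[i4_y * i4_stride + i4_x] : 0;
        }
    }
}
#endif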
1331 void isvcd_residual_reflayer_const_non_boundary_mb_neonintr(
1332     WORD16 *pi2_inp_data, WORD32 i4_inp_data_stride, WORD16 *pi2_ref_array, WORD32 i4_refarray_wd,
1333     WORD32 i4_refarray_ht, WORD32 i4_ref_mb_type_q0, WORD32 i4_ref_mb_type_q1,
1334     WORD32 i4_ref_mb_type_q2, WORD32 i4_ref_mb_type_q3, WORD32 i4_mb_quard1_part_x,
1335     WORD32 i4_mb_quard1_part_y, WORD32 i4_chroma_flag)
1336 {
1337     WORD32 i4_y;
1338     WORD16 *pi2_ref_data_byte;
1339     WORD16 *pi2_ref_array_temp;
1340 
1341     if(i4_chroma_flag == 0)
1342     {
1343         WORD16 index_0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
1344         int16x8_t ref_mb_type_16x8_q0, ref_mb_type_16x8_q1, ref_mb_type_16x8_q2,
1345             ref_mb_type_16x8_q3, mb_quard1_part_x_16x8;
1346         int16x8_t ref_mb_type_16x8_0, ref_mb_type_16x8_1;
1347         int16x8_t ref_mb_type_16x8_low_0, ref_mb_type_16x8_low_1;
1348         uint16x8_t mb_type_mask_16x8_0, mb_type_mask_16x8_1;
1349         uint16x8_t mb_type_mask_16x8_low_0, mb_type_mask_16x8_low_1;
1350         uint16x8_t mask_16x8_0;
1351 
1352         int16x8_t index_arr_0;
1353         int16x8_t inp_data_16x8_0, inp_data_16x8_1;
1354         int16x8_t res_16x8_0, res_16x8_1;
1355         int16x8_t one_16x8 = vdupq_n_s16(1);
1356         int16x8_t zero_16x8 = vdupq_n_s16(0);
1357 
1358         index_arr_0 = vld1q_s16(&index_0[0]);
1359 
1360         ref_mb_type_16x8_q0 = vdupq_n_s16(i4_ref_mb_type_q0);
1361         ref_mb_type_16x8_q1 = vdupq_n_s16(i4_ref_mb_type_q1);
1362         ref_mb_type_16x8_q2 = vdupq_n_s16(i4_ref_mb_type_q2);
1363         ref_mb_type_16x8_q3 = vdupq_n_s16(i4_ref_mb_type_q3);
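        /* the reference array may straddle up to four source MBs (quadrants);
           (i4_mb_quard1_part_x, i4_mb_quard1_part_y) marks where quadrant 1
           starts, and each lane takes the MB type of its own quadrant */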
1364         if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
1365         {
1366             // Quadrant 0 only
1367             ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1368             ref_mb_type_16x8_1 = ref_mb_type_16x8_q0;
1369             mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1370             mb_type_mask_16x8_1 = mb_type_mask_16x8_0;
1371         }
1372         else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
1373                 (i4_mb_quard1_part_x < i4_refarray_wd))
1374         {
1375                 // Quadrants 0 & 1
1376             if(i4_mb_quard1_part_x == 8)
1377             {
1378                 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1379                 ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1380             }
1381             else if(i4_mb_quard1_part_x < 8)
1382             {
1383                 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
1384                 mask_16x8_0 =
1385                     vcltq_s16(index_arr_0, mb_quard1_part_x_16x8);  // all-ones where a < b, else 0
1386 
1387                 ref_mb_type_16x8_0 =
1388                     vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1389                 ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1390             }
1391             else
1392             {
1393                 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x - 8));
1394                 mask_16x8_0 =
1395                     vcleq_s16(index_arr_0, mb_quard1_part_x_16x8);  // all-ones where a <= b, else 0
1396 
1397                 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1398                 ref_mb_type_16x8_1 =
1399                     vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1400             }
1401 
1402             mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1403             mb_type_mask_16x8_1 = vceqq_s16(ref_mb_type_16x8_1, one_16x8);
1404         }
1405         else
1406         {
1407             if(i4_mb_quard1_part_x >= i4_refarray_wd)
1408             {
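                // Quadrants 0 & 2 (no right-hand quadrants inside the array)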
1409                 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1410                 ref_mb_type_16x8_1 = ref_mb_type_16x8_q0;
1411 
1412                 ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;
1413                 ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q2;

                /* initialise the masks on this path as well; the store loops
                   below read them */
                mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
                mb_type_mask_16x8_1 = vceqq_s16(ref_mb_type_16x8_1, one_16x8);
                mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8);
                mb_type_mask_16x8_low_1 = vceqq_s16(ref_mb_type_16x8_low_1, one_16x8);
1414             }
1415             else
1416             {
1417                 // Quadrants 0, 1, 2 & 3
1418                 if(i4_mb_quard1_part_x == 8)
1419                 {
1420                     ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1421                     ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1422 
1423                     ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;
1424                     ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q3;
1425                 }
1426                 else if(i4_mb_quard1_part_x < 8)
1427                 {
1428                     mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
1429                     mask_16x8_0 =
1430                         vcltq_s16(index_arr_0, mb_quard1_part_x_16x8);  // all-ones where a < b, else 0
1431 
1432                     ref_mb_type_16x8_0 =
1433                         vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1434                     ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1435 
1436                     ref_mb_type_16x8_low_0 =
1437                         vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3);
1438                     ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q3;
1439                 }
1440                 else
1441                 {
1442                     mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x - 8));
1443                     mask_16x8_0 =
1444                         vcltq_s16(index_arr_0, mb_quard1_part_x_16x8);  // all-ones where a < b, else 0
1445 
1446                     ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1447                     ref_mb_type_16x8_1 =
1448                         vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1449 
1450                     ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;
1451                     ref_mb_type_16x8_low_1 =
1452                         vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3);
1453                 }
1454                 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1455                 mb_type_mask_16x8_1 = vceqq_s16(ref_mb_type_16x8_1, one_16x8);
1456 
1457                 mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8);
1458                 mb_type_mask_16x8_low_1 = vceqq_s16(ref_mb_type_16x8_low_1, one_16x8);
1459             }
1460         }
1461 
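        /* rows above the vertical split use the quadrant 0/1 masks,
           rows below use the quadrant 2/3 masks */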
1462         if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
1463         {
1464             for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1465             {
1466                 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
1467                 inp_data_16x8_0 = vld1q_s16((pi2_ref_data_byte));
1468                 inp_data_16x8_1 = vld1q_s16((pi2_ref_data_byte + 8));
1469                 if(i4_y < i4_mb_quard1_part_y)
1470                 {
1471                     res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8_0, zero_16x8);
1472                     res_16x8_1 = vbslq_s16(mb_type_mask_16x8_1, inp_data_16x8_1, zero_16x8);
1473                 }
1474                 else
1475                 {
1476                     res_16x8_0 = vbslq_s16(mb_type_mask_16x8_low_0, inp_data_16x8_0, zero_16x8);
1477                     res_16x8_1 = vbslq_s16(mb_type_mask_16x8_low_1, inp_data_16x8_1, zero_16x8);
1478                 }
1479                 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1480                 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1481                 vst1q_s16((pi2_ref_array_temp + 8), res_16x8_1);
1482             }
1483         }
1484         else
1485         {
1486             for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1487             {
1488                 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
1489                 inp_data_16x8_0 = vld1q_s16((pi2_ref_data_byte));
1490                 inp_data_16x8_1 = vld1q_s16((pi2_ref_data_byte + 8));
1491 
1492                 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8_0, zero_16x8);
1493                 res_16x8_1 = vbslq_s16(mb_type_mask_16x8_1, inp_data_16x8_1, zero_16x8);
1494 
1495                 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1496                 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1497                 vst1q_s16((pi2_ref_array_temp + 8), res_16x8_1);
1498             }
1499         }
1500     }
1501     else
1502     {
1503         WORD16 index_0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
1504         int16x8_t ref_mb_type_16x8_q0, ref_mb_type_16x8_q1, ref_mb_type_16x8_q2,
1505             ref_mb_type_16x8_q3, mb_quard1_part_x_16x8;
1506         int16x8_t ref_mb_type_16x8_0;
1507         int16x8_t ref_mb_type_16x8_low_0;
1508         uint16x8_t mb_type_mask_16x8_0;
1509         uint16x8_t mb_type_mask_16x8_low_0;
1510         uint16x8_t mask_16x8_0;
1511 
1512         int16x8_t index_arr_0;
1513         int16x8x2_t inp_data_16x8x2;
1514         int16x8_t inp_data_16x8;
1515         int16x8_t res_16x8_0;
1516         int16x8_t one_16x8 = vdupq_n_s16(1);
1517         int16x8_t zero_16x8 = vdupq_n_s16(0);
1518         index_arr_0 = vld1q_s16(&index_0[0]);
1519 
1520         ref_mb_type_16x8_q0 = vdupq_n_s16(i4_ref_mb_type_q0);
1521         ref_mb_type_16x8_q1 = vdupq_n_s16(i4_ref_mb_type_q1);
1522         ref_mb_type_16x8_q2 = vdupq_n_s16(i4_ref_mb_type_q2);
1523         ref_mb_type_16x8_q3 = vdupq_n_s16(i4_ref_mb_type_q3);
1524         if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
1525         {
1526             // Quadrant 0 only
1527             ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1528             mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1529         }
1530         else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
1531                 (i4_mb_quard1_part_x < i4_refarray_wd))
1532         {
1533             // Quadrants 0 & 1
1534             mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
1535             mask_16x8_0 = vcltq_s16(index_arr_0,
1536                                     mb_quard1_part_x_16x8);  // all-ones where a < b, else 0
1537 
1538             ref_mb_type_16x8_0 = vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1539             mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1540         }
1541         else
1542         {
1543             if(i4_mb_quard1_part_x >= i4_refarray_wd)
1544             {
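                // Quadrants 0 & 2 (no right-hand quadrants inside the array)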
1545                 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1546                 ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;

                /* initialise the masks on this path as well; the store loops
                   below read them */
                mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
                mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8);
1547             }
1548             else
1549             {
1550                 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
1551                 mask_16x8_0 =
1552                     vcltq_s16(index_arr_0, mb_quard1_part_x_16x8);  // all-ones where a < b, else 0
1553 
1554                 ref_mb_type_16x8_0 =
1555                     vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1556                 ref_mb_type_16x8_low_0 =
1557                     vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3);
1558 
1559                 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1560                 mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8);
1561             }
1562         }
1563 
1564         if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
1565         {
1566             for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1567             {
1568                 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
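                /* vld2q_s16 de-interleaves the Cb/Cr pairs; val[0] holds the
                   component this call is constructing */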
1569                 inp_data_16x8x2 = vld2q_s16((pi2_ref_data_byte));
1570                 inp_data_16x8 = inp_data_16x8x2.val[0];
1571 
1572                 if(i4_y < i4_mb_quard1_part_y)
1573                 {
1574                     res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8, zero_16x8);
1575                 }
1576                 else
1577                 {
1578                     res_16x8_0 = vbslq_s16(mb_type_mask_16x8_low_0, inp_data_16x8, zero_16x8);
1579                 }
1580                 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1581                 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1582             }
1583         }
1584         else
1585         {
1586             for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1587             {
1588                 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
1589                 inp_data_16x8x2 = vld2q_s16((pi2_ref_data_byte));
1590                 inp_data_16x8 = inp_data_16x8x2.val[0];
1591 
1592                 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8, zero_16x8);
1593 
1594                 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1595                 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1596             }
1597         }
1598     }
1599 }
1600