1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * isvcd_residual_resamp_neonintr.c
24 *
25 * @brief
26 * Contains routines that resample for SVC resampling
27 *
28 * @author
29 * Kishore
30 *
31 * @par List of Functions:
32 * - isvcd_pred_residual_recon_4x4_neonintr()
33 * - isvcd_pred_residual_recon_8x8_neonintr()
34 * - isvcd_pred_residual_recon_16x16_neonintr()
35 * - isvcd_pred_residual_recon_chroma_4x4_neonintr()
36 * - isvcd_pred_residual_recon_chroma_8x8_neonintr()
37 * - isvcd_residual_luma_4x4_neonintr()
38 * - isvcd_residual_luma_8x8_neonintr()
39 * - isvcd_residual_luma_16x16_neonintr()
40 * - isvcd_residual_chroma_cb_cr_8x8_neonintr()
41 *
42 * @remarks
43 *
44 *******************************************************************************
45 */
46
47 /*!
48 **************************************************************************
49 * \file isvcd_residual_resamp_neonintr.c
50 *
51 * \brief
52 * Contains routines that resample for SVC resampling
53 *
54 * Detailed_description
55 *
56 * \date
57 *
58 *
59 * \author : kishore
60 **************************************************************************
61 */
62 #include <assert.h>
63 #include <string.h>
64 #include <arm_neon.h>
65
66 #include "ih264_typedefs.h"
67 #include "ih264_macros.h"
68 #include "ih264_platform_macros.h"
69 #include "isvcd_structs.h"
70 #include "ih264_debug.h"
71
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_dyadic_neonintr                      */
/*                                                                           */
/*  Description   : this function does the upsampling of luma residuals for  */
/*                  dyadic cases                                             */
/*                                                                           */
/*  Inputs        : pv_residual_samp_ctxt : residual upsampling context      */
/*                  pi2_inp_data : input 16 bit data pointer                 */
/*                  i4_inp_data_stride : input buffer stride                 */
/*                  pi2_out_res : output 16 bit buffer pointer               */
/*                  i4_out_res_stride : output buffer stride                 */
/*                  ps_ref_mb_mode : reference mb mode pointer of base layer */
/*                  u2_mb_x, u2_mb_y : mb co-ordinates                       */
/*                  i4_ref_nnz : reference layer nnz bitmap                  */
/*                  i4_ref_tx_size : reference layer transform size flag     */
/*  Globals       : none                                                     */
/*  Processing    : it does the upsampling with fixed phase values and       */
/*                  reference layer transform size                           */
/*  Outputs       : upsampled residuals for luma                             */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         26 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
102
void isvcd_residual_luma_dyadic_neonintr(void *pv_residual_samp_ctxt, WORD16 *pi2_inp_data,
                                         WORD32 i4_inp_data_stride, WORD16 *pi2_out_res,
                                         WORD32 i4_out_res_stride, mem_element_t *ps_ref_mb_mode,
                                         UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_ref_nnz,
                                         WORD32 i4_ref_tx_size)
{
    WORD16 *pi2_refarray_buffer;
    WORD32 i4_blk_ctr;
    residual_sampling_ctxt_t *ps_ctxt;
    UNUSED(ps_ref_mb_mode);
    UNUSED(u2_mb_x);
    UNUSED(u2_mb_y);

    ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
    pi2_refarray_buffer = ps_ctxt->pi2_refarray_buffer;

    /* based on transform size the counter and interpolation width and */
    /* height are initialised as follows                               */
    if((i4_ref_tx_size) && (0 != i4_ref_nnz))
    {
        /* T8x8 path: the whole 8x8 reference block is upsampled to 16x16 */
        WORD16 *pi2_ref_data_byte;
        WORD32 *pi4_ref_array;
        WORD32 i4_i, i4_j;
        /* ----------- Horizontal Interpolation ---------------- */
        int16x8_t i2_coeff_add_16x8_r0;
        int16x8_t i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1;
        int16x8_t i2_coeff_16x8_sl_r0_0, i2_coeff_16x8_sl_r0_1;
        int16x8_t result_16x8_r0_0, result_16x8_r0_1;
        int16x8_t final_result_16x8_r0_0, final_result_16x8_r0_1;

        int16x8_t i2_coeff_add_16x8_r1;
        int16x8_t i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1;
        int16x8_t i2_coeff_16x8_sl_r1_0, i2_coeff_16x8_sl_r1_1;
        int16x8_t result_16x8_r1_0, result_16x8_r1_1;
        int16x8_t final_result_16x8_r1_0, final_result_16x8_r1_1;
        int16x8x2_t result_16x8x2_t_0;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
        /* The 16x16 intermediate (pre-vertical-filter) samples are kept as
           WORD32 in the refarray buffer, hence the pointer re-type below. */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

        /* Two source rows are processed per iteration; each 8-wide row is
           expanded to 16 samples: interior samples are 3*a + b (zipped
           pairs of 3a+b / a+3b), edge samples are 4*src (src << 2). */
        for(i4_i = 0; i4_i < BLOCK_HEIGHT; i4_i += 2)
        {
            i2_coeff_16x8_r0_0 = vld1q_s16(pi2_ref_data_byte);
            i2_coeff_16x8_r0_1 = vld1q_s16((pi2_ref_data_byte + 1));

            i2_coeff_16x8_r1_0 = vld1q_s16(pi2_ref_data_byte + i4_inp_data_stride);
            i2_coeff_16x8_r1_1 = vld1q_s16((pi2_ref_data_byte + i4_inp_data_stride + 1));

            i2_coeff_add_16x8_r0 = vaddq_s16(i2_coeff_16x8_r0_0, i2_coeff_16x8_r0_1);
            i2_coeff_16x8_sl_r0_0 = vshlq_n_s16(i2_coeff_16x8_r0_0, 1);
            i2_coeff_16x8_sl_r0_1 = vshlq_n_s16(i2_coeff_16x8_r0_1, 1);

            i2_coeff_add_16x8_r1 = vaddq_s16(i2_coeff_16x8_r1_0, i2_coeff_16x8_r1_1);
            i2_coeff_16x8_sl_r1_0 = vshlq_n_s16(i2_coeff_16x8_r1_0, 1);
            i2_coeff_16x8_sl_r1_1 = vshlq_n_s16(i2_coeff_16x8_r1_1, 1);

            /* 2*a + (a + b) = 3a + b  and  2*b + (a + b) = a + 3b */
            result_16x8_r0_0 = vaddq_s16(i2_coeff_16x8_sl_r0_0, i2_coeff_add_16x8_r0);
            result_16x8_r0_1 = vaddq_s16(i2_coeff_16x8_sl_r0_1, i2_coeff_add_16x8_r0);

            result_16x8_r1_0 = vaddq_s16(i2_coeff_16x8_sl_r1_0, i2_coeff_add_16x8_r1);
            result_16x8_r1_1 = vaddq_s16(i2_coeff_16x8_sl_r1_1, i2_coeff_add_16x8_r1);

            /* interleave so output order is 3a+b, a+3b, 3b+c, b+3c, ... */
            result_16x8x2_t_0 = vzipq_s16(result_16x8_r0_0, result_16x8_r0_1);
            final_result_16x8_r0_0 = result_16x8x2_t_0.val[0];
            final_result_16x8_r0_1 = result_16x8x2_t_0.val[1];

            result_16x8x2_t_0 = vzipq_s16(result_16x8_r1_0, result_16x8_r1_1);
            final_result_16x8_r1_0 = result_16x8x2_t_0.val[0];
            final_result_16x8_r1_1 = result_16x8x2_t_0.val[1];

            /* interior 14 samples from the filter; the two edge samples are
               written separately as 4*src to keep the same scale */
            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8_r0_0)));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8_r0_0)));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8_r0_1)));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8_r0_1)));
            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            pi2_ref_data_byte += i4_inp_data_stride;

            vst1q_s32(pi4_ref_array + 1, vmovl_s16(vget_low_s16(final_result_16x8_r1_0)));
            vst1q_s32(pi4_ref_array + 5, vmovl_s16(vget_high_s16(final_result_16x8_r1_0)));
            vst1q_s32(pi4_ref_array + 9, vmovl_s16(vget_low_s16(final_result_16x8_r1_1)));
            vst1q_s32(pi4_ref_array + 13, vmovl_s16(vget_high_s16(final_result_16x8_r1_1)));

            pi4_ref_array[0] = pi2_ref_data_byte[0] << 2;
            pi4_ref_array[15] = pi2_ref_data_byte[7] << 2;
            pi4_ref_array += 16;
            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
        }
        /* ----------- Vertical Interpolation ---------------- */
        pi4_ref_array = (WORD32 *) pi2_refarray_buffer;

        {
            WORD32 *pi4_ref_array_temp;
            WORD16 *pi2_out;
            int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r1_3,
                i4_horz_samp_32x4_r1_4;
            int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2, i4_horz_samp_32x4_r2_3,
                i4_horz_samp_32x4_r2_4;

            int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2, i4_horz_res_32x4_r1_3,
                i4_horz_res_32x4_r1_4;
            int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2, i4_horz_res_32x4_r2_3,
                i4_horz_res_32x4_r2_4;
            int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2, i4_horz_res_32x4_r3_3,
                i4_horz_res_32x4_r3_4;
            int32x4_t horz_add_32x4_r2_1, horz_add_32x4_r2_2, horz_add_32x4_r2_3,
                horz_add_32x4_r2_4;

            int16x8_t comb_horz_16x8_1, comb_horz_16x8_2, comb_horz_16x8_3, comb_horz_16x8_4;
            pi4_ref_array_temp = pi4_ref_array;
            pi2_out = pi2_out_res;

            i4_horz_samp_32x4_r1_1 = vld1q_s32(pi4_ref_array_temp);
            i4_horz_samp_32x4_r1_2 = vld1q_s32(pi4_ref_array_temp + 4);
            i4_horz_samp_32x4_r1_3 = vld1q_s32(pi4_ref_array_temp + 8);
            i4_horz_samp_32x4_r1_4 = vld1q_s32(pi4_ref_array_temp + 12);

            /* populate the first inter sample */
            /* top edge row: horizontal result scaled down by 4 (rounded) */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            pi2_out += i4_out_res_stride;

            /* each iteration consumes one new intermediate row and emits two
               output rows: (3*r1 + r2) / 16 and (r1 + 3*r2) / 16, rounded */
            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
                pi4_ref_array_temp += MB_WIDTH;
                i4_horz_samp_32x4_r2_1 = vld1q_s32(pi4_ref_array_temp);
                i4_horz_samp_32x4_r2_2 = vld1q_s32(pi4_ref_array_temp + 4);
                i4_horz_samp_32x4_r2_3 = vld1q_s32(pi4_ref_array_temp + 8);
                i4_horz_samp_32x4_r2_4 = vld1q_s32(pi4_ref_array_temp + 12);

                horz_add_32x4_r2_1 = vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r2_1);
                horz_add_32x4_r2_2 = vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_samp_32x4_r2_2);
                horz_add_32x4_r2_3 = vaddq_s32(i4_horz_samp_32x4_r1_3, i4_horz_samp_32x4_r2_3);
                horz_add_32x4_r2_4 = vaddq_s32(i4_horz_samp_32x4_r1_4, i4_horz_samp_32x4_r2_4);

                i4_horz_res_32x4_r2_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r2_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r2_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r2_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r1_4, 1), horz_add_32x4_r2_4);

                i4_horz_res_32x4_r3_1 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_1, 1), horz_add_32x4_r2_1);
                i4_horz_res_32x4_r3_2 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_2, 1), horz_add_32x4_r2_2);
                i4_horz_res_32x4_r3_3 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_3, 1), horz_add_32x4_r2_3);
                i4_horz_res_32x4_r3_4 =
                    vaddq_s32(vshlq_n_s32(i4_horz_samp_32x4_r2_4, 1), horz_add_32x4_r2_4);

                /* net scale of horizontal (x4) and vertical (x4) filters is
                   removed here with a rounded shift by 4 */
                i4_horz_res_32x4_r2_1 = vrshrq_n_s32(i4_horz_res_32x4_r2_1, 4);
                i4_horz_res_32x4_r2_2 = vrshrq_n_s32(i4_horz_res_32x4_r2_2, 4);
                i4_horz_res_32x4_r2_3 = vrshrq_n_s32(i4_horz_res_32x4_r2_3, 4);
                i4_horz_res_32x4_r2_4 = vrshrq_n_s32(i4_horz_res_32x4_r2_4, 4);

                i4_horz_res_32x4_r3_1 = vrshrq_n_s32(i4_horz_res_32x4_r3_1, 4);
                i4_horz_res_32x4_r3_2 = vrshrq_n_s32(i4_horz_res_32x4_r3_2, 4);
                i4_horz_res_32x4_r3_3 = vrshrq_n_s32(i4_horz_res_32x4_r3_3, 4);
                i4_horz_res_32x4_r3_4 = vrshrq_n_s32(i4_horz_res_32x4_r3_4, 4);

                comb_horz_16x8_1 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_1),
                                                vmovn_s32(i4_horz_res_32x4_r2_2));
                comb_horz_16x8_2 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r2_3),
                                                vmovn_s32(i4_horz_res_32x4_r2_4));

                comb_horz_16x8_3 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_1),
                                                vmovn_s32(i4_horz_res_32x4_r3_2));
                comb_horz_16x8_4 = vcombine_s16(vmovn_s32(i4_horz_res_32x4_r3_3),
                                                vmovn_s32(i4_horz_res_32x4_r3_4));

                /* populate 2 samples based on current coeffs */
                vst1q_s16(pi2_out, comb_horz_16x8_1);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_2);
                pi2_out += i4_out_res_stride;

                vst1q_s16(pi2_out, comb_horz_16x8_3);
                vst1q_s16(pi2_out + 8, comb_horz_16x8_4);
                pi2_out += i4_out_res_stride;

                /* store the coeff 2 to coeff 1 */
                /* (used in next iteration)     */
                i4_horz_samp_32x4_r1_1 = i4_horz_samp_32x4_r2_1;
                i4_horz_samp_32x4_r1_2 = i4_horz_samp_32x4_r2_2;
                i4_horz_samp_32x4_r1_3 = i4_horz_samp_32x4_r2_3;
                i4_horz_samp_32x4_r1_4 = i4_horz_samp_32x4_r2_4;
            }

            /* populate the first inter sample */
            /* bottom edge row: last intermediate row scaled down by 4 */
            i4_horz_res_32x4_r1_1 = vrshrq_n_s32(i4_horz_samp_32x4_r1_1, 2);
            i4_horz_res_32x4_r1_2 = vrshrq_n_s32(i4_horz_samp_32x4_r1_2, 2);
            i4_horz_res_32x4_r1_3 = vrshrq_n_s32(i4_horz_samp_32x4_r1_3, 2);
            i4_horz_res_32x4_r1_4 = vrshrq_n_s32(i4_horz_samp_32x4_r1_4, 2);

            comb_horz_16x8_1 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_1), vmovn_s32(i4_horz_res_32x4_r1_2));
            comb_horz_16x8_2 =
                vcombine_s16(vmovn_s32(i4_horz_res_32x4_r1_3), vmovn_s32(i4_horz_res_32x4_r1_4));
            vst1q_s16(pi2_out, comb_horz_16x8_1);
            vst1q_s16(pi2_out + 8, comb_horz_16x8_2);

            /* horizontal loop updates */
            pi4_ref_array++;
            pi2_out_res++;
        }
    }
    else
    {
        /* T4x4 path: four 4x4 sub-blocks, each upsampled to 8x8 when its
           nnz bit is set; the nnz bitmap is consumed one bit per block */
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                        */
        /* ----------------------------------------------------------------- */
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
            /* if reference layer is not coded then no processing */
            if(0 != (i4_ref_nnz & 0x1))
            {
                int16x8_t i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_1;
                int16x8_t i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_1;
                int16x8_t i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_1;
                int16x8_t i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_1;
                int16x8_t i2_add_16x8_r0_0;
                int16x8_t i2_add_16x8_r1_0;
                int16x8_t i2_add_16x8_r2_0;
                int16x8_t i2_add_16x8_r3_0;
                int16x8_t i2_res_16x8_r0_0, i2_res_16x8_r0_1;
                int16x8_t i2_res_16x8_r1_0, i2_res_16x8_r1_1;
                int16x8_t i2_res_16x8_r2_0, i2_res_16x8_r2_1;
                int16x8_t i2_res_16x8_r3_0, i2_res_16x8_r3_1;

                i2_coeff1_16x8_r0_0 = vld1q_s16(pi2_inp_data);
                i2_coeff1_16x8_r1_0 = vld1q_s16(pi2_inp_data + i4_inp_data_stride);
                i2_coeff1_16x8_r2_0 = vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1));
                i2_coeff1_16x8_r3_0 =
                    vld1q_s16(pi2_inp_data + (i4_inp_data_stride << 1) + i4_inp_data_stride);

                /* shifted-by-one copies give the (a, b) neighbour pairs */
                i2_coeff1_16x8_r0_1 = vextq_s16(i2_coeff1_16x8_r0_0, i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_1 = vextq_s16(i2_coeff1_16x8_r1_0, i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_1 = vextq_s16(i2_coeff1_16x8_r2_0, i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_1 = vextq_s16(i2_coeff1_16x8_r3_0, i2_coeff1_16x8_r3_0, 1);

                i2_add_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_coeff1_16x8_r0_0);
                i2_add_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_coeff1_16x8_r1_0);
                i2_add_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_coeff1_16x8_r2_0);
                i2_add_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_coeff1_16x8_r3_0);

                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                i2_coeff1_16x8_r0_1 = vshlq_n_s16(i2_coeff1_16x8_r0_1, 1);
                i2_coeff1_16x8_r1_1 = vshlq_n_s16(i2_coeff1_16x8_r1_1, 1);
                i2_coeff1_16x8_r2_1 = vshlq_n_s16(i2_coeff1_16x8_r2_1, 1);
                i2_coeff1_16x8_r3_1 = vshlq_n_s16(i2_coeff1_16x8_r3_1, 1);

                /* 3a + b and a + 3b per neighbour pair, as in the T8x8 path */
                i2_res_16x8_r0_0 = vaddq_s16(i2_coeff1_16x8_r0_0, i2_add_16x8_r0_0);
                i2_res_16x8_r1_0 = vaddq_s16(i2_coeff1_16x8_r1_0, i2_add_16x8_r1_0);
                i2_res_16x8_r2_0 = vaddq_s16(i2_coeff1_16x8_r2_0, i2_add_16x8_r2_0);
                i2_res_16x8_r3_0 = vaddq_s16(i2_coeff1_16x8_r3_0, i2_add_16x8_r3_0);

                i2_res_16x8_r0_1 = vaddq_s16(i2_coeff1_16x8_r0_1, i2_add_16x8_r0_0);
                i2_res_16x8_r1_1 = vaddq_s16(i2_coeff1_16x8_r1_1, i2_add_16x8_r1_0);
                i2_res_16x8_r2_1 = vaddq_s16(i2_coeff1_16x8_r2_1, i2_add_16x8_r2_0);
                i2_res_16x8_r3_1 = vaddq_s16(i2_coeff1_16x8_r3_1, i2_add_16x8_r3_0);

                i2_res_16x8_r0_0 = vzipq_s16(i2_res_16x8_r0_0, i2_res_16x8_r0_1).val[0];
                i2_res_16x8_r1_0 = vzipq_s16(i2_res_16x8_r1_0, i2_res_16x8_r1_1).val[0];
                i2_res_16x8_r2_0 = vzipq_s16(i2_res_16x8_r2_0, i2_res_16x8_r2_1).val[0];
                i2_res_16x8_r3_0 = vzipq_s16(i2_res_16x8_r3_0, i2_res_16x8_r3_1).val[0];

                /* second shift: lanes now hold 4*src, used for edge samples */
                i2_coeff1_16x8_r0_0 = vshlq_n_s16(i2_coeff1_16x8_r0_0, 1);
                i2_coeff1_16x8_r1_0 = vshlq_n_s16(i2_coeff1_16x8_r1_0, 1);
                i2_coeff1_16x8_r2_0 = vshlq_n_s16(i2_coeff1_16x8_r2_0, 1);
                i2_coeff1_16x8_r3_0 = vshlq_n_s16(i2_coeff1_16x8_r3_0, 1);

                /* each 4-sample row expands to 8 samples in the refarray:
                   edge, six interior filtered samples, edge */
                vst1q_s16(pi2_refarray_buffer + 1, i2_res_16x8_r0_0);
                vst1q_lane_s16(pi2_refarray_buffer, i2_coeff1_16x8_r0_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 7, i2_coeff1_16x8_r0_0, 3);

                vst1q_s16(pi2_refarray_buffer + 9, i2_res_16x8_r1_0);
                vst1q_lane_s16(pi2_refarray_buffer + 8, i2_coeff1_16x8_r1_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 15, i2_coeff1_16x8_r1_0, 3);

                vst1q_s16(pi2_refarray_buffer + 17, i2_res_16x8_r2_0);
                vst1q_lane_s16(pi2_refarray_buffer + 16, i2_coeff1_16x8_r2_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 23, i2_coeff1_16x8_r2_0, 3);

                vst1q_s16(pi2_refarray_buffer + 25, i2_res_16x8_r3_0);
                vst1q_lane_s16(pi2_refarray_buffer + 24, i2_coeff1_16x8_r3_0, 0);
                vst1q_lane_s16(pi2_refarray_buffer + 31, i2_coeff1_16x8_r3_0, 3);

                {
                    /* vertical interpolation of the 4x8 intermediate block
                       into the 8x8 output block */
                    int16x4_t i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r0_2;
                    int16x4_t i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r1_2;
                    int16x4_t i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r2_2;
                    int16x4_t i4_horz_samp_16x4_r3_1, i4_horz_samp_16x4_r3_2;

                    int32x4_t i4_horz_samp_32x4_r0_1, i4_horz_samp_32x4_r0_2;
                    int32x4_t i4_horz_samp_32x4_r1_1, i4_horz_samp_32x4_r1_2;
                    int32x4_t i4_horz_samp_32x4_r2_1, i4_horz_samp_32x4_r2_2;
                    int32x4_t i4_horz_samp_32x4_r3_1, i4_horz_samp_32x4_r3_2;

                    int32x4_t i4_horz_add_32x4_r1_1, i4_horz_add_32x4_r1_2;
                    int32x4_t i4_horz_add_32x4_r2_1, i4_horz_add_32x4_r2_2;
                    int32x4_t i4_horz_add_32x4_r3_1, i4_horz_add_32x4_r3_2;

                    int16x4_t i4_horz_res_16x4_r0_1, i4_horz_res_16x4_r0_2;
                    int16x4_t i4_horz_res_16x4_r1_1, i4_horz_res_16x4_r1_2;
                    int16x4_t i4_horz_res_16x4_r2_1, i4_horz_res_16x4_r2_2;
                    int16x4_t i4_horz_res_16x4_r3_1, i4_horz_res_16x4_r3_2;
                    int16x4_t i4_horz_res_16x4_r4_1, i4_horz_res_16x4_r4_2;
                    int16x4_t i4_horz_res_16x4_r5_1, i4_horz_res_16x4_r5_2;
                    int16x4_t i4_horz_res_16x4_r6_1, i4_horz_res_16x4_r6_2;
                    int16x4_t i4_horz_res_16x4_r7_1, i4_horz_res_16x4_r7_2;

                    int32x4_t i4_horz_res_32x4_r1_1, i4_horz_res_32x4_r1_2;
                    int32x4_t i4_horz_res_32x4_r2_1, i4_horz_res_32x4_r2_2;
                    int32x4_t i4_horz_res_32x4_r3_1, i4_horz_res_32x4_r3_2;
                    int32x4_t i4_horz_res_32x4_r4_1, i4_horz_res_32x4_r4_2;
                    int32x4_t i4_horz_res_32x4_r5_1, i4_horz_res_32x4_r5_2;
                    int32x4_t i4_horz_res_32x4_r6_1, i4_horz_res_32x4_r6_2;

                    i4_horz_samp_16x4_r0_1 = vld1_s16(pi2_refarray_buffer);
                    i4_horz_samp_16x4_r0_2 = vld1_s16(pi2_refarray_buffer + 4);

                    i4_horz_samp_16x4_r1_1 = vld1_s16(pi2_refarray_buffer + 8);
                    i4_horz_samp_16x4_r1_2 = vld1_s16(pi2_refarray_buffer + 12);

                    i4_horz_samp_16x4_r2_1 = vld1_s16(pi2_refarray_buffer + 16);
                    i4_horz_samp_16x4_r2_2 = vld1_s16(pi2_refarray_buffer + 20);

                    i4_horz_samp_16x4_r3_1 = vld1_s16(pi2_refarray_buffer + 24);
                    i4_horz_samp_16x4_r3_2 = vld1_s16(pi2_refarray_buffer + 28);

                    /* top edge output row: intermediate row 0 scaled by 1/4 */
                    i4_horz_res_16x4_r0_1 = vrshr_n_s16(i4_horz_samp_16x4_r0_1, 2);
                    i4_horz_res_16x4_r0_2 = vrshr_n_s16(i4_horz_samp_16x4_r0_2, 2);

                    i4_horz_add_32x4_r1_1 =
                        vaddl_s16(i4_horz_samp_16x4_r0_1, i4_horz_samp_16x4_r1_1);
                    i4_horz_add_32x4_r1_2 =
                        vaddl_s16(i4_horz_samp_16x4_r0_2, i4_horz_samp_16x4_r1_2);

                    i4_horz_add_32x4_r2_1 =
                        vaddl_s16(i4_horz_samp_16x4_r1_1, i4_horz_samp_16x4_r2_1);
                    i4_horz_add_32x4_r2_2 =
                        vaddl_s16(i4_horz_samp_16x4_r1_2, i4_horz_samp_16x4_r2_2);

                    i4_horz_add_32x4_r3_1 =
                        vaddl_s16(i4_horz_samp_16x4_r2_1, i4_horz_samp_16x4_r3_1);
                    i4_horz_add_32x4_r3_2 =
                        vaddl_s16(i4_horz_samp_16x4_r2_2, i4_horz_samp_16x4_r3_2);

                    /* widen to 32 bit while doubling (2 * row), so the
                       3*rA + rB sums below cannot overflow 16 bits */
                    i4_horz_samp_32x4_r0_1 = vshll_n_s16(i4_horz_samp_16x4_r0_1, 1);
                    i4_horz_samp_32x4_r0_2 = vshll_n_s16(i4_horz_samp_16x4_r0_2, 1);

                    i4_horz_samp_32x4_r1_1 = vshll_n_s16(i4_horz_samp_16x4_r1_1, 1);
                    i4_horz_samp_32x4_r1_2 = vshll_n_s16(i4_horz_samp_16x4_r1_2, 1);

                    i4_horz_samp_32x4_r2_1 = vshll_n_s16(i4_horz_samp_16x4_r2_1, 1);
                    i4_horz_samp_32x4_r2_2 = vshll_n_s16(i4_horz_samp_16x4_r2_2, 1);

                    i4_horz_samp_32x4_r3_1 = vshll_n_s16(i4_horz_samp_16x4_r3_1, 1);
                    i4_horz_samp_32x4_r3_2 = vshll_n_s16(i4_horz_samp_16x4_r3_2, 1);

                    i4_horz_res_32x4_r1_1 =
                        vaddq_s32(i4_horz_samp_32x4_r0_1, i4_horz_add_32x4_r1_1);
                    i4_horz_res_32x4_r1_2 =
                        vaddq_s32(i4_horz_samp_32x4_r0_2, i4_horz_add_32x4_r1_2);

                    i4_horz_res_32x4_r2_1 =
                        vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r1_1);
                    i4_horz_res_32x4_r2_2 =
                        vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r1_2);

                    i4_horz_res_32x4_r3_1 =
                        vaddq_s32(i4_horz_samp_32x4_r1_1, i4_horz_add_32x4_r2_1);
                    i4_horz_res_32x4_r3_2 =
                        vaddq_s32(i4_horz_samp_32x4_r1_2, i4_horz_add_32x4_r2_2);

                    i4_horz_res_32x4_r4_1 =
                        vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r2_1);
                    i4_horz_res_32x4_r4_2 =
                        vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r2_2);

                    i4_horz_res_32x4_r5_1 =
                        vaddq_s32(i4_horz_samp_32x4_r2_1, i4_horz_add_32x4_r3_1);
                    i4_horz_res_32x4_r5_2 =
                        vaddq_s32(i4_horz_samp_32x4_r2_2, i4_horz_add_32x4_r3_2);

                    i4_horz_res_32x4_r6_1 =
                        vaddq_s32(i4_horz_samp_32x4_r3_1, i4_horz_add_32x4_r3_1);
                    i4_horz_res_32x4_r6_2 =
                        vaddq_s32(i4_horz_samp_32x4_r3_2, i4_horz_add_32x4_r3_2);

                    /* saturating rounded narrow removes the combined x16
                       scale of the two filter passes */
                    i4_horz_res_16x4_r1_1 = vqrshrn_n_s32(i4_horz_res_32x4_r1_1, 4);
                    i4_horz_res_16x4_r1_2 = vqrshrn_n_s32(i4_horz_res_32x4_r1_2, 4);

                    i4_horz_res_16x4_r2_1 = vqrshrn_n_s32(i4_horz_res_32x4_r2_1, 4);
                    i4_horz_res_16x4_r2_2 = vqrshrn_n_s32(i4_horz_res_32x4_r2_2, 4);

                    i4_horz_res_16x4_r3_1 = vqrshrn_n_s32(i4_horz_res_32x4_r3_1, 4);
                    i4_horz_res_16x4_r3_2 = vqrshrn_n_s32(i4_horz_res_32x4_r3_2, 4);

                    i4_horz_res_16x4_r4_1 = vqrshrn_n_s32(i4_horz_res_32x4_r4_1, 4);
                    i4_horz_res_16x4_r4_2 = vqrshrn_n_s32(i4_horz_res_32x4_r4_2, 4);

                    i4_horz_res_16x4_r5_1 = vqrshrn_n_s32(i4_horz_res_32x4_r5_1, 4);
                    i4_horz_res_16x4_r5_2 = vqrshrn_n_s32(i4_horz_res_32x4_r5_2, 4);

                    i4_horz_res_16x4_r6_1 = vqrshrn_n_s32(i4_horz_res_32x4_r6_1, 4);
                    i4_horz_res_16x4_r6_2 = vqrshrn_n_s32(i4_horz_res_32x4_r6_2, 4);

                    /* bottom edge output row: intermediate row 3 scaled by 1/4 */
                    i4_horz_res_16x4_r7_1 = vrshr_n_s16(i4_horz_samp_16x4_r3_1, 2);
                    i4_horz_res_16x4_r7_2 = vrshr_n_s16(i4_horz_samp_16x4_r3_2, 2);

                    vst1_s16(pi2_out_res, i4_horz_res_16x4_r0_1);
                    vst1_s16(pi2_out_res + 4, i4_horz_res_16x4_r0_2);

                    vst1_s16(pi2_out_res + i4_out_res_stride, i4_horz_res_16x4_r1_1);
                    vst1_s16(pi2_out_res + i4_out_res_stride + 4, i4_horz_res_16x4_r1_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride << 1), i4_horz_res_16x4_r2_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride << 1) + 4, i4_horz_res_16x4_r2_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 3), i4_horz_res_16x4_r3_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 3) + 4, i4_horz_res_16x4_r3_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride << 2), i4_horz_res_16x4_r4_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride << 2) + 4, i4_horz_res_16x4_r4_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 5), i4_horz_res_16x4_r5_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 5) + 4, i4_horz_res_16x4_r5_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 6), i4_horz_res_16x4_r6_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 6) + 4, i4_horz_res_16x4_r6_2);

                    vst1_s16(pi2_out_res + (i4_out_res_stride * 7), i4_horz_res_16x4_r7_1);
                    vst1_s16(pi2_out_res + (i4_out_res_stride * 7) + 4, i4_horz_res_16x4_r7_2);

                    pi2_out_res += BLOCK_WIDTH;
                }
            }
            else
            {
                pi2_out_res += BLOCK_WIDTH;
            }

            /* Block level loop updates */
            if(1 == i4_blk_ctr)
            {
                /* move from the top pair of 4x4 blocks to the bottom pair */
                pi2_inp_data -= SUB_BLOCK_WIDTH;
                pi2_inp_data += (i4_inp_data_stride * SUB_BLOCK_HEIGHT);
                pi2_out_res -= MB_WIDTH;
                pi2_out_res += (i4_out_res_stride * BLOCK_HEIGHT);
                i4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += SUB_BLOCK_WIDTH;
            }

            i4_ref_nnz >>= 1;
        } /* end of loop over all the blocks */
    }

    return;
}
586
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_residual_neonintr                      */
/*                                                                           */
/*  Description   : this function does the upsampling of residuals.          */
/*                                                                           */
/*  Inputs        : pv_residual_samp_ctxt : residual upsampling context      */
/*                  pi2_out : output 16 bit buffer pointer                   */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_refarray_wd : reference array width                   */
/*                  u2_mb_x, u2_mb_y : mb co-ordinates                       */
/*                  i4_chroma_flag : luma or chroma processing flag          */
/*  Globals       : none                                                     */
/*  Processing    : it does the upsampling with fixed phase values and       */
/*                  reference layer transform size                           */
/*  Outputs       : upsampled residuals.                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         26 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
616
isvcd_interpolate_residual_neonintr(void * pv_residual_samp_ctxt,WORD16 * pi2_out,WORD32 i4_out_stride,WORD32 i4_refarray_wd,UWORD16 u2_mb_x,UWORD16 u2_mb_y,WORD32 i4_chroma_flag)617 void isvcd_interpolate_residual_neonintr(void *pv_residual_samp_ctxt, WORD16 *pi2_out,
618 WORD32 i4_out_stride, WORD32 i4_refarray_wd,
619 UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_chroma_flag)
620 {
621 residual_sampling_ctxt_t *ps_ctxt;
622 residual_samp_map_ctxt_t *ps_map_ctxt;
623 res_lyr_ctxt *ps_lyr_ctxt;
624 ref_pixel_map_t *ps_x_pos_phase;
625 ref_pixel_map_t *ps_y_pos_phase;
626
627 WORD32 i4_x, i4_y;
628 WORD32 i4_frm_mb_x, i4_frm_mb_y;
629 WORD32 i4_temp_array_ht;
630 WORD32 i4_mb_wd;
631 WORD32 i4_mb_ht;
632 WORD16 *pi2_ref_array;
633 UWORD8 *pu1_ref_x_ptr_incr, *pu1_ref_y_ptr_incr;
634
635 UWORD8 arr_y_ref_pos[16] = {0};
636 UWORD8 arr_x_ref_pos[16] = {0};
637 UWORD8 arr_x_ref_pos_low[16] = {0};
638 UWORD8 arr_x_phase[16] = {0};
639 UWORD8 arr_y_phase[16] = {0};
640 UWORD8 *pi1_y_ref_pos;
641 UWORD8 *pi1_x_ref_pos;
642 UWORD8 *pi1_x_ref_pos_low;
643 UWORD8 *pi1_y_phase;
644 UWORD8 *pi1_x_phase;
645
646 ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
647 ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
648 pi2_ref_array = ps_ctxt->pi2_refarray_buffer;
649 pu1_ref_x_ptr_incr = ps_ctxt->pu1_ref_x_ptr_incr;
650 pu1_ref_y_ptr_incr = ps_ctxt->pu1_ref_y_ptr_incr;
651
652 if(1 == i4_chroma_flag)
653 ps_map_ctxt = &ps_lyr_ctxt->s_chroma_map_ctxt;
654 else
655 ps_map_ctxt = &ps_lyr_ctxt->s_luma_map_ctxt;
656
657 i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
658 i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;
659
660 ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
661 ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;
662
663 i4_temp_array_ht = i4_mb_ht;
664 i4_frm_mb_y = u2_mb_y * i4_mb_ht;
665 i4_frm_mb_x = u2_mb_x * i4_mb_wd;
666
667 /* --------------------------------------------------------------------- */
668 /* Loop for interpolation */
669 /* --------------------------------------------------------------------- */
670 if(i4_chroma_flag == 0)
671 {
672 int16x8_t ref_arr_16x8_r0_0, ref_arr_16x8_r0_1;
673 int16x8_t ref_arr_16x8_r1_0, ref_arr_16x8_r1_1;
674 uint8x16_t x_ref_pos_mask_r0_0, x_ref_rnd_mask_r0_0;
675 uint8x16_t u1_incr_8x16_r0_0, x_ref_pos_mask_temp_r0_0, u1_incr_not_8x16_r0_0,
676 u1_y_incr_8x16_r0_0, phs_mask_8x16_0;
677 uint8x16_t u1_incr_8x16_r1_0, x_ref_pos_mask_temp_r1_0, u1_incr_not_8x16_r1_0;
678 int16x8_t ref_arr_temp0_16x8_r0_0, res_16x8_r0_0, vert_res_16x8_r0_0;
679 int16x8_t ref_arr_temp0_16x8_r1_0, res_16x8_r1_0, vert_res_16x8_r1_0;
680 int16x8_t ref_arr_temp1_16x8_r0_0;
681 int16x8_t ref_arr_temp1_16x8_r1_0;
682
683 uint8x16_t x_ref_pos_mask_temp_r0_1, u1_incr_not_8x16_r0_1;
684 uint8x16_t x_ref_pos_mask_temp_r1_1, u1_incr_not_8x16_r1_1;
685 int16x8_t ref_arr_temp0_16x8_r0_1, res_16x8_r0_1, vert_res_16x8_r0_1;
686 int16x8_t ref_arr_temp0_16x8_r1_1, res_16x8_r1_1, vert_res_16x8_r1_1;
687 int16x8_t ref_arr_temp1_16x8_r0_1;
688 int16x8_t ref_arr_temp1_16x8_r1_1;
689
690 uint16x8_t u1_y_incr_16x8_r0_0, u1_y_incr_16x8_r0_1;
691
692 uint8x16_t u1_incr_not_8x16_r0_0_even, u1_incr_not_8x16_r1_0_even,
693 x_ref_pos_mask_temp_r0_0_even, x_ref_pos_mask_temp_r1_0_even;
694 uint8x16_t u1_incr_not_8x16_r0_0_odd, u1_incr_not_8x16_r1_0_odd,
695 x_ref_pos_mask_temp_r0_0_odd, x_ref_pos_mask_temp_r1_0_odd;
696 uint8x16x2_t u1_incr_not_8x16_2, x_ref_pos_mask_temp;
697 int16x8_t prev_res_16x8_r0_0;
698 int16x8_t prev_res_16x8_r1_0;
699 int16x8_t prev_res_16x8_r0_1;
700 int16x8_t prev_res_16x8_r1_1;
701 uint8x8x2_t u1_incr_8x8x2_t;
702 uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1;
703 uint16x8_t u1_prev_y_incr_16x8_r0_0;
704 uint16x8_t u1_prev_y_incr_16x8_r0_1;
705
706 WORD32 zero_r0_r1 = 0;
707
708 int32x4_t res_32x4_l_0, res_32x4_h_0;
709 int32x4_t res_32x4_l_1, res_32x4_h_1;
710 int16x8_t res_16x8_l, res_16x8_h;
711 uint16x8_t phs_mask_16x8_0, phs_mask_16x8_1;
712 int16x8_t const_16_16x8, phs_mask_16min_16x8_0;
713 int16x8_t dup_val_1, dup_val_2, dup_val_3, dup_val_4, dup_val_5, dup_abs;
714 uint8x16_t phs_mask_div8_8x16_0, mid_indx_8x16;
715 int16x8_t phs_mask_16min_16x8_1;
716 uint16x8_t ones = vdupq_n_u16(0xFFFF);
717 uint8x16_t const_ones = vdupq_n_u8(1);
718 uint8x8x2_t u1_temp_8x8x2_t;
719 uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
720
721 WORD16 *pi2_ref_array_temp;
722 UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
723 WORD32 i4_y_phase;
724 WORD32 out_stride_temp;
725 WORD32 strt_indx_h = 0;
726
727 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
728 {
729 arr_y_phase[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
730 arr_y_ref_pos[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
731 }
732 pi1_y_ref_pos = arr_y_ref_pos;
733 pi1_y_phase = arr_y_phase;
734
735 strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos);
736 for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
737 {
738 arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
739 arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
740 }
741
742 pi1_x_ref_pos = arr_x_ref_pos;
743 pi1_x_phase = arr_x_phase;
744
745 phs_mask_8x16_0 = vld1q_u8((pi1_x_phase));
746 phs_mask_16x8_0 = vmovl_u8(vget_low_u8(phs_mask_8x16_0));
747 phs_mask_16x8_1 = vmovl_u8(vld1_u8((pi1_x_phase + 8)));
748 x_ref_pos_mask_r0_0 = vld1q_u8((pi1_x_ref_pos));
749 const_16_16x8 = vdupq_n_s16(16);
750 phs_mask_div8_8x16_0 = vshrq_n_u8(phs_mask_8x16_0, 3);
751
752 phs_mask_16min_16x8_0 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_0));
753 phs_mask_16min_16x8_1 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_1));
754
755 x_ref_rnd_mask_r0_0 = vaddq_u8(x_ref_pos_mask_r0_0, phs_mask_div8_8x16_0);
756 mid_indx_8x16 = vdupq_n_u8((strt_indx_h << 1));
757
758 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
759 {
760 if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
761 {
762 if(!zero_r0_r1)
763 {
764 res_16x8_l = vdupq_n_s16(0);
765 res_16x8_h = vdupq_n_s16(0);
766
767 out_stride_temp = (i4_y * i4_out_stride);
768 vst1q_s16((pi2_out + out_stride_temp), res_16x8_l);
769 vst1q_s16((pi2_out + out_stride_temp + 8), res_16x8_h);
770 continue;
771 }
772
773 res_16x8_r0_0 = prev_res_16x8_r0_0;
774 res_16x8_r1_0 = prev_res_16x8_r1_0;
775 res_16x8_r0_1 = prev_res_16x8_r0_1;
776 res_16x8_r1_1 = prev_res_16x8_r1_1;
777
778 u1_y_incr_16x8_r0_0 = u1_prev_y_incr_16x8_r0_0;
779 u1_y_incr_16x8_r0_1 = u1_prev_y_incr_16x8_r0_1;
780 }
781 else
782 {
783 pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
784 pu1_ref_x_ptr_incr_temp =
785 pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
786 ref_arr_16x8_r0_0 = vld1q_s16((pi2_ref_array_temp));
787 ref_arr_16x8_r1_0 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd));
788 ref_arr_16x8_r0_1 = vld1q_s16((pi2_ref_array_temp + strt_indx_h));
789 ref_arr_16x8_r1_1 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd + strt_indx_h));
790
791 dup_val_1 = vabsq_s16(ref_arr_16x8_r0_0);
792 dup_val_2 = vabsq_s16(ref_arr_16x8_r1_0);
793 dup_val_3 = vabsq_s16(ref_arr_16x8_r0_1);
794 dup_val_4 = vabsq_s16(ref_arr_16x8_r1_1);
795 dup_val_5 = vqaddq_s16(dup_val_1, dup_val_2);
796 dup_val_1 = vqaddq_s16(dup_val_3, dup_val_4);
797 dup_abs = vqaddq_s16(dup_val_1, dup_val_5);
798 zero_r0_r1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] ||
799 dup_abs[5] || dup_abs[6] || dup_abs[7];
800 if(zero_r0_r1)
801 {
802 u1_incr_8x16_r0_0 = vld1q_u8((pu1_ref_x_ptr_incr_temp));
803 u1_incr_8x16_r1_0 = vld1q_u8((pu1_ref_x_ptr_incr_temp + i4_refarray_wd));
804 u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_incr_8x16_r0_0);
805 u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_incr_8x16_r0_0);
806 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
807 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
808 u1_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
809
810 u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_incr_8x16_r1_0);
811 u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_incr_8x16_r1_0);
812 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
813 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
814 u1_incr_8x16_r1_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
815 u1_incr_not_8x16_r0_0 = vbicq_u8(phs_mask_div8_8x16_0, u1_incr_8x16_r0_0);
816 u1_incr_not_8x16_r1_0 = vbicq_u8(phs_mask_div8_8x16_0, u1_incr_8x16_r1_0);
817
818 u1_incr_not_8x16_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, x_ref_pos_mask_r0_0);
819 u1_incr_not_8x16_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, x_ref_pos_mask_r0_0);
820
821 x_ref_pos_mask_temp_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, u1_incr_8x16_r0_0);
822 x_ref_pos_mask_temp_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, u1_incr_8x16_r1_0);
823
824 u1_incr_not_8x16_r0_0_even = vshlq_n_u8(u1_incr_not_8x16_r0_0, 1);
825 u1_incr_not_8x16_r1_0_even = vshlq_n_u8(u1_incr_not_8x16_r1_0, 1);
826 x_ref_pos_mask_temp_r0_0_even = vshlq_n_u8(x_ref_pos_mask_temp_r0_0, 1);
827 x_ref_pos_mask_temp_r1_0_even = vshlq_n_u8(x_ref_pos_mask_temp_r1_0, 1);
828
829 u1_incr_not_8x16_r0_0_odd = vaddq_u8(u1_incr_not_8x16_r0_0_even, const_ones);
830 u1_incr_not_8x16_r1_0_odd = vaddq_u8(u1_incr_not_8x16_r1_0_even, const_ones);
831 x_ref_pos_mask_temp_r0_0_odd =
832 vaddq_u8(x_ref_pos_mask_temp_r0_0_even, const_ones);
833 x_ref_pos_mask_temp_r1_0_odd =
834 vaddq_u8(x_ref_pos_mask_temp_r1_0_even, const_ones);
835
836 u1_incr_not_8x16_2 =
837 vzipq_u8(u1_incr_not_8x16_r0_0_even, u1_incr_not_8x16_r0_0_odd);
838
839 u1_incr_not_8x16_r0_0 = u1_incr_not_8x16_2.val[0];
840 u1_incr_not_8x16_r0_1 = u1_incr_not_8x16_2.val[1];
841
842 u1_incr_not_8x16_2 =
843 vzipq_u8(u1_incr_not_8x16_r1_0_even, u1_incr_not_8x16_r1_0_odd);
844 u1_incr_not_8x16_r1_0 = u1_incr_not_8x16_2.val[0];
845 u1_incr_not_8x16_r1_1 = u1_incr_not_8x16_2.val[1];
846
847 x_ref_pos_mask_temp =
848 vzipq_u8(x_ref_pos_mask_temp_r0_0_even, x_ref_pos_mask_temp_r0_0_odd);
849 x_ref_pos_mask_temp_r0_0 = x_ref_pos_mask_temp.val[0];
850 x_ref_pos_mask_temp_r0_1 = x_ref_pos_mask_temp.val[1];
851
852 x_ref_pos_mask_temp =
853 vzipq_u8(x_ref_pos_mask_temp_r1_0_even, x_ref_pos_mask_temp_r1_0_odd);
854 x_ref_pos_mask_temp_r1_0 = x_ref_pos_mask_temp.val[0];
855 x_ref_pos_mask_temp_r1_1 = x_ref_pos_mask_temp.val[1];
856 u1_incr_not_8x16_r0_1 = vsubq_u8(u1_incr_not_8x16_r0_1, mid_indx_8x16);
857 u1_incr_not_8x16_r1_1 = vsubq_u8(u1_incr_not_8x16_r1_1, mid_indx_8x16);
858 x_ref_pos_mask_temp_r0_1 = vsubq_u8(x_ref_pos_mask_temp_r0_1, mid_indx_8x16);
859 x_ref_pos_mask_temp_r1_1 = vsubq_u8(x_ref_pos_mask_temp_r1_1, mid_indx_8x16);
860
861 u1_temp_8x8x2_t.val[0] =
862 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
863 u1_temp_8x8x2_t.val[1] =
864 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
865 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_0));
866 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_0));
867 ref_arr_temp0_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
868 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
869
870 u1_temp_8x8x2_t.val[0] =
871 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
872 u1_temp_8x8x2_t.val[1] =
873 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
874 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_0));
875 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_0));
876 ref_arr_temp0_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
877 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
878
879 u1_temp_8x8x2_t.val[0] =
880 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
881 u1_temp_8x8x2_t.val[1] =
882 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
883 u1_temp_8x8_t0 =
884 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_0));
885 u1_temp_8x8_t1 =
886 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_0));
887 ref_arr_temp1_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
888 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
889
890 u1_temp_8x8x2_t.val[0] =
891 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
892 u1_temp_8x8x2_t.val[1] =
893 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
894 u1_temp_8x8_t0 =
895 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_0));
896 u1_temp_8x8_t1 =
897 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_0));
898 ref_arr_temp1_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
899 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
900
901 u1_temp_8x8x2_t.val[0] =
902 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
903 u1_temp_8x8x2_t.val[1] =
904 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
905 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_1));
906 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_1));
907 ref_arr_temp0_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8(
908 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
909
910 u1_temp_8x8x2_t.val[0] =
911 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
912 u1_temp_8x8x2_t.val[1] =
913 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
914 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_1));
915 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_1));
916 ref_arr_temp0_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8(
917 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
918
919 u1_temp_8x8x2_t.val[0] =
920 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
921 u1_temp_8x8x2_t.val[1] =
922 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_1)));
923 u1_temp_8x8_t0 =
924 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_1));
925 u1_temp_8x8_t1 =
926 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_1));
927 ref_arr_temp1_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8(
928 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
929
930 u1_temp_8x8x2_t.val[0] =
931 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
932 u1_temp_8x8x2_t.val[1] =
933 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_1)));
934 u1_temp_8x8_t0 =
935 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_1));
936 u1_temp_8x8_t1 =
937 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_1));
938 ref_arr_temp1_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8(
939 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
940
941 res_16x8_r0_0 = vmulq_s16(ref_arr_temp0_16x8_r0_0, phs_mask_16min_16x8_0);
942 res_16x8_r1_0 = vmulq_s16(ref_arr_temp0_16x8_r1_0, phs_mask_16min_16x8_0);
943 res_16x8_r0_0 = vmlaq_s16(res_16x8_r0_0, ref_arr_temp1_16x8_r0_0,
944 vreinterpretq_s16_u16(phs_mask_16x8_0));
945 res_16x8_r1_0 = vmlaq_s16(res_16x8_r1_0, ref_arr_temp1_16x8_r1_0,
946 vreinterpretq_s16_u16(phs_mask_16x8_0));
947 res_16x8_r0_1 = vmulq_s16(ref_arr_temp0_16x8_r0_1, phs_mask_16min_16x8_1);
948 res_16x8_r1_1 = vmulq_s16(ref_arr_temp0_16x8_r1_1, phs_mask_16min_16x8_1);
949 res_16x8_r0_1 = vmlaq_s16(res_16x8_r0_1, ref_arr_temp1_16x8_r0_1,
950 vreinterpretq_s16_u16(phs_mask_16x8_1));
951 res_16x8_r1_1 = vmlaq_s16(res_16x8_r1_1, ref_arr_temp1_16x8_r1_1,
952 vreinterpretq_s16_u16(phs_mask_16x8_1));
953
954 pu1_ref_y_ptr_incr_temp =
955 pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
956 u1_y_incr_8x16_r0_0 = vld1q_u8((pu1_ref_y_ptr_incr_temp));
957
958 u1_incr_8x8x2_t.val[0] = vget_low_u8(u1_y_incr_8x16_r0_0);
959 u1_incr_8x8x2_t.val[1] = vget_high_u8(u1_y_incr_8x16_r0_0);
960 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_rnd_mask_r0_0));
961 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_rnd_mask_r0_0));
962 u1_y_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
963 u1_y_incr_16x8_r0_0 = vmovl_u8(vget_low_u8(u1_y_incr_8x16_r0_0));
964 u1_y_incr_16x8_r0_1 = vmovl_u8(vget_high_u8(u1_y_incr_8x16_r0_0));
965 u1_y_incr_16x8_r0_0 = vtstq_u16(u1_y_incr_16x8_r0_0, ones);
966 u1_y_incr_16x8_r0_1 = vtstq_u16(u1_y_incr_16x8_r0_1, ones);
967
968 prev_res_16x8_r0_0 = res_16x8_r0_0;
969 prev_res_16x8_r1_0 = res_16x8_r1_0;
970 prev_res_16x8_r0_1 = res_16x8_r0_1;
971 prev_res_16x8_r1_1 = res_16x8_r1_1;
972
973 u1_prev_y_incr_16x8_r0_0 = u1_y_incr_16x8_r0_0;
974 u1_prev_y_incr_16x8_r0_1 = u1_y_incr_16x8_r0_1;
975 }
976 }
977
978 if(!zero_r0_r1)
979 {
980 res_16x8_l = vdupq_n_s16(0);
981 res_16x8_h = vdupq_n_s16(0);
982 }
983 else
984 {
985 i4_y_phase = pi1_y_phase[i4_y];
986
987 if((i4_y_phase) >> 3)
988 {
989 vert_res_16x8_r0_0 =
990 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r1_0);
991 vert_res_16x8_r1_0 =
992 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r1_0);
993 vert_res_16x8_r0_1 =
994 vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r0_1, res_16x8_r1_1);
995 vert_res_16x8_r1_1 =
996 vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r1_1, res_16x8_r1_1);
997 }
998 else
999 {
1000 vert_res_16x8_r0_0 =
1001 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r0_0);
1002 vert_res_16x8_r1_0 =
1003 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r0_0);
1004 vert_res_16x8_r0_1 =
1005 vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r0_1, res_16x8_r0_1);
1006 vert_res_16x8_r1_1 =
1007 vbslq_s16(u1_y_incr_16x8_r0_1, res_16x8_r1_1, res_16x8_r0_1);
1008 }
1009
1010 res_32x4_l_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1011 res_32x4_l_0 =
1012 vmlal_n_s16(res_32x4_l_0, vget_low_s16(vert_res_16x8_r1_0), i4_y_phase);
1013
1014 res_32x4_l_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1015 res_32x4_l_1 =
1016 vmlal_n_s16(res_32x4_l_1, vget_high_s16(vert_res_16x8_r1_0), i4_y_phase);
1017 res_32x4_h_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_1), 16 - i4_y_phase);
1018 res_32x4_h_0 =
1019 vmlal_n_s16(res_32x4_h_0, vget_low_s16(vert_res_16x8_r1_1), i4_y_phase);
1020 res_32x4_h_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_1), 16 - i4_y_phase);
1021 res_32x4_h_1 =
1022 vmlal_n_s16(res_32x4_h_1, vget_high_s16(vert_res_16x8_r1_1), i4_y_phase);
1023
1024 res_32x4_l_0 = vrshrq_n_s32(res_32x4_l_0, 8);
1025 res_32x4_l_1 = vrshrq_n_s32(res_32x4_l_1, 8);
1026 res_32x4_h_0 = vrshrq_n_s32(res_32x4_h_0, 8);
1027 res_32x4_h_1 = vrshrq_n_s32(res_32x4_h_1, 8);
1028
1029 res_16x8_l = vcombine_s16(vmovn_s32(res_32x4_l_0), vmovn_s32(res_32x4_l_1));
1030 res_16x8_h = vcombine_s16(vmovn_s32(res_32x4_h_0), vmovn_s32(res_32x4_h_1));
1031 }
1032
1033 out_stride_temp = (i4_y * i4_out_stride);
1034 vst1q_s16((pi2_out + out_stride_temp), res_16x8_l);
1035 vst1q_s16((pi2_out + out_stride_temp + 8), res_16x8_h);
1036 }
1037 }
1038 else
1039 {
1040 int16x8_t ref_arr_16x8_r0_0;
1041 int16x8_t ref_arr_16x8_r1_0;
1042 uint8x16_t x_ref_pos_mask_r0_0, x_ref_rnd_mask_r0_0;
1043 uint16x8_t u1_incr_16x8_r0_0, u1_incr_mask_16x8_r0_0, phs_mask_16x8_0,
1044 u1_incr_not_16x8_r0_0, u1_y_incr_16x8_r0_0;
1045 uint16x8_t u1_incr_16x8_r1_0, u1_incr_mask_16x8_r1_0, u1_incr_not_16x8_r1_0;
1046 uint8x16_t u1_incr_8x16_r0_0, x_ref_pos_mask_temp_r0_0, u1_incr_mask_8x16_r0_0,
1047 u1_incr_not_8x16_r0_0, u1_y_incr_8x16_r0_0;
1048 uint8x16_t u1_incr_8x16_r1_0, x_ref_pos_mask_temp_r1_0, u1_incr_mask_8x16_r1_0,
1049 u1_incr_not_8x16_r1_0;
1050 int16x8_t ref_arr_temp0_16x8_r0_0, res_16x8_r0_0, vert_res_16x8_r0_0;
1051 int16x8_t ref_arr_temp0_16x8_r1_0, res_16x8_r1_0, vert_res_16x8_r1_0;
1052 int16x8_t ref_arr_temp1_16x8_r0_0;
1053 int16x8_t ref_arr_temp1_16x8_r1_0;
1054
1055 int32x4_t res_32x4_l_0;
1056 int32x4_t res_32x4_l_1;
1057 int16x8_t out_16x8_0, out_16x8_1;
1058 uint16x8_t phs_mask_div8_16x8_0, phs_mask_div8_msb_16x8_0;
1059 int16x8_t const_16_16x8, phs_mask_16min_16x8_0;
1060 uint8x8x2_t u1_temp_8x8x2_t;
1061 uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
1062
1063 uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));
1064 uint16x8_t ones = vdupq_n_u16(0xFFFF);
1065
1066 WORD16 *pi2_ref_array_temp;
1067 UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
1068 WORD32 i4_y_phase;
1069 uint8x8x2_t u1_incr_8x8x2_t;
1070 uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1;
1071 int16x8_t prev_res_16x8_r0_0;
1072 int16x8_t prev_res_16x8_r1_0;
1073 int16x8_t dup_val_1, dup_val_2, dup_abs;
1074 uint16x8_t u1_prev_y_incr_16x8_r0_0;
1075
1076 WORD32 out_stride_temp;
1077 WORD32 zero_r0_r1 = 0;
1078 WORD32 i4_x2 = 0;
1079 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1080 {
1081 arr_y_phase[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
1082 arr_y_ref_pos[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
1083 }
1084 pi1_y_ref_pos = arr_y_ref_pos;
1085 pi1_y_phase = arr_y_phase;
1086
1087 for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
1088 {
1089 arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
1090 arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
1091 i4_x2 = i4_x << 1;
1092 arr_x_ref_pos_low[i4_x2] = (arr_x_ref_pos[i4_x]) << 1;
1093 arr_x_ref_pos_low[i4_x2 + 1] = arr_x_ref_pos_low[i4_x2] + 1;
1094 }
1095
1096 pi1_x_ref_pos_low = arr_x_ref_pos_low;
1097 pi1_x_phase = arr_x_phase;
1098
1099 phs_mask_16x8_0 = vmovl_u8(vld1_u8((pi1_x_phase)));
1100 x_ref_pos_mask_r0_0 = vld1q_u8((pi1_x_ref_pos_low));
1101 const_16_16x8 = vdupq_n_s16(16);
1102 phs_mask_div8_16x8_0 = vshrq_n_u16(phs_mask_16x8_0, 3);
1103 phs_mask_div8_msb_16x8_0 = vsliq_n_u16(phs_mask_div8_16x8_0, phs_mask_div8_16x8_0, 8);
1104
1105 phs_mask_16min_16x8_0 = vsubq_s16(const_16_16x8, vreinterpretq_s16_u16(phs_mask_16x8_0));
1106
1107 x_ref_rnd_mask_r0_0 = vaddq_u8(
1108 x_ref_pos_mask_r0_0, vreinterpretq_u8_u16(vshlq_n_u16(phs_mask_div8_msb_16x8_0, 1)));
1109 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1110 {
1111 if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
1112 {
1113 if(!zero_r0_r1)
1114 {
1115 res_32x4_l_0 = vdupq_n_s32(0);
1116 res_32x4_l_1 = vdupq_n_s32(0);
1117
1118 out_stride_temp = (i4_y * i4_out_stride);
1119
1120 out_16x8_0 = vld1q_s16(pi2_out + out_stride_temp);
1121 out_16x8_1 = vld1q_s16(pi2_out + out_stride_temp + 8);
1122 out_16x8_0 = vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_0),
1123 out_16x8_0);
1124 out_16x8_1 = vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_1),
1125 out_16x8_1);
1126 vst1q_s16((pi2_out + out_stride_temp), out_16x8_0);
1127 vst1q_s16((pi2_out + out_stride_temp + 8), out_16x8_1);
1128 continue;
1129 }
1130
1131 res_16x8_r0_0 = prev_res_16x8_r0_0;
1132 res_16x8_r1_0 = prev_res_16x8_r1_0;
1133
1134 u1_y_incr_16x8_r0_0 = u1_prev_y_incr_16x8_r0_0;
1135 }
1136 else
1137 {
1138 pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1139 pu1_ref_x_ptr_incr_temp =
1140 pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1141 ref_arr_16x8_r0_0 = vld1q_s16((pi2_ref_array_temp));
1142 ref_arr_16x8_r1_0 = vld1q_s16((pi2_ref_array_temp + i4_refarray_wd));
1143
1144 dup_val_1 = vabsq_s16(ref_arr_16x8_r0_0);
1145 dup_val_2 = vabsq_s16(ref_arr_16x8_r1_0);
1146 dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
1147 zero_r0_r1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] ||
1148 dup_abs[5] || dup_abs[6] || dup_abs[7];
1149 if(zero_r0_r1)
1150 {
1151 u1_incr_16x8_r0_0 = (vmovl_u8(vld1_u8((pu1_ref_x_ptr_incr_temp))));
1152 u1_incr_16x8_r1_0 =
1153 (vmovl_u8(vld1_u8((pu1_ref_x_ptr_incr_temp + i4_refarray_wd))));
1154 u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_incr_16x8_r0_0));
1155 u1_incr_8x8x2_t.val[1] = vget_high_u8(vreinterpretq_u8_u16(u1_incr_16x8_r0_0));
1156 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
1157 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
1158 u1_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
1159
1160 u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_incr_16x8_r1_0));
1161 u1_incr_8x8x2_t.val[1] = vget_high_u8(vreinterpretq_u8_u16(u1_incr_16x8_r1_0));
1162 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_pos_mask_r0_0));
1163 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_pos_mask_r0_0));
1164 u1_incr_8x16_r1_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
1165
1166 u1_incr_16x8_r0_0 = vreinterpretq_u16_u8(u1_incr_8x16_r0_0);
1167 u1_incr_16x8_r1_0 = vreinterpretq_u16_u8(u1_incr_8x16_r1_0);
1168 u1_incr_mask_16x8_r0_0 = vsliq_n_u16(u1_incr_16x8_r0_0, u1_incr_16x8_r0_0, 8);
1169 u1_incr_mask_16x8_r1_0 = vsliq_n_u16(u1_incr_16x8_r1_0, u1_incr_16x8_r1_0, 8);
1170
1171 u1_incr_not_16x8_r0_0 =
1172 vbicq_u16(phs_mask_div8_msb_16x8_0, u1_incr_mask_16x8_r0_0);
1173 u1_incr_not_16x8_r1_0 =
1174 vbicq_u16(phs_mask_div8_msb_16x8_0, u1_incr_mask_16x8_r1_0);
1175
1176 u1_incr_mask_8x16_r0_0 =
1177 vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_mask_16x8_r0_0, 1));
1178 u1_incr_mask_8x16_r1_0 =
1179 vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_mask_16x8_r1_0, 1));
1180 u1_incr_not_8x16_r0_0 =
1181 vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_not_16x8_r0_0, 1));
1182 u1_incr_not_8x16_r1_0 =
1183 vreinterpretq_u8_u16(vshlq_n_u16(u1_incr_not_16x8_r1_0, 1));
1184
1185 u1_incr_not_8x16_r0_0 = vaddq_u8(u1_incr_not_8x16_r0_0, x_ref_pos_mask_r0_0);
1186 u1_incr_not_8x16_r1_0 = vaddq_u8(u1_incr_not_8x16_r1_0, x_ref_pos_mask_r0_0);
1187
1188 x_ref_pos_mask_temp_r0_0 =
1189 vaddq_u8(u1_incr_not_8x16_r0_0, u1_incr_mask_8x16_r0_0);
1190 x_ref_pos_mask_temp_r1_0 =
1191 vaddq_u8(u1_incr_not_8x16_r1_0, u1_incr_mask_8x16_r1_0);
1192
1193 u1_temp_8x8x2_t.val[0] =
1194 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1195 u1_temp_8x8x2_t.val[1] =
1196 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1197 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r0_0));
1198 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r0_0));
1199 ref_arr_temp0_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
1200 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1201
1202 u1_temp_8x8x2_t.val[0] =
1203 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1204 u1_temp_8x8x2_t.val[1] =
1205 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1206 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(u1_incr_not_8x16_r1_0));
1207 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(u1_incr_not_8x16_r1_0));
1208 ref_arr_temp0_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
1209 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1210
1211 u1_temp_8x8x2_t.val[0] =
1212 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1213 u1_temp_8x8x2_t.val[1] =
1214 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r0_0)));
1215 u1_temp_8x8_t0 =
1216 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r0_0));
1217 u1_temp_8x8_t1 =
1218 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r0_0));
1219 ref_arr_temp1_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
1220 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1221
1222 u1_temp_8x8x2_t.val[0] =
1223 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1224 u1_temp_8x8x2_t.val[1] =
1225 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(ref_arr_16x8_r1_0)));
1226 u1_temp_8x8_t0 =
1227 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_mask_temp_r1_0));
1228 u1_temp_8x8_t1 =
1229 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_mask_temp_r1_0));
1230 ref_arr_temp1_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
1231 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
1232 res_16x8_r0_0 = vmulq_s16(ref_arr_temp0_16x8_r0_0, phs_mask_16min_16x8_0);
1233 res_16x8_r1_0 = vmulq_s16(ref_arr_temp0_16x8_r1_0, phs_mask_16min_16x8_0);
1234 res_16x8_r0_0 = vmlaq_s16(res_16x8_r0_0, ref_arr_temp1_16x8_r0_0,
1235 vreinterpretq_s16_u16(phs_mask_16x8_0));
1236 res_16x8_r1_0 = vmlaq_s16(res_16x8_r1_0, ref_arr_temp1_16x8_r1_0,
1237 vreinterpretq_s16_u16(phs_mask_16x8_0));
1238
1239 pu1_ref_y_ptr_incr_temp =
1240 pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
1241 u1_y_incr_16x8_r0_0 = vmovl_u8(vld1_u8((pu1_ref_y_ptr_incr_temp)));
1242 u1_incr_8x8x2_t.val[0] = vget_low_u8(vreinterpretq_u8_u16(u1_y_incr_16x8_r0_0));
1243 u1_incr_8x8x2_t.val[1] =
1244 vget_high_u8(vreinterpretq_u8_u16(u1_y_incr_16x8_r0_0));
1245 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(x_ref_rnd_mask_r0_0));
1246 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(x_ref_rnd_mask_r0_0));
1247 u1_y_incr_8x16_r0_0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
1248
1249 u1_y_incr_16x8_r0_0 = vreinterpretq_u16_u8(u1_y_incr_8x16_r0_0);
1250
1251 u1_y_incr_16x8_r0_0 = vtstq_u16(u1_y_incr_16x8_r0_0, ones);
1252
1253 prev_res_16x8_r0_0 = res_16x8_r0_0;
1254 prev_res_16x8_r1_0 = res_16x8_r1_0;
1255
1256 u1_prev_y_incr_16x8_r0_0 = u1_y_incr_16x8_r0_0;
1257 }
1258 }
1259
1260 if(!zero_r0_r1)
1261 {
1262 res_32x4_l_0 = vdupq_n_s32(0);
1263 res_32x4_l_1 = vdupq_n_s32(0);
1264 }
1265 else
1266 {
1267 i4_y_phase = pi1_y_phase[i4_y];
1268
1269 if((i4_y_phase) >> 3)
1270 {
1271 vert_res_16x8_r0_0 =
1272 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r1_0);
1273 vert_res_16x8_r1_0 =
1274 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r1_0);
1275 }
1276 else
1277 {
1278 vert_res_16x8_r0_0 =
1279 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r0_0, res_16x8_r0_0);
1280 vert_res_16x8_r1_0 =
1281 vbslq_s16(u1_y_incr_16x8_r0_0, res_16x8_r1_0, res_16x8_r0_0);
1282 }
1283 res_32x4_l_0 = vmull_n_s16(vget_low_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1284 res_32x4_l_0 =
1285 vmlal_n_s16(res_32x4_l_0, vget_low_s16(vert_res_16x8_r1_0), i4_y_phase);
1286
1287 res_32x4_l_1 = vmull_n_s16(vget_high_s16(vert_res_16x8_r0_0), 16 - i4_y_phase);
1288 res_32x4_l_1 =
1289 vmlal_n_s16(res_32x4_l_1, vget_high_s16(vert_res_16x8_r1_0), i4_y_phase);
1290
1291 res_32x4_l_0 = vrshrq_n_s32(res_32x4_l_0, 8);
1292 res_32x4_l_1 = vrshrq_n_s32(res_32x4_l_1, 8);
1293 }
1294 out_stride_temp = (i4_y * i4_out_stride);
1295
1296 out_16x8_0 = vld1q_s16(pi2_out + out_stride_temp);
1297 out_16x8_1 = vld1q_s16(pi2_out + out_stride_temp + 8);
1298 out_16x8_0 =
1299 vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_0), out_16x8_0);
1300 out_16x8_1 =
1301 vbslq_s16(chroma_mask_16x8, vreinterpretq_s16_s32(res_32x4_l_1), out_16x8_1);
1302 vst1q_s16((pi2_out + out_stride_temp), out_16x8_0);
1303 vst1q_s16((pi2_out + out_stride_temp + 8), out_16x8_1);
1304 }
1305 }
1306 return;
1307 } /* End of Interpolation Function */
1308
1309 /*****************************************************************************/
1310 /* */
1311 /* Function Name : isvcd_residual_reflayer_const_non_boundary_mb_neonintr */
1312 /* */
1313 /* Description : */
1314 /* */
1315 /* Inputs : */
1316 /* Globals : none */
1317 /* Processing : */
1318 /* */
1319 /* Outputs : none */
1320 /* Returns : none */
1321 /* */
1322 /* Issues : none */
1323 /* */
1324 /* Revision History: */
1325 /* */
1326 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1327 /* 25 11 2021 Dolan creation */
1328 /* */
1329 /*****************************************************************************/
1330
isvcd_residual_reflayer_const_non_boundary_mb_neonintr(WORD16 * pi2_inp_data,WORD32 i4_inp_data_stride,WORD16 * pi2_ref_array,WORD32 i4_refarray_wd,WORD32 i4_refarray_ht,WORD32 i4_ref_mb_type_q0,WORD32 i4_ref_mb_type_q1,WORD32 i4_ref_mb_type_q2,WORD32 i4_ref_mb_type_q3,WORD32 i4_mb_quard1_part_x,WORD32 i4_mb_quard1_part_y,WORD32 i4_chroma_flag)1331 void isvcd_residual_reflayer_const_non_boundary_mb_neonintr(
1332 WORD16 *pi2_inp_data, WORD32 i4_inp_data_stride, WORD16 *pi2_ref_array, WORD32 i4_refarray_wd,
1333 WORD32 i4_refarray_ht, WORD32 i4_ref_mb_type_q0, WORD32 i4_ref_mb_type_q1,
1334 WORD32 i4_ref_mb_type_q2, WORD32 i4_ref_mb_type_q3, WORD32 i4_mb_quard1_part_x,
1335 WORD32 i4_mb_quard1_part_y, WORD32 i4_chroma_flag)
1336 {
1337 WORD32 i4_y;
1338 WORD16 *pi2_ref_data_byte;
1339 WORD16 *pi2_ref_array_temp;
1340
1341 if(i4_chroma_flag == 0)
1342 {
1343 WORD16 index_0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
1344 int16x8_t ref_mb_type_16x8_q0, ref_mb_type_16x8_q1, ref_mb_type_16x8_q2,
1345 ref_mb_type_16x8_q3, mb_quard1_part_x_16x8;
1346 int16x8_t ref_mb_type_16x8_0, ref_mb_type_16x8_1;
1347 int16x8_t ref_mb_type_16x8_low_0, ref_mb_type_16x8_low_1;
1348 uint16x8_t mb_type_mask_16x8_0, mb_type_mask_16x8_1;
1349 uint16x8_t mb_type_mask_16x8_low_0, mb_type_mask_16x8_low_1;
1350 uint16x8_t mask_16x8_0;
1351
1352 int16x8_t index_arr_0;
1353 int16x8_t inp_data_16x8_0, inp_data_16x8_1;
1354 int16x8_t res_16x8_0, res_16x8_1;
1355 int16x8_t one_16x8 = vdupq_n_s16(1);
1356 int16x8_t zero_16x8 = vdupq_n_s16(0);
1357
1358 index_arr_0 = vld1q_s16(&index_0[0]);
1359
1360 ref_mb_type_16x8_q0 = vdupq_n_s16(i4_ref_mb_type_q0);
1361 ref_mb_type_16x8_q1 = vdupq_n_s16(i4_ref_mb_type_q1);
1362 ref_mb_type_16x8_q2 = vdupq_n_s16(i4_ref_mb_type_q2);
1363 ref_mb_type_16x8_q3 = vdupq_n_s16(i4_ref_mb_type_q3);
1364 if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
1365 {
1366 // Quard 0
1367 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1368 ref_mb_type_16x8_1 = ref_mb_type_16x8_q0;
1369 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1370 mb_type_mask_16x8_1 = mb_type_mask_16x8_0;
1371 }
1372 else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
1373 (i4_mb_quard1_part_x < i4_refarray_wd))
1374 {
1375 // Quard 0 & 1
1376 if(i4_mb_quard1_part_x == 8)
1377 {
1378 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1379 ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1380 }
1381 else if(i4_mb_quard1_part_x < 8)
1382 {
1383 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
1384 mask_16x8_0 =
1385 vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0
1386
1387 ref_mb_type_16x8_0 =
1388 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1389 ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1390 }
1391 else
1392 {
1393 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x - 8));
1394 mask_16x8_0 =
1395 vcleq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0
1396
1397 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1398 ref_mb_type_16x8_1 =
1399 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1400 }
1401
1402 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1403 mb_type_mask_16x8_1 = vceqq_s16(ref_mb_type_16x8_1, one_16x8);
1404 }
1405 else
1406 {
1407 if(i4_mb_quard1_part_x >= i4_refarray_wd)
1408 {
1409 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1410 ref_mb_type_16x8_1 = ref_mb_type_16x8_q0;
1411
1412 ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;
1413 ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q2;
1414 }
1415 else
1416 {
1417 // Quard 0, 1, 2, 3
1418 if(i4_mb_quard1_part_x == 8)
1419 {
1420 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1421 ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1422
1423 ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;
1424 ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q3;
1425 }
1426 else if(i4_mb_quard1_part_x < 8)
1427 {
1428 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
1429 mask_16x8_0 =
1430 vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0
1431
1432 ref_mb_type_16x8_0 =
1433 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1434 ref_mb_type_16x8_1 = ref_mb_type_16x8_q1;
1435
1436 ref_mb_type_16x8_low_0 =
1437 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3);
1438 ref_mb_type_16x8_low_1 = ref_mb_type_16x8_q3;
1439 }
1440 else
1441 {
1442 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x - 8));
1443 mask_16x8_0 =
1444 vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // return 1 if a<b, else 0
1445
1446 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1447 ref_mb_type_16x8_1 =
1448 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1449
1450 ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;
1451 ref_mb_type_16x8_low_1 =
1452 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3);
1453 }
1454 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1455 mb_type_mask_16x8_1 = vceqq_s16(ref_mb_type_16x8_1, one_16x8);
1456
1457 mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8);
1458 mb_type_mask_16x8_low_1 = vceqq_s16(ref_mb_type_16x8_low_1, one_16x8);
1459 }
1460 }
1461
1462 if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
1463 {
1464 for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1465 {
1466 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
1467 inp_data_16x8_0 = vld1q_s16((pi2_ref_data_byte));
1468 inp_data_16x8_1 = vld1q_s16((pi2_ref_data_byte + 8));
1469 if(i4_y < i4_mb_quard1_part_y)
1470 {
1471 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8_0, zero_16x8);
1472 res_16x8_1 = vbslq_s16(mb_type_mask_16x8_1, inp_data_16x8_1, zero_16x8);
1473 }
1474 else
1475 {
1476 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_low_0, inp_data_16x8_0, zero_16x8);
1477 res_16x8_1 = vbslq_s16(mb_type_mask_16x8_low_1, inp_data_16x8_1, zero_16x8);
1478 }
1479 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1480 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1481 vst1q_s16((pi2_ref_array_temp + 8), res_16x8_1);
1482 }
1483 }
1484 else
1485 {
1486 for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1487 {
1488 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
1489 inp_data_16x8_0 = vld1q_s16((pi2_ref_data_byte));
1490 inp_data_16x8_1 = vld1q_s16((pi2_ref_data_byte + 8));
1491
1492 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8_0, zero_16x8);
1493 res_16x8_1 = vbslq_s16(mb_type_mask_16x8_1, inp_data_16x8_1, zero_16x8);
1494
1495 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1496 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1497 vst1q_s16((pi2_ref_array_temp + 8), res_16x8_1);
1498 }
1499 }
1500 }
1501 else
1502 {
1503 WORD16 index_0[8] = {0, 1, 2, 3, 4, 5, 6, 7};
1504 int16x8_t ref_mb_type_16x8_q0, ref_mb_type_16x8_q1, ref_mb_type_16x8_q2,
1505 ref_mb_type_16x8_q3, mb_quard1_part_x_16x8;
1506 int16x8_t ref_mb_type_16x8_0;
1507 int16x8_t ref_mb_type_16x8_low_0;
1508 uint16x8_t mb_type_mask_16x8_0;
1509 uint16x8_t mb_type_mask_16x8_low_0;
1510 uint16x8_t mask_16x8_0;
1511
1512 int16x8_t index_arr_0;
1513 int16x8x2_t inp_data_16x8x2;
1514 int16x8_t inp_data_16x8;
1515 int16x8_t res_16x8_0;
1516 int16x8_t one_16x8 = vdupq_n_s16(1);
1517 int16x8_t zero_16x8 = vdupq_n_s16(0);
1518 index_arr_0 = vld1q_s16(&index_0[0]);
1519
1520 ref_mb_type_16x8_q0 = vdupq_n_s16(i4_ref_mb_type_q0);
1521 ref_mb_type_16x8_q1 = vdupq_n_s16(i4_ref_mb_type_q1);
1522 ref_mb_type_16x8_q2 = vdupq_n_s16(i4_ref_mb_type_q2);
1523 ref_mb_type_16x8_q3 = vdupq_n_s16(i4_ref_mb_type_q3);
1524 if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
1525 {
            // Quadrant 0 only
1527 ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
1528 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1529 }
1530 else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
1531 (i4_mb_quard1_part_x < i4_refarray_wd))
1532 {
            // Quadrants 0 & 1
1534 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
            mask_16x8_0 = vcltq_s16(index_arr_0,
                                    mb_quard1_part_x_16x8); // lane = all-ones (0xFFFF) if a < b, else 0
1537
1538 ref_mb_type_16x8_0 = vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1539 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1540 }
1541 else
1542 {
            if(i4_mb_quard1_part_x >= i4_refarray_wd)
            {
                // NOTE(review): on this path mb_type_mask_16x8_0 and
                // mb_type_mask_16x8_low_0 are never computed (the vceqq_s16 calls
                // appear only in the else branch below), yet both masks are read
                // in the y-loop that follows -- likely uninitialized-register use.
                // The two vceqq_s16 mask computations should probably be hoisted
                // after this if/else so both paths set them. TODO confirm.
                ref_mb_type_16x8_0 = ref_mb_type_16x8_q0;
                ref_mb_type_16x8_low_0 = ref_mb_type_16x8_q2;
            }
1548 else
1549 {
1550 mb_quard1_part_x_16x8 = vdupq_n_s16((i4_mb_quard1_part_x));
                mask_16x8_0 =
                    vcltq_s16(index_arr_0, mb_quard1_part_x_16x8); // lane = all-ones (0xFFFF) if a < b, else 0
1553
1554 ref_mb_type_16x8_0 =
1555 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q0, ref_mb_type_16x8_q1);
1556 ref_mb_type_16x8_low_0 =
1557 vbslq_s16(mask_16x8_0, ref_mb_type_16x8_q2, ref_mb_type_16x8_q3);
1558
1559 mb_type_mask_16x8_0 = vceqq_s16(ref_mb_type_16x8_0, one_16x8);
1560 mb_type_mask_16x8_low_0 = vceqq_s16(ref_mb_type_16x8_low_0, one_16x8);
1561 }
1562 }
1563
1564 if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
1565 {
1566 for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1567 {
1568 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
1569 inp_data_16x8x2 = vld2q_s16((pi2_ref_data_byte));
1570 inp_data_16x8 = inp_data_16x8x2.val[0];
1571
1572 if(i4_y < i4_mb_quard1_part_y)
1573 {
1574 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8, zero_16x8);
1575 }
1576 else
1577 {
1578 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_low_0, inp_data_16x8, zero_16x8);
1579 }
1580 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1581 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1582 }
1583 }
1584 else
1585 {
1586 for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
1587 {
1588 pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
1589 inp_data_16x8x2 = vld2q_s16((pi2_ref_data_byte));
1590 inp_data_16x8 = inp_data_16x8x2.val[0];
1591
1592 res_16x8_0 = vbslq_s16(mb_type_mask_16x8_0, inp_data_16x8, zero_16x8);
1593
1594 pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
1595 vst1q_s16((pi2_ref_array_temp), res_16x8_0);
1596 }
1597 }
1598 }
1599 }
1600