xref: /aosp_15_r20/external/libavc/decoder/arm/svc/isvcd_intra_resamp_neon.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21  *******************************************************************************
22  * @file
 23  *  isvcd_intra_resamp_neon.c
24  *
25  * @brief
26  *  Contains routines that resample for SVC resampling
27  *
28  * @author
29  *  Kishore
30  *
31  * @par List of Functions:
32  *  - isvcd_interpolate_base_luma_dyadic_neonintr()
33  *  - isvcd_interpolate_intra_base_neonintr()
34  *  - isvcd_horz_interpol_chroma_dyadic_1_neonintr()
35  *  - isvcd_horz_interpol_chroma_dyadic_2_neonintr()
36  *  - isvcd_vert_interpol_chroma_dyadic_1_neonintr()
37  *  - isvcd_vert_interpol_chroma_dyadic_2_neonintr()
38  *  - isvcd_vert_interpol_chroma_dyadic_3_neonintr()
39  *
40  * @remarks
41  *  None
42  *
43  *******************************************************************************
44  */
45 #include <assert.h>
46 #include <string.h>
47 #include <arm_neon.h>
48 
49 #include "ih264_typedefs.h"
50 #include "ih264_macros.h"
51 #include "ih264_platform_macros.h"
52 #include "isvcd_structs.h"
53 #include "ih264_debug.h"
54 
55 /*****************************************************************************/
56 /*                                                                           */
57 /*  Function Name : isvcd_interpolate_base_luma_dyadic_neonintr               */
58 /*                                                                           */
59 /*  Description   : This function takes the reference array buffer & performs*/
60 /*                  intra resampling for dyadic scaling ratios               */
61 /*  Inputs        : pu1_inp_buf : ptr to the 12x12 reference sample buffer   */
62 /*                    pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the */
63 /*                        vertically interpolated data                       */
64 /*                  pu1_out_buf : output buffer pointer                      */
65 /*                  i4_out_stride : output buffer stride                     */
66 /*  Globals       : none                                                     */
67 /*  Processing    : it does the interpolation in vertical direction followed */
68 /*                  by horizontal direction                                  */
69 /*  Outputs       : resampled pixels                                         */
70 /*  Returns       : none                                                     */
71 /*                                                                           */
72 /*  Issues        : none                                                     */
73 /*                                                                           */
74 /*  Revision History:                                                        */
75 /*                                                                           */
76 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
77 /*         05 21 2021   Dolan          creation                              */
78 /*                                                                           */
79 /*****************************************************************************/
/*
 * NOTE(review): this listing is an OpenGrok xref dump; each code line still
 * carries its original source line number ("80 ", "81 ", ...) and the first
 * line carries the xref definition anchor. Those tokens are preserved
 * byte-for-byte below; only comments were added or improved.
 *
 * Dyadic (2x) intra upsampling of a 12x12 luma reference block to 16x16:
 *   Pass 1 (vertical):   12x12 u8 input -> 12-wide x 16-row s16 intermediate,
 *                        using the 4-tap filter {c0,c1,c2,c3} = {-3,28,8,-1};
 *                        the two half-pel phases use the taps in mirrored
 *                        order, so each source-row step emits two output rows.
 *   Pass 2 (horizontal): s16 intermediate -> 16x16 u8 output, same mirrored
 *                        tap pairs per column, rounded with (+512) >> 10
 *                        (two passes each scale by sum|taps| = 32, 32*32 =
 *                        1024) and saturating-narrowed to 8 bits.
 */
isvcd_interpolate_base_luma_dyadic_neonintr(UWORD8 * pu1_inp_buf,WORD16 * pi2_tmp_filt_buf,UWORD8 * pu1_out_buf,WORD32 i4_out_stride)80 void isvcd_interpolate_base_luma_dyadic_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
81                                                  UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
82 {
83     WORD32 i4_y;
84     WORD16 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
85     WORD32 i4_filt_stride, i4_src_stride;
86     UWORD8 *pu1_inp, *pu1_out;
87     WORD16 *pi2_tmp;
88 
89     int16x4_t i4_rslt_vert_16x4_1, i4_rslt_vert_16x4_2;
90     uint8x8_t i4_samp_vert_8x8_0, i4_samp_vert_8x8_1, i4_samp_vert_8x8_2, i4_samp_vert_8x8_3;
91     int16x8_t i4_rslt_vert_16x8_0, i4_rslt_vert_16x8_2;
92     /* Horizontal interpolation */
93     int32x4_t const_512_32x4 = vdupq_n_s32(512);  /* rounding bias for the final >> 10 */
94     int32x4_t i4_rslt_horz_r0_1, i4_rslt_horz_r1_1, i4_rslt_horz_r0_2, i4_rslt_horz_r1_2;
95     uint16x4_t i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp, i4_rslt_horz_r0_2_tmp,
96         i4_rslt_horz_r1_2_tmp;
97     uint16x8_t rslt_16x8_t_1, rslt_16x8_t_2;
98     int32x4x2_t i4_rslt_horz_32x4x2_t;
99     int16x4_t i4_samp_horz_16x4_0, i4_samp_horz_16x4_1, i4_samp_horz_16x4_2, i4_samp_horz_16x4_3,
100         i4_samp_horz_16x4_4;
101     int16x4_t i4_samp_horz_16x4_5, i4_samp_horz_16x4_6, i4_samp_horz_16x4_7, i4_samp_horz_16x4_8;
102     int16_t i4_coeff_c0 = -3;
103     int16_t i4_coeff_c1 = 28;
104     int16_t i4_coeff_c2 = 8;
105     int16_t i4_coeff_c3 = -1;
106     int32x4_t i4_rslt_horz_r0_1_tmp32, i4_rslt_horz_r1_1_tmp32, i4_rslt_horz_r0_2_tmp32,
107         i4_rslt_horz_r1_2_tmp32;
108 
109     /* Filter coefficient values for phase 4 */
110     i4_coeff_0 = -3;
111     i4_coeff_1 = 28;
112     i4_coeff_2 = 8;
113     i4_coeff_3 = -1;
114     i4_filt_stride = 12;
115     i4_src_stride = DYADIC_REF_W_Y;
116 
117     pu1_inp = pu1_inp_buf;
118     pi2_tmp = pi2_tmp_filt_buf;
119     pu1_out = pu1_out_buf;
120 
121     /* Vertical interpolation */
122     // First 64 bits
       /* Columns 0..7: prime the 4-tap window with the first four source rows. */
123     i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp);
124     pu1_inp += i4_src_stride;
125     i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp);
126     pu1_inp += i4_src_stride;
127     i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp);
128     pu1_inp += i4_src_stride;
129     i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
130     pu1_inp += i4_src_stride;
131 
       /* First output row (y = 0) uses the mirrored tap order c3,c2,c1,c0. */
132     i4_rslt_vert_16x8_0 =
133         vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
134     i4_rslt_vert_16x8_0 = vmlaq_n_s16(
135         i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
136     i4_rslt_vert_16x8_0 = vmlaq_n_s16(
137         i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
138     i4_rslt_vert_16x8_0 = vmlaq_n_s16(
139         i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);
140 
141     vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_0);
142     pi2_tmp += i4_filt_stride;
143 
       /* Each iteration slides the window down one source row and emits the
        * two phase outputs (rows y and y+1) for that position. */
144     for(i4_y = 1; i4_y < 15; i4_y += 2)
145     {
146         i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
147         i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
148         i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
149         i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
150 
            /* Phase A: taps in order c0,c1,c2,c3 */
151         i4_rslt_vert_16x8_0 =
152             vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
153         i4_rslt_vert_16x8_0 = vmlaq_n_s16(
154             i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
155         i4_rslt_vert_16x8_0 = vmlaq_n_s16(
156             i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
157         i4_rslt_vert_16x8_0 = vmlaq_n_s16(
158             i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);
159 
            /* Phase B: mirrored taps c3,c2,c1,c0 */
160         i4_rslt_vert_16x8_2 =
161             vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_3);
162         i4_rslt_vert_16x8_2 = vmlaq_n_s16(
163             i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_2);
164         i4_rslt_vert_16x8_2 = vmlaq_n_s16(
165             i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_1);
166         i4_rslt_vert_16x8_2 = vmlaq_n_s16(
167             i4_rslt_vert_16x8_2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_0);
168 
169         /* Storing the results */
170         vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
171         pi2_tmp += i4_filt_stride;
172         vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_2));
173         pi2_tmp += i4_filt_stride;
174         pu1_inp += i4_src_stride;
175     } /*End of Loop over y*/
176 
177     /* y = 15, y_phase = 4 */
178     i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
179     i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
180     i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
181     i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
182 
183     i4_rslt_vert_16x8_0 =
184         vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_0)), i4_coeff_0);
185     i4_rslt_vert_16x8_0 = vmlaq_n_s16(
186         i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_1)), i4_coeff_1);
187     i4_rslt_vert_16x8_0 = vmlaq_n_s16(
188         i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_2)), i4_coeff_2);
189     i4_rslt_vert_16x8_0 = vmlaq_n_s16(
190         i4_rslt_vert_16x8_0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_3)), i4_coeff_3);
191 
192     vst1q_s16(pi2_tmp, (i4_rslt_vert_16x8_0));
193     /* End of loop over x */
194 
195     // Remaining 32 bits
       /* Columns 8..11: repeat the vertical pass for the last 4 columns
        * (only the low half of each 8-lane load is meaningful here). */
196     pu1_inp = pu1_inp_buf + 8;
197     pi2_tmp = pi2_tmp_filt_buf + 8;
198 
199     i4_samp_vert_8x8_0 = vld1_u8((const uint8_t *) pu1_inp);
200     pu1_inp += i4_src_stride;
201     i4_samp_vert_8x8_1 = vld1_u8((const uint8_t *) pu1_inp);
202     pu1_inp += i4_src_stride;
203     i4_samp_vert_8x8_2 = vld1_u8((const uint8_t *) pu1_inp);
204     pu1_inp += i4_src_stride;
205     i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
206     pu1_inp += i4_src_stride;
207 
208     i4_rslt_vert_16x4_1 =
209         vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
210     i4_rslt_vert_16x4_1 =
211         vmla_n_s16(i4_rslt_vert_16x4_1,
212                    vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_2);
213     i4_rslt_vert_16x4_1 =
214         vmla_n_s16(i4_rslt_vert_16x4_1,
215                    vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_1);
216     i4_rslt_vert_16x4_1 =
217         vmla_n_s16(i4_rslt_vert_16x4_1,
218                    vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_0);
219 
220     vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
221     pi2_tmp += i4_filt_stride;
222 
223     for(i4_y = 1; i4_y < 15; i4_y += 2)
224     {
225         i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
226         i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
227         i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
228         i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
229 
230         i4_rslt_vert_16x4_1 = vmul_n_s16(
231             vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
232         i4_rslt_vert_16x4_1 = vmla_n_s16(
233             i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
234             i4_coeff_1);
235         i4_rslt_vert_16x4_1 = vmla_n_s16(
236             i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
237             i4_coeff_2);
238         i4_rslt_vert_16x4_1 = vmla_n_s16(
239             i4_rslt_vert_16x4_1, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
240             i4_coeff_3);
241 
242         i4_rslt_vert_16x4_2 = vmul_n_s16(
243             vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_3);
244         i4_rslt_vert_16x4_2 = vmla_n_s16(
245             i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))),
246             i4_coeff_2);
247         i4_rslt_vert_16x4_2 = vmla_n_s16(
248             i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))),
249             i4_coeff_1);
250         i4_rslt_vert_16x4_2 = vmla_n_s16(
251             i4_rslt_vert_16x4_2, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))),
252             i4_coeff_0);
253 
254         vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
255         pi2_tmp += i4_filt_stride;
256         vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_2));
257         pi2_tmp += i4_filt_stride;
258         pu1_inp += i4_src_stride;
259     }
260 
       /* Last output row (y = 15) for columns 8..11. */
261     i4_samp_vert_8x8_0 = i4_samp_vert_8x8_1;
262     i4_samp_vert_8x8_1 = i4_samp_vert_8x8_2;
263     i4_samp_vert_8x8_2 = i4_samp_vert_8x8_3;
264     i4_samp_vert_8x8_3 = vld1_u8((const uint8_t *) pu1_inp);
265 
266     i4_rslt_vert_16x4_1 =
267         vmul_n_s16(vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_0))), i4_coeff_0);
268     i4_rslt_vert_16x4_1 =
269         vmla_n_s16(i4_rslt_vert_16x4_1,
270                    vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_1))), i4_coeff_1);
271     i4_rslt_vert_16x4_1 =
272         vmla_n_s16(i4_rslt_vert_16x4_1,
273                    vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_2))), i4_coeff_2);
274     i4_rslt_vert_16x4_1 =
275         vmla_n_s16(i4_rslt_vert_16x4_1,
276                    vreinterpret_s16_u16(vget_low_u16(vmovl_u8(i4_samp_vert_8x8_3))), i4_coeff_3);
277 
278     vst1_s16(pi2_tmp, (i4_rslt_vert_16x4_1));
279     /* Reinitializing the ptrs */
280     pu1_inp = pu1_inp_buf;
281     pi2_tmp = pi2_tmp_filt_buf;
282 
283     /* Horizontal interpolation */
       /* For each of the 16 rows: load nine overlapping 4-lane windows of the
        * s16 intermediate, compute even-phase (r0) and odd-phase (r1) column
        * results for output columns 0..7 (_1) and 8..15 (_2), then zip the
        * phases back into pixel order before rounding/narrowing. */
284     for(i4_y = 0; i4_y < 16; i4_y++)
285     {
286         i4_samp_horz_16x4_0 = vld1_s16(pi2_tmp);
287         i4_samp_horz_16x4_1 = vld1_s16(pi2_tmp + 1);
288         i4_samp_horz_16x4_2 = vld1_s16(pi2_tmp + 2);
289         i4_samp_horz_16x4_3 = vld1_s16(pi2_tmp + 3);
290         i4_samp_horz_16x4_4 = vld1_s16(pi2_tmp + 4);
291         i4_samp_horz_16x4_5 = vld1_s16(pi2_tmp + 5);
292         i4_samp_horz_16x4_6 = vld1_s16(pi2_tmp + 6);
293         i4_samp_horz_16x4_7 = vld1_s16(pi2_tmp + 7);
294         i4_samp_horz_16x4_8 = vld1_s16(pi2_tmp + 8);
295 
296         i4_rslt_horz_r0_1 = vmull_n_s16(i4_samp_horz_16x4_0, i4_coeff_c3);  // a0*c3 a1*c3 a2*c3 a3*c3
297         i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_1,
298                                         i4_coeff_c2);  // += a1*c2 a2*c2 a3*c2 a4*c2
299         i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_2, i4_coeff_c1);
300         i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_3, i4_coeff_c0);
301 
302         i4_rslt_horz_r1_1 = vmull_n_s16(i4_samp_horz_16x4_1, i4_coeff_c0);  // a1*c0 a2*c0 a3*c0 a4*c0
303         i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_2,
304                                         i4_coeff_c1);  // += a2*c1 a3*c1 a4*c1 a5*c1
305         i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_3, i4_coeff_c2);
306         i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_4, i4_coeff_c3);
307 
308         i4_rslt_horz_r0_2 = vmull_n_s16(i4_samp_horz_16x4_4, i4_coeff_c3);  // a4*c3 a5*c3 a6*c3 a7*c3
309         i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_5,
310                                         i4_coeff_c2);  // += a5*c2 a6*c2 a7*c2 a8*c2
311         i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_6, i4_coeff_c1);
312         i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_7, i4_coeff_c0);
313 
314         i4_rslt_horz_r1_2 = vmull_n_s16(i4_samp_horz_16x4_5, i4_coeff_c0);  // a5*c0 a6*c0 a7*c0 a8*c0
315         i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_6,
316                                         i4_coeff_c1);  // += a6*c1 a7*c1 a8*c1 a9*c1
317         i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_7, i4_coeff_c2);
318         i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_8, i4_coeff_c3);
319 
            /* Interleave even-phase and odd-phase lanes back into pixel order. */
320         i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_1, i4_rslt_horz_r1_1);
321         i4_rslt_horz_r0_1_tmp32 = i4_rslt_horz_32x4x2_t.val[0];  // 0 to 3
322         i4_rslt_horz_r1_1_tmp32 = i4_rslt_horz_32x4x2_t.val[1];  // 4 to 7
323 
324         i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_2, i4_rslt_horz_r1_2);
325         i4_rslt_horz_r0_2_tmp32 = i4_rslt_horz_32x4x2_t.val[0];  // 8 to 11
326         i4_rslt_horz_r1_2_tmp32 = i4_rslt_horz_32x4x2_t.val[1];  // 12 to 15
327 
            /* (sum + 512) >> 10: combined gain of vertical+horizontal passes. */
328         i4_rslt_horz_r0_1 = vaddq_s32(i4_rslt_horz_r0_1_tmp32, const_512_32x4);
329         i4_rslt_horz_r1_1 = vaddq_s32(i4_rslt_horz_r1_1_tmp32, const_512_32x4);
330         i4_rslt_horz_r0_2 = vaddq_s32(i4_rslt_horz_r0_2_tmp32, const_512_32x4);
331         i4_rslt_horz_r1_2 = vaddq_s32(i4_rslt_horz_r1_2_tmp32, const_512_32x4);
332 
            /* Saturating shift-right-narrow to u16, then narrow to u8 on store. */
333         i4_rslt_horz_r0_1_tmp = vqshrun_n_s32(i4_rslt_horz_r0_1, 10);
334         i4_rslt_horz_r1_1_tmp = vqshrun_n_s32(i4_rslt_horz_r1_1, 10);
335 
336         i4_rslt_horz_r0_2_tmp = vqshrun_n_s32(i4_rslt_horz_r0_2, 10);
337         i4_rslt_horz_r1_2_tmp = vqshrun_n_s32(i4_rslt_horz_r1_2, 10);
338 
339         rslt_16x8_t_1 = vcombine_u16(i4_rslt_horz_r0_1_tmp, i4_rslt_horz_r1_1_tmp);  // 0 to 7
340         rslt_16x8_t_2 = vcombine_u16(i4_rslt_horz_r0_2_tmp, i4_rslt_horz_r1_2_tmp);  // 8 to 15
341 
342         vst1_u8(pu1_out, vqmovn_u16(rslt_16x8_t_1));
343         vst1_u8(pu1_out + 8, vqmovn_u16(rslt_16x8_t_2));
344 
345         pu1_out += i4_out_stride;
346         pi2_tmp += i4_filt_stride;
347     }
348 }
349 
350 /*****************************************************************************/
351 /*                                                                           */
352 /*  Function Name : isvcd_interpolate_intra_base_neonintr                     */
353 /*                                                                           */
354 /*  Description   : This function takes the reference array buffer & performs*/
355 /*                    interpolation of a component to find the intra         */
356 /*                     resampled value                                       */
357 /*  Inputs        : pv_intra_samp_ctxt : intra sampling context              */
358 /*                  pu1_out : output buffer pointer                          */
359 /*                  i4_out_stride : output buffer stride                     */
360 /*                  i4_refarray_wd : reference array width                   */
361 /*                  i4_x_offset : offset in reference layer in horz direction*/
362 /*                  ps_coord : current mb co-ordinate                        */
363 /*                  i4_chroma_flag : chroma processing flag                  */
364 /*  Globals       : none                                                     */
365 /*  Processing    : it does the interpolation in vertical direction followed */
366 /*                  by horizontal direction                                  */
367 /*  Outputs       : resampled pixels                                         */
368 /*  Returns       : none                                                     */
369 /*                                                                           */
370 /*  Issues        : none                                                     */
371 /*                                                                           */
372 /*  Revision History:                                                        */
373 /*                                                                           */
374 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
375 /*         26 06 2009   vijayakumar          creation                        */
376 /*                                                                           */
377 /*****************************************************************************/
isvcd_interpolate_intra_base_neonintr(void * pv_intra_samp_ctxt,UWORD8 * pu1_out,WORD32 i4_out_stride,WORD32 i4_refarray_wd,WORD32 i4_mb_x,WORD32 i4_mb_y,WORD32 i4_chroma_flag,WORD32 i4_refarray_flag)378 void isvcd_interpolate_intra_base_neonintr(void *pv_intra_samp_ctxt, UWORD8 *pu1_out,
379                                            WORD32 i4_out_stride, WORD32 i4_refarray_wd,
380                                            WORD32 i4_mb_x, WORD32 i4_mb_y, WORD32 i4_chroma_flag,
381                                            WORD32 i4_refarray_flag)
382 {
383     /* --------------------------------------------------------------------- */
384     /* Index Parameters                                                      */
385     /* --------------------------------------------------------------------- */
386     intra_sampling_ctxt_t *ps_ctxt;
387     intra_samp_map_ctxt_t *ps_map_ctxt;
388     intra_samp_lyr_ctxt *ps_lyr_ctxt;
389     WORD32 i4_x, i4_y;
390     WORD32 i4_frm_mb_x, i4_frm_mb_y;
391     UWORD8 *pu1_refarray = NULL;
392     ref_pixel_map_t *ps_x_pos_phase;
393     ref_pixel_map_t *ps_y_pos_phase;
394     WORD32 i4_temp_array_ht;
395     WORD32 *pi4_interp_buff;
396 
397     UWORD8 arr_y_ref_pos_luma[16] = {0};
398     UWORD8 arr_x_ref_pos_luma[16] = {0};
399     UWORD8 arr_x_ref_pos_luma_low[16] = {0};
400     UWORD8 arr_x_ref_pos_luma_high[16] = {0};
401     UWORD8 arr_phase_luma[16] = {0};
402     UWORD8 *pi4_y_ref_pos_luma;
403     UWORD8 *pi4_x_ref_pos_luma_low;
404     UWORD8 *pi4_x_ref_pos_luma_high;
405     UWORD8 *pi4_phase_luma;
406     WORD16 *pi2_interp_buff_temp;
407     WORD32 i4_mb_wd;
408     WORD32 i4_mb_ht;
409     WORD32 i4_x_min;
410     ref_min_max_map_t *ps_x_min_max;
411     UWORD8 *pu1_refarray_temp;
412 
413     ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt;
414     ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
415 
416     if(0 == i4_refarray_flag)
417     {
418         pu1_refarray = ps_ctxt->pu1_refarray_buffer;
419     }
420     else if(1 == i4_refarray_flag)
421     {
422         pu1_refarray = ps_ctxt->pu1_refarray_cb;
423     }
424 
425     /* --------------------------------------------------------------------- */
426     /* LUMA or CHROMA                                                        */
427     /* --------------------------------------------------------------------- */
428 
429     if(1 == i4_chroma_flag)
430         ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt);
431     else
432         ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt);
433 
434     i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
435     i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;
436 
437     ps_x_min_max = ps_map_ctxt->ps_x_min_max;
438 
439     i4_frm_mb_y = i4_mb_y * i4_mb_ht;
440     i4_frm_mb_x = i4_mb_x * i4_mb_wd;
441     /* get the min and max positions */
442     i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos;
443 
444     /* --------------------------------------------------------------------- */
445     /* Projected frame level pointers                                        */
446     /* --------------------------------------------------------------------- */
447     ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
448     ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;
449 
450     /* --------------------------------------------------------------------- */
451     /* Pointers and Dimenstion of the temporary buffer                       */
452     /* --------------------------------------------------------------------- */
453     i4_temp_array_ht = i4_mb_ht;
454     pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer;
455     pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
456 
457     /* --------------------------------------------------------------------- */
458     /* Loop for interpolation in vertical direction                          */
459     /* --------------------------------------------------------------------- */
460     if(i4_chroma_flag == 0)
461     {
462         {
463             uint8x8_t inp_8x8_r0, inp_8x8_r0_1;
464             uint8x8_t inp_8x8_r1, inp_8x8_r1_1;
465             uint8x8_t inp_8x8_r2, inp_8x8_r2_1;
466             uint8x8_t inp_8x8_r3, inp_8x8_r3_1;
467             int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1;
468 
469             for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
470             {
471                 arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
472                 arr_y_ref_pos_luma[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
473             }
474             pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
475             pi4_phase_luma = arr_phase_luma;
476 
477             for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
478             {
479                 pu1_refarray_temp =
480                     pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
481                 inp_8x8_r0 = vld1_u8((pu1_refarray_temp - i4_refarray_wd));
482                 inp_8x8_r1 = vld1_u8((pu1_refarray_temp));
483                 inp_8x8_r2 = vld1_u8((pu1_refarray_temp + i4_refarray_wd));
484                 inp_8x8_r3 = vld1_u8((pu1_refarray_temp + 2 * i4_refarray_wd));
485 
486                 inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8 - i4_refarray_wd));
487                 inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8));
488                 inp_8x8_r2_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd));
489                 inp_8x8_r3_1 = vld1_u8((pu1_refarray_temp + 8 + 2 * i4_refarray_wd));
490 
491                 out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)),
492                                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
493                 out_res_16x8_r0_0 =
494                     vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)),
495                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
496                 out_res_16x8_r0_0 =
497                     vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2)),
498                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
499                 out_res_16x8_r0_0 =
500                     vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3)),
501                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);
502 
503                 out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)),
504                                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
505                 out_res_16x8_r0_1 =
506                     vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)),
507                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
508                 out_res_16x8_r0_1 =
509                     vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2_1)),
510                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
511                 out_res_16x8_r0_1 =
512                     vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3_1)),
513                                 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);
514 
515                 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
516                           out_res_16x8_r0_0);
517                 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8),
518                           out_res_16x8_r0_1);
519             }
520         }
521         /*Horizontal Interpolation*/
522         {
523             WORD32 strt_indx = 10;
524 
525             uint8x16_t phs_mask_8x8_0;
526             uint8x16_t x_ref_pos_luma_mask_r0_0;
527             uint8x16_t x_ref_pos_luma_mask_r0_1;
528             uint8x16_t x_ref_pos_luma_mask_r1_0;
529             uint8x16_t x_ref_pos_luma_mask_r1_1;
530             uint8x16_t x_ref_pos_luma_mask_r2_0;
531             uint8x16_t x_ref_pos_luma_mask_r2_1;
532             uint8x16_t x_ref_pos_luma_mask_r3_0;
533             uint8x16_t x_ref_pos_luma_mask_r3_1;
534 
535             WORD32 strt_indx_h = 0, i4_x2 = 0;
536             WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1);
537             uint8x16_t twos = vdupq_n_u8(2);
538             strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1;
539             strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1);
540             for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
541             {
542                 arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
543                 arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
544                 arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1;
545             }
546 
547             for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++)
548             {
549                 i4_x2 = i4_x << 1;
550                 arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
551                 arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
552             }
553             for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++)
554             {
555                 i4_x2 = (i4_x - i4_mb_wd_hlf) << 1;
556                 arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1);
557                 arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1;
558             }
559             pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
560             pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high;
561             pi4_phase_luma = arr_phase_luma;
562 
563             phs_mask_8x8_0 = vld1q_u8((const uint8_t *) pi4_phase_luma);
564 
565             x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low);
566             x_ref_pos_luma_mask_r0_1 = vld1q_u8(pi4_x_ref_pos_luma_high);
567             x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos);
568             x_ref_pos_luma_mask_r1_1 = vaddq_u8(x_ref_pos_luma_mask_r0_1, twos);
569             x_ref_pos_luma_mask_r2_0 = vaddq_u8(x_ref_pos_luma_mask_r1_0, twos);
570             x_ref_pos_luma_mask_r2_1 = vaddq_u8(x_ref_pos_luma_mask_r1_1, twos);
571             x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0;
572             x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1;
573 
574             {
575                 int8x16_t ip_filt_8x16_r0;
576                 int8x16_t ip_filt_8x16_r1;
577                 int8x16_t ip_filt_8x16_r2;
578                 int8x16_t ip_filt_8x16_r3;
579 
580                 int16x8_t ip_filt_16x8_r0_0, ip_filt_16x8_r0_1;
581                 int16x8_t ip_filt_16x8_r1_0, ip_filt_16x8_r1_1;
582                 int16x8_t ip_filt_16x8_r2_0, ip_filt_16x8_r2_1;
583                 int16x8_t ip_filt_16x8_r3_0, ip_filt_16x8_r3_1;
584 
585                 int16x8_t inp_16x8_0;
586                 int16x8_t inp_16x8_1;
587                 int16x8_t inp_16x8_2;
588                 int16x8_t inp_16x8_3;
589 
590                 int16x8_t inp_16x8_r0_0, inp_16x8_r2_0;
591                 int16x8_t inp_16x8_r0_1, inp_16x8_r2_1;
592                 int16x8_t inp_16x8_r1_0, inp_16x8_r3_0;
593                 int16x8_t inp_16x8_r1_1, inp_16x8_r3_1;
594 
595                 int16x4_t inp_16x4_r0_0, inp_16x4_r2_0;
596                 int16x4_t inp_16x4_r0_1, inp_16x4_r2_1;
597                 int16x4_t inp_16x4_r1_0, inp_16x4_r3_0;
598                 int16x4_t inp_16x4_r1_1, inp_16x4_r3_1;
599 
600                 int32x4_t out_res_32x4_r0_l_0;
601                 int32x4_t out_res_32x4_r0_l_1;
602                 int32x4_t out_res_32x4_r0_h_0;
603                 int32x4_t out_res_32x4_r0_h_1;
604 
605                 uint16x4_t out_res_16x4_r0_l_0;
606                 uint16x4_t out_res_16x4_r0_l_1;
607                 uint16x4_t out_res_16x4_r0_h_0;
608                 uint16x4_t out_res_16x4_r0_h_1;
609 
610                 uint8x8_t out_res_8x8_r0_l, out_res_8x8_r0_h;
611                 uint8x8x2_t u1_temp_8x8x2_t;
612                 uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
613 
614                 ip_filt_8x16_r0 = vld1q_s8((g_ai1_interp_filter_luma));
615                 ip_filt_8x16_r1 = vld1q_s8((g_ai1_interp_filter_luma + 16));
616                 ip_filt_8x16_r2 = vld1q_s8((g_ai1_interp_filter_luma + 32));
617                 ip_filt_8x16_r3 = vld1q_s8((g_ai1_interp_filter_luma + 48));
618 
619                 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r0));
620                 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r0));
621                 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
622                 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
623                 ip_filt_8x16_r0 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
624                                               vreinterpret_s8_u8(u1_temp_8x8_t1));
625 
626                 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r1));
627                 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r1));
628                 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
629                 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
630                 ip_filt_8x16_r1 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
631                                               vreinterpret_s8_u8(u1_temp_8x8_t1));
632                 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r2));
633                 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r2));
634                 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
635                 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
636                 ip_filt_8x16_r2 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
637                                               vreinterpret_s8_u8(u1_temp_8x8_t1));
638                 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r3));
639                 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r3));
640                 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
641                 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
642                 ip_filt_8x16_r3 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
643                                               vreinterpret_s8_u8(u1_temp_8x8_t1));
644                 ip_filt_16x8_r0_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r0));
645                 ip_filt_16x8_r1_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r1));
646                 ip_filt_16x8_r2_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r2));
647                 ip_filt_16x8_r3_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r3));
648                 ip_filt_16x8_r0_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r0));
649                 ip_filt_16x8_r1_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r1));
650                 ip_filt_16x8_r2_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r2));
651                 ip_filt_16x8_r3_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r3));
652 
653                 for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
654                 {
655                     inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx));
656                     inp_16x8_1 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h));
657                     inp_16x8_2 = vld1q_s16((pi2_interp_buff_temp + strt_indx + 3));
658                     inp_16x8_3 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h + 3));
659                     pi2_interp_buff_temp += i4_refarray_wd;
660                     u1_temp_8x8x2_t.val[0] =
661                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
662                     u1_temp_8x8x2_t.val[1] =
663                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
664                     u1_temp_8x8_t0 =
665                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0));
666                     u1_temp_8x8_t1 =
667                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0));
668                     inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
669                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
670 
671                     u1_temp_8x8x2_t.val[0] =
672                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
673                     u1_temp_8x8x2_t.val[1] =
674                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
675                     u1_temp_8x8_t0 =
676                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_1));
677                     u1_temp_8x8_t1 =
678                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_1));
679                     inp_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8(
680                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
681 
682                     u1_temp_8x8x2_t.val[0] =
683                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
684                     u1_temp_8x8x2_t.val[1] =
685                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
686                     u1_temp_8x8_t0 =
687                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0));
688                     u1_temp_8x8_t1 =
689                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0));
690                     inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
691                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
692 
693                     u1_temp_8x8x2_t.val[0] =
694                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
695                     u1_temp_8x8x2_t.val[1] =
696                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
697                     u1_temp_8x8_t0 =
698                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_1));
699                     u1_temp_8x8_t1 =
700                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_1));
701                     inp_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8(
702                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
703 
704                     u1_temp_8x8x2_t.val[0] =
705                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
706                     u1_temp_8x8x2_t.val[1] =
707                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
708                     u1_temp_8x8_t0 =
709                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_0));
710                     u1_temp_8x8_t1 =
711                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_0));
712                     inp_16x8_r2_0 = vreinterpretq_s16_s8(vcombine_s8(
713                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
714 
715                     u1_temp_8x8x2_t.val[0] =
716                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
717                     u1_temp_8x8x2_t.val[1] =
718                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
719                     u1_temp_8x8_t0 =
720                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_1));
721                     u1_temp_8x8_t1 =
722                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_1));
723                     inp_16x8_r2_1 = vreinterpretq_s16_s8(vcombine_s8(
724                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
725 
726                     u1_temp_8x8x2_t.val[0] =
727                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_2)));
728                     u1_temp_8x8x2_t.val[1] =
729                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_2)));
730                     u1_temp_8x8_t0 =
731                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_0));
732                     u1_temp_8x8_t1 =
733                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_0));
734                     inp_16x8_r3_0 = vreinterpretq_s16_s8(vcombine_s8(
735                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
736 
737                     u1_temp_8x8x2_t.val[0] =
738                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_3)));
739                     u1_temp_8x8x2_t.val[1] =
740                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_3)));
741                     u1_temp_8x8_t0 =
742                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_1));
743                     u1_temp_8x8_t1 =
744                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_1));
745                     inp_16x8_r3_1 = vreinterpretq_s16_s8(vcombine_s8(
746                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
747 
748                     inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0);
749                     inp_16x4_r0_1 = vget_low_s16(inp_16x8_r0_1);
750                     inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0);
751                     inp_16x4_r1_1 = vget_low_s16(inp_16x8_r1_1);
752 
753                     inp_16x4_r2_0 = vget_low_s16(inp_16x8_r2_0);
754                     inp_16x4_r2_1 = vget_low_s16(inp_16x8_r2_1);
755                     inp_16x4_r3_0 = vget_low_s16(inp_16x8_r3_0);
756                     inp_16x4_r3_1 = vget_low_s16(inp_16x8_r3_1);
757 
758                     out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0));
759                     out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0,
760                                                     vget_low_s16(ip_filt_16x8_r1_0));
761                     out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r2_0,
762                                                     vget_low_s16(ip_filt_16x8_r2_0));
763                     out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r3_0,
764                                                     vget_low_s16(ip_filt_16x8_r3_0));
765                     out_res_32x4_r0_l_1 =
766                         vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0));
767                     out_res_32x4_r0_l_1 =
768                         vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0),
769                                   vget_high_s16(ip_filt_16x8_r1_0));
770                     out_res_32x4_r0_l_1 =
771                         vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r2_0),
772                                   vget_high_s16(ip_filt_16x8_r2_0));
773                     out_res_32x4_r0_l_1 =
774                         vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r3_0),
775                                   vget_high_s16(ip_filt_16x8_r3_0));
776 
777                     out_res_32x4_r0_h_0 = vmull_s16(inp_16x4_r0_1, vget_low_s16(ip_filt_16x8_r0_1));
778                     out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r1_1,
779                                                     vget_low_s16(ip_filt_16x8_r1_1));
780                     out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r2_1,
781                                                     vget_low_s16(ip_filt_16x8_r2_1));
782                     out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r3_1,
783                                                     vget_low_s16(ip_filt_16x8_r3_1));
784 
785                     out_res_32x4_r0_h_1 =
786                         vmull_s16(vget_high_s16(inp_16x8_r0_1), vget_high_s16(ip_filt_16x8_r0_1));
787                     out_res_32x4_r0_h_1 =
788                         vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r1_1),
789                                   vget_high_s16(ip_filt_16x8_r1_1));
790                     out_res_32x4_r0_h_1 =
791                         vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r2_1),
792                                   vget_high_s16(ip_filt_16x8_r2_1));
793                     out_res_32x4_r0_h_1 =
794                         vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r3_1),
795                                   vget_high_s16(ip_filt_16x8_r3_1));
796 
797                     out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10);
798                     out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10);
799                     out_res_16x4_r0_h_0 = vqrshrun_n_s32(out_res_32x4_r0_h_0, 10);
800                     out_res_16x4_r0_h_1 = vqrshrun_n_s32(out_res_32x4_r0_h_1, 10);
801 
802                     out_res_8x8_r0_l =
803                         vqmovn_u16(vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1));
804                     out_res_8x8_r0_h =
805                         vqmovn_u16(vcombine_u16(out_res_16x4_r0_h_0, out_res_16x4_r0_h_1));
806                     vst1q_u8((pu1_out + (i4_y * i4_out_stride)),
807                              vcombine_u8(out_res_8x8_r0_l, out_res_8x8_r0_h));
808                 }
809             }
810         }
811     }
812     else
813     {
814         for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
815         {
816             arr_y_ref_pos_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos;
817             arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
818         }
819         pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
820         pi4_phase_luma = arr_phase_luma;
821 
822         {
823             uint8x8_t inp_8x8_r0, inp_8x8_r0_1;
824             uint8x8_t inp_8x8_r1, inp_8x8_r1_1;
825             int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1;
826 
827             for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
828             {
829                 pu1_refarray_temp =
830                     pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
831                 inp_8x8_r0 = vld1_u8((pu1_refarray_temp));
832                 inp_8x8_r1 = vld1_u8((pu1_refarray_temp + i4_refarray_wd));
833 
834                 inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8));
835                 inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd));
836 
837                 out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)),
838                                                 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
839                 out_res_16x8_r0_0 =
840                     vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)),
841                                 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
842 
843                 out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)),
844                                                 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
845                 out_res_16x8_r0_1 =
846                     vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)),
847                                 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
848 
849                 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
850                           out_res_16x8_r0_0);
851                 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8),
852                           out_res_16x8_r0_1);
853             }
854         }
855 
856         {
857             WORD32 strt_indx = 10;
858 
859             uint8x16_t phs_mask_8x8_0;
860             uint8x16_t x_ref_pos_luma_mask_r0_0;
861             uint8x16_t x_ref_pos_luma_mask_r1_0;
862 
863             WORD32 i4_x2 = 0;
864             uint8x16_t twos = vdupq_n_u8(2);
865             strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos;
866 
867             for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
868             {
869                 arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
870                 arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
871                 arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx;
872                 i4_x2 = i4_x << 1;
873                 arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
874                 arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
875             }
876 
877             pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
878             pi4_phase_luma = arr_phase_luma;
879 
880             phs_mask_8x8_0 = vld1q_u8(pi4_phase_luma);
881             x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low);
882             x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos);
883 
884             {
885                 uint8x16_t ip_filt_8x16_r0;
886                 uint8x16_t ip_filt_8x16_r1;
887                 int16x8_t ip_filt_16x8_r0_0;
888                 int16x8_t ip_filt_16x8_r1_0;
889                 int16x8_t inp_16x8_0;
890                 int16x8_t inp_16x8_r0_0;
891                 int16x8_t inp_16x8_r1_0;
892                 int16x4_t inp_16x4_r0_0;
893                 int16x4_t inp_16x4_r1_0;
894                 int32x4_t out_res_32x4_r0_l_0;
895                 int32x4_t out_res_32x4_r0_l_1;
896                 uint16x4_t out_res_16x4_r0_l_0;
897                 uint16x4_t out_res_16x4_r0_l_1;
898                 uint16x8_t out_res_16x8_r0_l;
899                 uint8x16_t out_8x16_r0;
900                 uint8x8x2_t u1_incr_8x8x2_t;
901                 uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1;
902                 uint8x8x2_t u1_temp_8x8x2_t;
903                 uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
904                 uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));
905 
906                 ip_filt_8x16_r0 = vld1q_u8((g_au1_interp_filter_chroma));
907                 ip_filt_8x16_r1 = vld1q_u8((g_au1_interp_filter_chroma + 16));
908 
909                 u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r0);
910                 u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r0);
911                 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
912                 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
913                 ip_filt_8x16_r0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
914 
915                 u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r1);
916                 u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r1);
917                 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
918                 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
919                 ip_filt_8x16_r1 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
920 
921                 ip_filt_16x8_r0_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r0)));
922                 ip_filt_16x8_r1_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r1)));
923 
924                 for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
925                 {
926                     inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx));
927                     pi2_interp_buff_temp += i4_refarray_wd;
928                     u1_temp_8x8x2_t.val[0] =
929                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
930                     u1_temp_8x8x2_t.val[1] =
931                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
932                     u1_temp_8x8_t0 =
933                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0));
934                     u1_temp_8x8_t1 =
935                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0));
936                     inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
937                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
938 
939                     u1_temp_8x8x2_t.val[0] =
940                         vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
941                     u1_temp_8x8x2_t.val[1] =
942                         vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
943                     u1_temp_8x8_t0 =
944                         vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0));
945                     u1_temp_8x8_t1 =
946                         vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0));
947                     inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
948                         vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
949                     inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0);
950                     inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0);
951 
952                     out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0));
953                     out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0,
954                                                     vget_low_s16(ip_filt_16x8_r1_0));
955                     out_res_32x4_r0_l_1 =
956                         vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0));
957                     out_res_32x4_r0_l_1 =
958                         vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0),
959                                   vget_high_s16(ip_filt_16x8_r1_0));
960 
961                     out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10);
962                     out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10);
963                     out_res_16x8_r0_l = vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1);
964                     out_8x16_r0 = vld1q_u8(pu1_out + (i4_y * i4_out_stride));
965                     out_8x16_r0 = vbslq_u8(chroma_mask_8x16,
966                                            vreinterpretq_u8_u16(out_res_16x8_r0_l), out_8x16_r0);
967                     vst1q_u8((pu1_out + (i4_y * i4_out_stride)), out_8x16_r0);
968                 }
969             }
970         }
971     }
972     return;
973 } /* End of Interpolation Function */
974 
975 /*****************************************************************************/
976 /*                                                                           */
977 /*  Function Name : isvcd_horz_interpol_chroma_dyadic_1_neonintr              */
978 /*                                                                           */
979 /*  Description   : This function takes the reference array buffer & performs*/
980 /*                    interpolation of a component to find the intra         */
981 /*                     resampled value                                       */
/*  Inputs        : pi2_tmp_filt_buf : ptr to the temporary buffer holding   */
/*                      the vertically interpolated data                     */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_phase_0 : x phase for even values of x                */
/*                  i4_phase_1 : x phase for odd values of x                 */
989 /*  Globals       : none                                                     */
990 /*  Processing    : it does the interpolation on horizontal direction        */
991 /*  Outputs       : resampled pixels                                         */
992 /*  Returns       : none                                                     */
993 /*                                                                           */
994 /*  Issues        : none                                                     */
995 /*                                                                           */
996 /*  Revision History:                                                        */
997 /*                                                                           */
998 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
999 /*         26 06 2009   vijayakumar          creation                        */
1000 /*                                                                           */
1001 /*****************************************************************************/
1002 
/* Horizontal 2x chroma interpolation (dyadic, phase case 1): filters the
   vertically interpolated 16-bit samples in pi2_tmp_filt_buf with a 2-tap
   (8 - phase, phase) kernel and merges the 8x8 result into alternate bytes
   of the output block. */
void isvcd_horz_interpol_chroma_dyadic_1_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                                  WORD32 i4_out_stride, WORD32 i4_phase_0,
                                                  WORD32 i4_phase_1)
{
    WORD32 i4_y;
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_dst_stride;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    int16x8_t i4_samp_horz_16x8_r0_0, i4_samp_horz_16x8_r0_1, i4_samp_horz_16x8_r0_2;
    int16x8_t i4_samp_horz_16x8_r1_0, i4_samp_horz_16x8_r1_1, i4_samp_horz_16x8_r1_2;
    int16x8_t i4_rslt_horz_r0_1, i4_rslt_horz_r0_2;
    int16x8_t i4_rslt_horz_r1_1, i4_rslt_horz_r1_2;

    int16x8_t final_horz_16x8_r0_1;
    int16x8_t final_horz_16x8_r1_1;

    uint8x16_t i4_out_horz_8x16_r0, i4_out_horz_8x16_r1;
    /* 0x00ff per 16-bit lane: with vbslq_u8 this updates only the even-indexed
       bytes of each output row (one component of what is presumably an
       interleaved chroma plane -- confirm against caller), preserving the
       odd-indexed bytes. */
    uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    /* 2-tap filter taps: even output columns use (8 - phase_0, phase_0),
       odd output columns use (8 - phase_1, phase_1); each pair sums to 8. */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6; /* rows of the intermediate buffer are 6 samples apart */
    i4_dst_stride = i4_out_stride;

    /* Horizontal interpolation: 8 output rows, processed two at a time */
    for(i4_y = 0; i4_y < 8; i4_y += 2)
    {
        /* Three overlapping loads give samples a[i], a[i+1], a[i+2] per lane */
        i4_samp_horz_16x8_r0_0 = vld1q_s16(pi2_tmp);      // a0 a1 a2 a3 a4 a5 a6 a7
        i4_samp_horz_16x8_r0_1 = vld1q_s16(pi2_tmp + 1);  // a1 a2 a3 a4 ...
        i4_samp_horz_16x8_r0_2 = vld1q_s16(pi2_tmp + 2);  // a2 a3 a4 a5 ...

        /* Same three loads for the second row of the pair */
        i4_samp_horz_16x8_r1_0 = vld1q_s16(pi2_tmp + i4_filt_stride);
        i4_samp_horz_16x8_r1_1 = vld1q_s16(pi2_tmp + i4_filt_stride + 1);
        i4_samp_horz_16x8_r1_2 = vld1q_s16(pi2_tmp + (i4_filt_stride + 2));

        i4_rslt_horz_r0_1 =
            vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_0);  // a0c0 a1c0  a2c0 a3c0

        i4_rslt_horz_r0_2 =
            vmulq_n_s16(i4_samp_horz_16x8_r0_1, i4_coeff_2);  // a1c2 a2c2  a3c2 a4c2
        i4_rslt_horz_r0_1 = vmlaq_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x8_r0_1,
                                        i4_coeff_1);  // a0c0+a1c1 a1c0+a2c1  a2c0+a3c1  a3c0+a4c1

        i4_rslt_horz_r0_2 = vmlaq_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x8_r0_2,
                                        i4_coeff_3);  // a1c2+a2c3  a2c2+a3c3 a3c2+a4c3 a4c2+a5c3

        /* Row 1: even-column results in r1_1, odd-column results in r1_2 */
        i4_rslt_horz_r1_1 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_0);
        i4_rslt_horz_r1_2 = vmulq_n_s16(i4_samp_horz_16x8_r1_1, i4_coeff_2);

        i4_rslt_horz_r1_1 = vmlaq_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x8_r1_1, i4_coeff_1);
        i4_rslt_horz_r1_2 = vmlaq_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x8_r1_2, i4_coeff_3);

        /* Interleave even-column (r*_1) and odd-column (r*_2) results;
           val[0] keeps the first 8 interleaved outputs of each row. */
        final_horz_16x8_r0_1 = vzipq_s16(i4_rslt_horz_r0_1, i4_rslt_horz_r0_2).val[0];
        final_horz_16x8_r1_1 = vzipq_s16(i4_rslt_horz_r1_1, i4_rslt_horz_r1_2).val[0];

        /* Rounding shift by 6 normalizes the combined filter gain (8 x 8 = 64)
           -- the vertical factor of 8 is assumed from the preceding pass that
           filled pi2_tmp_filt_buf; TODO confirm against that pass. */
        final_horz_16x8_r0_1 = vrshrq_n_s16(final_horz_16x8_r0_1, 6);

        final_horz_16x8_r1_1 = vrshrq_n_s16(final_horz_16x8_r1_1, 6);

        /* Read-modify-write: merge the results into alternate bytes of the two
           destination rows, leaving the other interleaved component intact. */
        i4_out_horz_8x16_r0 = vld1q_u8(pu1_out);
        i4_out_horz_8x16_r1 = vld1q_u8(pu1_out + i4_dst_stride);

        i4_out_horz_8x16_r0 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r0_1),
                                       i4_out_horz_8x16_r0);
        i4_out_horz_8x16_r1 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r1_1),
                                       i4_out_horz_8x16_r1);

        vst1q_u8(pu1_out, i4_out_horz_8x16_r0);
        vst1q_u8(pu1_out + i4_dst_stride, i4_out_horz_8x16_r1);

        /* Incrementing ptr: advance two rows in both source and destination */
        pi2_tmp += (i4_filt_stride << 1);
        pu1_out += (i4_dst_stride << 1);

    } /* End of loop over y */
}
1086 
1087 /*****************************************************************************/
1088 /*                                                                           */
1089 /*  Function Name : isvcd_horz_interpol_chroma_dyadic_2_neonintr              */
1090 /*                                                                           */
/*  Description   : This function takes the vertically interpolated data &   */
/*                  performs horizontal intra resampling for dyadic scaling  */
/*                  ratios for chroma for the following ref layer and        */
/*                    current layer chroma x phase combinations:             */
/*                        ref_lyr        cur_lyr                             */
/*                            0            1                                 */
/*                            0            2                                 */
/*  Inputs        : pi2_tmp_filt_buf : ptr to the buffer holding the         */
/*                        vertically interpolated data                       */
/*                    pu1_out_buf : output buffer pointer                    */
/*                    i4_out_stride : output buffer stride                   */
/*                    i4_phase_0 : x phase for even values of x              */
/*                    i4_phase_1 : x phase for odd values of x               */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in horizontal direction        */
/*  Outputs       : horizontally resampled samples                           */
1106 /*  Returns       : none                                                     */
1107 /*                                                                           */
1108 /*  Issues        : none                                                     */
1109 /*                                                                           */
1110 /*  Revision History:                                                        */
1111 /*                                                                           */
1112 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1113 /*         21 05 2021   Dolan          creation                              */
1114 /*                                                                           */
1115 /*****************************************************************************/
void isvcd_horz_interpol_chroma_dyadic_2_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                                  WORD32 i4_out_stride, WORD32 i4_phase_0,
                                                  WORD32 i4_phase_1)
{
    /* Horizontal 2-tap chroma interpolation for the dyadic case where the
     * x-phases of reference and current layer give the "type 2" filter
     * alignment (both even and odd output columns read the same source pair).
     *
     * pi2_tmp_filt_buf : 6x8 buffer of vertically interpolated samples (input)
     * pu1_out_buf      : interleaved chroma output (only every 2nd byte of a
     *                    16-byte vector belongs to this plane; the other plane's
     *                    bytes are preserved via a byte-select mask)
     * i4_out_stride    : output stride in bytes
     * i4_phase_0/1     : x phases for even/odd output columns
     */
    WORD32 i4_y;
    WORD32 i4_coeff_0 = 8 - i4_phase_0;
    WORD32 i4_coeff_1 = i4_phase_0;
    WORD32 i4_coeff_2 = 8 - i4_phase_1;
    WORD32 i4_coeff_3 = i4_phase_1;

    /* filtering starts from the second column of the temp buffer */
    WORD16 *pi2_tmp = pi2_tmp_filt_buf + 1;
    UWORD8 *pu1_out = pu1_out_buf;

    /* mask selecting the low byte of every 16-bit lane (one chroma plane) */
    const uint8x16_t chroma_mask = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));

    /* Two output rows per iteration */
    for(i4_y = 0; i4_y < 8; i4_y += 2)
    {
        int16x8_t i4_src_r0_a = vld1q_s16(pi2_tmp);      /* a0 a1 a2 a3 ... */
        int16x8_t i4_src_r0_b = vld1q_s16(pi2_tmp + 1);  /* a1 a2 a3 a4 ... */
        int16x8_t i4_src_r1_a = vld1q_s16(pi2_tmp + 6);  /* next row, filt stride = 6 */
        int16x8_t i4_src_r1_b = vld1q_s16(pi2_tmp + 7);

        /* even columns: c0*a[i] + c1*a[i+1]; odd columns: c2*a[i] + c3*a[i+1] */
        int16x8_t i4_even_r0 =
            vmlaq_n_s16(vmulq_n_s16(i4_src_r0_a, i4_coeff_0), i4_src_r0_b, i4_coeff_1);
        int16x8_t i4_odd_r0 =
            vmlaq_n_s16(vmulq_n_s16(i4_src_r0_a, i4_coeff_2), i4_src_r0_b, i4_coeff_3);
        int16x8_t i4_even_r1 =
            vmlaq_n_s16(vmulq_n_s16(i4_src_r1_a, i4_coeff_0), i4_src_r1_b, i4_coeff_1);
        int16x8_t i4_odd_r1 =
            vmlaq_n_s16(vmulq_n_s16(i4_src_r1_a, i4_coeff_2), i4_src_r1_b, i4_coeff_3);

        /* interleave even/odd results into output column order */
        int16x8_t i4_out_r0 = vzipq_s16(i4_even_r0, i4_odd_r0).val[0];
        int16x8_t i4_out_r1 = vzipq_s16(i4_even_r1, i4_odd_r1).val[0];

        /* rounding shift: vertical (3 bits) + horizontal (3 bits) gain */
        i4_out_r0 = vrshrq_n_s16(i4_out_r0, 6);
        i4_out_r1 = vrshrq_n_s16(i4_out_r1, 6);

        /* read-modify-write: keep the interleaved other-plane bytes intact */
        uint8x16_t i4_dst_r0 = vld1q_u8(pu1_out);
        uint8x16_t i4_dst_r1 = vld1q_u8(pu1_out + i4_out_stride);
        i4_dst_r0 = vbslq_u8(chroma_mask, vreinterpretq_u8_s16(i4_out_r0), i4_dst_r0);
        i4_dst_r1 = vbslq_u8(chroma_mask, vreinterpretq_u8_s16(i4_out_r1), i4_dst_r1);
        vst1q_u8(pu1_out, i4_dst_r0);
        vst1q_u8(pu1_out + i4_out_stride, i4_dst_r1);

        pi2_tmp += 12;                  /* 2 * filter stride (6) */
        pu1_out += (i4_out_stride << 1);
    } /* End of loop over y */
}
1197 
1198 /*****************************************************************************/
1199 /*                                                                           */
1200 /*  Function Name : isvcd_vert_interpol_chroma_dyadic_1_neonintr              */
1201 /*                                                                           */
1202 /*  Description   : This function takes the reference array buffer & performs*/
1203 /*                  vertical intra resampling for dyadic scaling ratios for  */
1204 /*                  chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1205 /*                  chroma_phase_y_plus1:                                    */
1206 /*                        ref_lyr        cur_lyr                             */
1207 /*                            2            0                                 */
1208 /*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
1209 /*                    pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold       */
1210 /*                        vertically interpolated data                       */
1211 /*                    i4_phase_0 : y phase for even values of y              */
1212 /*                    i4_phase_1 : y phase for odd values of y               */
1213 /*  Globals       : none                                                     */
1214 /*  Processing    : it does the interpolation in vertical direction          */
1215 /*  Outputs       : vertically resampled samples                             */
1216 /*  Returns       : none                                                     */
1217 /*                                                                           */
1218 /*  Issues        : none                                                     */
1219 /*                                                                           */
1220 /*  Revision History:                                                        */
1221 /*                                                                           */
1222 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1223 /*         21 05 2021   Dolan          creation                              */
1224 /*                                                                           */
1225 /*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_1_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap chroma interpolation for dyadic scaling: six reference
     * rows are filtered into eight intermediate rows of six WORD16 samples.
     *
     * pu1_inp_buf      : 6x6 reference sample buffer (stride DYADIC_REF_W_C)
     * pi2_tmp_filt_buf : 6x8 output buffer (row stride 6)
     * i4_phase_0/1     : y phases for even/odd output rows
     */
    WORD32 i4_coeff_0 = 8 - i4_phase_0;
    WORD32 i4_coeff_1 = i4_phase_0;
    WORD32 i4_coeff_2 = 8 - i4_phase_1;
    WORD32 i4_coeff_3 = i4_phase_1;
    WORD32 i4_row, i4_y;
    int16x8_t i4_samp_16x8[6];
    int16x8_t i4_rslt_16x8;
    UWORD8 *pu1_inp = pu1_inp_buf;
    WORD16 *pi2_tmp = pi2_tmp_filt_buf;

    /* load the six reference rows and widen to 16 bit */
    for(i4_row = 0; i4_row < 6; i4_row++)
    {
        i4_samp_16x8[i4_row] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_inp)));
        pu1_inp += DYADIC_REF_W_C;
    }

    /* even y: rows (y/2, y/2+1) with (c0, c1);
       odd  y: rows ((y+1)/2, (y+1)/2+1) with (c2, c3) */
    for(i4_y = 0; i4_y < 8; i4_y++)
    {
        WORD32 i4_top;
        if(i4_y & 1)
        {
            i4_top = (i4_y + 1) >> 1;
            i4_rslt_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_coeff_2);
            i4_rslt_16x8 = vmlaq_n_s16(i4_rslt_16x8, i4_samp_16x8[i4_top + 1], i4_coeff_3);
        }
        else
        {
            i4_top = i4_y >> 1;
            i4_rslt_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_coeff_0);
            i4_rslt_16x8 = vmlaq_n_s16(i4_rslt_16x8, i4_samp_16x8[i4_top + 1], i4_coeff_1);
        }

        if(i4_y < 7)
        {
            /* full 8-lane store; lanes 6,7 spill into the next row and are
               overwritten by the following iteration's store */
            vst1q_s16(pi2_tmp, i4_rslt_16x8);
        }
        else
        {
            /* last row: store only 6 lanes to stay inside the 6x8 buffer */
            vst1_s16(pi2_tmp, vget_low_s16(i4_rslt_16x8));
            vst1q_lane_s16(pi2_tmp + 4, i4_rslt_16x8, 4);
            vst1q_lane_s16(pi2_tmp + 5, i4_rslt_16x8, 5);
        }
        pi2_tmp += 6;
    }
}
1314 
1315 /*****************************************************************************/
1316 /*                                                                           */
1317 /*  Function Name : isvcd_vert_interpol_chroma_dyadic_2_neonintr              */
1318 /*                                                                           */
1319 /*  Description   : This function takes the reference array buffer & performs*/
1320 /*                  vertical intra resampling for dyadic scaling ratios for  */
1321 /*                  chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1322 /*                  chroma_phase_y_plus1:                                    */
1323 /*                        ref_lyr        cur_lyr                             */
1324 /*                            2            0                                 */
1325 /*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
1326 /*                    pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the   */
1327 /*                        vertically interpolated data                       */
1328 /*                    i4_phase_0 : y phase for even values of y              */
1329 /*                    i4_phase_1 : y phase for odd values of y               */
1330 /*  Globals       : none                                                     */
1331 /*  Processing    : it does the interpolation in vertical direction          */
1332 /*  Outputs       : vertically resampled samples                             */
1333 /*  Returns       : none                                                     */
1334 /*                                                                           */
1335 /*  Issues        : none                                                     */
1336 /*                                                                           */
1337 /*  Revision History:                                                        */
1338 /*                                                                           */
1339 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1340 /*         21 05 2021   Dolan          creation                              */
1341 /*                                                                           */
1342 /*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_2_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap chroma interpolation for dyadic scaling: five reference
     * rows (starting from the second row of the input) are filtered into
     * eight intermediate rows of six WORD16 samples.
     *
     * pu1_inp_buf      : 6x6 reference sample buffer (stride DYADIC_REF_W_C)
     * pi2_tmp_filt_buf : 6x8 output buffer (row stride 6)
     * i4_phase_0/1     : y phases for even/odd output rows
     */
    WORD32 i4_coeff_0 = 8 - i4_phase_0;
    WORD32 i4_coeff_1 = i4_phase_0;
    WORD32 i4_coeff_2 = 8 - i4_phase_1;
    WORD32 i4_coeff_3 = i4_phase_1;
    WORD32 i4_row, i4_y;
    int16x8_t i4_samp_16x8[5];
    int16x8_t i4_rslt_16x8;
    WORD16 *pi2_tmp = pi2_tmp_filt_buf;
    /* reference reading starts from the second row for this phase combination */
    UWORD8 *pu1_inp = pu1_inp_buf + DYADIC_REF_W_C;

    /* load the five reference rows and widen to 16 bit */
    for(i4_row = 0; i4_row < 5; i4_row++)
    {
        i4_samp_16x8[i4_row] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_inp)));
        pu1_inp += DYADIC_REF_W_C;
    }

    /* output row y blends rows (y/2, y/2+1); even y uses (c0, c1),
       odd y uses (c2, c3) */
    for(i4_y = 0; i4_y < 8; i4_y++)
    {
        WORD32 i4_top = i4_y >> 1;
        if(i4_y & 1)
        {
            i4_rslt_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_coeff_2);
            i4_rslt_16x8 = vmlaq_n_s16(i4_rslt_16x8, i4_samp_16x8[i4_top + 1], i4_coeff_3);
        }
        else
        {
            i4_rslt_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_coeff_0);
            i4_rslt_16x8 = vmlaq_n_s16(i4_rslt_16x8, i4_samp_16x8[i4_top + 1], i4_coeff_1);
        }

        if(i4_y < 7)
        {
            /* full 8-lane store; lanes 6,7 spill into the next row and are
               overwritten by the following iteration's store */
            vst1q_s16(pi2_tmp, i4_rslt_16x8);
        }
        else
        {
            /* last row: store only 6 lanes to stay inside the 6x8 buffer */
            vst1_s16(pi2_tmp, vget_low_s16(i4_rslt_16x8));
            vst1q_lane_s16(pi2_tmp + 4, i4_rslt_16x8, 4);
            vst1q_lane_s16(pi2_tmp + 5, i4_rslt_16x8, 5);
        }
        pi2_tmp += 6;
    }
}
1431 
1432 /*****************************************************************************/
1433 /*                                                                           */
1434 /*  Function Name : isvcd_vert_interpol_chroma_dyadic_3_neonintr              */
1435 /*                                                                           */
1436 /*  Description   : This function takes the reference array buffer & performs*/
1437 /*                  vertical intra resampling for dyadic scaling ratios for  */
1438 /*                  chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1439 /*                  chroma_phase_y_plus1:                                    */
1440 /*                        ref_lyr        cur_lyr                             */
1441 /*                            2            0                                 */
1442 /*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
1443 /*                    pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the   */
1444 /*                        vertically interpolated data                       */
1445 /*                    i4_phase_0 : y phase for even values of y              */
1446 /*                    i4_phase_1 : y phase for odd values of y               */
1447 /*  Globals       : none                                                     */
1448 /*  Processing    : it does the interpolation in vertical direction          */
1449 /*  Outputs       : vertically resampled samples                             */
1450 /*  Returns       : none                                                     */
1451 /*                                                                           */
1452 /*  Issues        : none                                                     */
1453 /*                                                                           */
1454 /*  Revision History:                                                        */
1455 /*                                                                           */
1456 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1457 /*         21 05 2021   Dolan          creation                              */
1458 /*                                                                           */
1459 /*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_3_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    /* Vertical 2-tap chroma interpolation for dyadic scaling: five reference
     * rows (starting from the first row of the input) are filtered into
     * eight intermediate rows of six WORD16 samples.
     *
     * pu1_inp_buf      : 6x6 reference sample buffer (stride DYADIC_REF_W_C)
     * pi2_tmp_filt_buf : 6x8 output buffer (row stride 6)
     * i4_phase_0/1     : y phases for even/odd output rows
     */
    WORD32 i4_coeff_0 = 8 - i4_phase_0;
    WORD32 i4_coeff_1 = i4_phase_0;
    WORD32 i4_coeff_2 = 8 - i4_phase_1;
    WORD32 i4_coeff_3 = i4_phase_1;
    WORD32 i4_row, i4_y;
    int16x8_t i4_samp_16x8[5];
    int16x8_t i4_rslt_16x8;
    WORD16 *pi2_tmp = pi2_tmp_filt_buf;
    UWORD8 *pu1_inp = pu1_inp_buf;

    /* load the five reference rows and widen to 16 bit */
    for(i4_row = 0; i4_row < 5; i4_row++)
    {
        i4_samp_16x8[i4_row] = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_inp)));
        pu1_inp += DYADIC_REF_W_C;
    }

    /* output row y blends rows (y/2, y/2+1); even y uses (c0, c1),
       odd y uses (c2, c3) */
    for(i4_y = 0; i4_y < 8; i4_y++)
    {
        WORD32 i4_top = i4_y >> 1;
        if(i4_y & 1)
        {
            i4_rslt_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_coeff_2);
            i4_rslt_16x8 = vmlaq_n_s16(i4_rslt_16x8, i4_samp_16x8[i4_top + 1], i4_coeff_3);
        }
        else
        {
            i4_rslt_16x8 = vmulq_n_s16(i4_samp_16x8[i4_top], i4_coeff_0);
            i4_rslt_16x8 = vmlaq_n_s16(i4_rslt_16x8, i4_samp_16x8[i4_top + 1], i4_coeff_1);
        }

        if(i4_y < 7)
        {
            /* full 8-lane store; lanes 6,7 spill into the next row and are
               overwritten by the following iteration's store */
            vst1q_s16(pi2_tmp, i4_rslt_16x8);
        }
        else
        {
            /* last row: store only 6 lanes to stay inside the 6x8 buffer */
            vst1_s16(pi2_tmp, vget_low_s16(i4_rslt_16x8));
            vst1q_lane_s16(pi2_tmp + 4, i4_rslt_16x8, 4);
            vst1q_lane_s16(pi2_tmp + 5, i4_rslt_16x8, 5);
        }
        pi2_tmp += 6;
    }
}
1549