/******************************************************************************
*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  isvcd_intra_resamp_neonintr.c
*
* @brief
*  Contains routines that resample for SVC resampling
*
* @author
*  Kishore
*
* @par List of Functions:
*  - isvcd_interpolate_base_luma_dyadic_neonintr()
*  - isvcd_interpolate_intra_base_neonintr()
*  - isvcd_horz_interpol_chroma_dyadic_1_neonintr()
*  - isvcd_horz_interpol_chroma_dyadic_2_neonintr()
*  - isvcd_vert_interpol_chroma_dyadic_1_neonintr()
*  - isvcd_vert_interpol_chroma_dyadic_2_neonintr()
*  - isvcd_vert_interpol_chroma_dyadic_3_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
45 #include <assert.h>
46 #include <string.h>
47 #include <arm_neon.h>
48
49 #include "ih264_typedefs.h"
50 #include "ih264_macros.h"
51 #include "ih264_platform_macros.h"
52 #include "isvcd_structs.h"
53 #include "ih264_debug.h"
54
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_base_luma_dyadic_neonintr              */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  intra resampling for dyadic scaling ratios               */
/*  Inputs        : pu1_inp_buf : ptr to the 12x12 reference sample buffer   */
/*                  pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the   */
/*                  vertically interpolated data                             */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction followed */
/*                  by horizontal direction                                  */
/*  Outputs       : resampled pixels                                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         05 21 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_interpolate_base_luma_dyadic_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                 UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    /* Dyadic (2x) intra upsampling of a 12x12 luma reference block into a
     * 16x16 output: a 4-tap vertical filter writes 16-bit intermediates into
     * the 12x16 buffer pi2_tmp_filt_buf, then a 4-tap horizontal filter with
     * rounding ((sum + 512) >> 10) and unsigned saturation produces the
     * 8-bit output pixels. */
    WORD32 i4_y;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    /* Widened (u8 -> s16) source rows are kept live across loop iterations
     * so each row is loaded and widened exactly once (the reference code
     * re-ran vmovl_u8 on every use). Arithmetic is unchanged. */
    int16x8_t i4_samp_16x8_0, i4_samp_16x8_1, i4_samp_16x8_2, i4_samp_16x8_3;
    int16x8_t i4_rslt_16x8_a, i4_rslt_16x8_b;
    int16x4_t i4_samp_16x4_0, i4_samp_16x4_1, i4_samp_16x4_2, i4_samp_16x4_3;
    int16x4_t i4_rslt_16x4_a, i4_rslt_16x4_b;

    /* Horizontal pass temporaries */
    int16x4_t i4_samp_horz_16x4_0, i4_samp_horz_16x4_1, i4_samp_horz_16x4_2, i4_samp_horz_16x4_3,
        i4_samp_horz_16x4_4;
    int16x4_t i4_samp_horz_16x4_5, i4_samp_horz_16x4_6, i4_samp_horz_16x4_7, i4_samp_horz_16x4_8;
    int32x4_t i4_rslt_horz_r0_1, i4_rslt_horz_r1_1, i4_rslt_horz_r0_2, i4_rslt_horz_r1_2;
    int32x4x2_t i4_rslt_horz_32x4x2_t;
    uint16x4_t i4_out_16x4_0, i4_out_16x4_1, i4_out_16x4_2, i4_out_16x4_3;
    const int32x4_t const_512_32x4 = vdupq_n_s32(512); /* rounding offset for >> 10 */

    /* Filter taps for the two dyadic phases: phase-4 samples use the order
     * {c0, c1, c2, c3} = {-3, 28, 8, -1}; phase-12 samples use the mirrored
     * order {c3, c2, c1, c0}. The same taps serve both passes. */
    const WORD16 i4_coeff_0 = -3;
    const WORD16 i4_coeff_1 = 28;
    const WORD16 i4_coeff_2 = 8;
    const WORD16 i4_coeff_3 = -1;

    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;

    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* ------------------------------------------------------------------ */
    /* Vertical interpolation: columns 0..7                                */
    /* ------------------------------------------------------------------ */
    i4_samp_16x8_0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp)));
    pu1_inp += i4_src_stride;
    i4_samp_16x8_1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp)));
    pu1_inp += i4_src_stride;
    i4_samp_16x8_2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp)));
    pu1_inp += i4_src_stride;
    i4_samp_16x8_3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp)));
    pu1_inp += i4_src_stride;

    /* y = 0, phase 12: mirrored tap order */
    i4_rslt_16x8_a = vmulq_n_s16(i4_samp_16x8_0, i4_coeff_3);
    i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_1, i4_coeff_2);
    i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_2, i4_coeff_1);
    i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_3, i4_coeff_0);

    vst1q_s16(pi2_tmp, i4_rslt_16x8_a);
    pi2_tmp += i4_filt_stride;

    /* Each new source row yields two output rows: phase 4 then phase 12 */
    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_16x8_0 = i4_samp_16x8_1;
        i4_samp_16x8_1 = i4_samp_16x8_2;
        i4_samp_16x8_2 = i4_samp_16x8_3;
        i4_samp_16x8_3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp)));

        /* phase 4 row */
        i4_rslt_16x8_a = vmulq_n_s16(i4_samp_16x8_0, i4_coeff_0);
        i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_1, i4_coeff_1);
        i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_2, i4_coeff_2);
        i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_3, i4_coeff_3);

        /* phase 12 row */
        i4_rslt_16x8_b = vmulq_n_s16(i4_samp_16x8_0, i4_coeff_3);
        i4_rslt_16x8_b = vmlaq_n_s16(i4_rslt_16x8_b, i4_samp_16x8_1, i4_coeff_2);
        i4_rslt_16x8_b = vmlaq_n_s16(i4_rslt_16x8_b, i4_samp_16x8_2, i4_coeff_1);
        i4_rslt_16x8_b = vmlaq_n_s16(i4_rslt_16x8_b, i4_samp_16x8_3, i4_coeff_0);

        /* Storing the results */
        vst1q_s16(pi2_tmp, i4_rslt_16x8_a);
        pi2_tmp += i4_filt_stride;
        vst1q_s16(pi2_tmp, i4_rslt_16x8_b);
        pi2_tmp += i4_filt_stride;
        pu1_inp += i4_src_stride;
    } /* End of loop over y */

    /* y = 15, phase 4 */
    i4_samp_16x8_0 = i4_samp_16x8_1;
    i4_samp_16x8_1 = i4_samp_16x8_2;
    i4_samp_16x8_2 = i4_samp_16x8_3;
    i4_samp_16x8_3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp)));

    i4_rslt_16x8_a = vmulq_n_s16(i4_samp_16x8_0, i4_coeff_0);
    i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_1, i4_coeff_1);
    i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_2, i4_coeff_2);
    i4_rslt_16x8_a = vmlaq_n_s16(i4_rslt_16x8_a, i4_samp_16x8_3, i4_coeff_3);

    vst1q_s16(pi2_tmp, i4_rslt_16x8_a);

    /* ------------------------------------------------------------------ */
    /* Vertical interpolation: columns 8..11 (4-lane operations)           */
    /* ------------------------------------------------------------------ */
    pu1_inp = pu1_inp_buf + 8;
    pi2_tmp = pi2_tmp_filt_buf + 8;

    i4_samp_16x4_0 =
        vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp))));
    pu1_inp += i4_src_stride;
    i4_samp_16x4_1 =
        vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp))));
    pu1_inp += i4_src_stride;
    i4_samp_16x4_2 =
        vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp))));
    pu1_inp += i4_src_stride;
    i4_samp_16x4_3 =
        vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp))));
    pu1_inp += i4_src_stride;

    /* y = 0, phase 12 */
    i4_rslt_16x4_a = vmul_n_s16(i4_samp_16x4_0, i4_coeff_3);
    i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_1, i4_coeff_2);
    i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_2, i4_coeff_1);
    i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_3, i4_coeff_0);

    vst1_s16(pi2_tmp, i4_rslt_16x4_a);
    pi2_tmp += i4_filt_stride;

    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_16x4_0 = i4_samp_16x4_1;
        i4_samp_16x4_1 = i4_samp_16x4_2;
        i4_samp_16x4_2 = i4_samp_16x4_3;
        i4_samp_16x4_3 =
            vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp))));

        /* phase 4 row */
        i4_rslt_16x4_a = vmul_n_s16(i4_samp_16x4_0, i4_coeff_0);
        i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_1, i4_coeff_1);
        i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_2, i4_coeff_2);
        i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_3, i4_coeff_3);

        /* phase 12 row */
        i4_rslt_16x4_b = vmul_n_s16(i4_samp_16x4_0, i4_coeff_3);
        i4_rslt_16x4_b = vmla_n_s16(i4_rslt_16x4_b, i4_samp_16x4_1, i4_coeff_2);
        i4_rslt_16x4_b = vmla_n_s16(i4_rslt_16x4_b, i4_samp_16x4_2, i4_coeff_1);
        i4_rslt_16x4_b = vmla_n_s16(i4_rslt_16x4_b, i4_samp_16x4_3, i4_coeff_0);

        vst1_s16(pi2_tmp, i4_rslt_16x4_a);
        pi2_tmp += i4_filt_stride;
        vst1_s16(pi2_tmp, i4_rslt_16x4_b);
        pi2_tmp += i4_filt_stride;
        pu1_inp += i4_src_stride;
    }

    /* y = 15, phase 4 */
    i4_samp_16x4_0 = i4_samp_16x4_1;
    i4_samp_16x4_1 = i4_samp_16x4_2;
    i4_samp_16x4_2 = i4_samp_16x4_3;
    i4_samp_16x4_3 =
        vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8((const uint8_t *) pu1_inp))));

    i4_rslt_16x4_a = vmul_n_s16(i4_samp_16x4_0, i4_coeff_0);
    i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_1, i4_coeff_1);
    i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_2, i4_coeff_2);
    i4_rslt_16x4_a = vmla_n_s16(i4_rslt_16x4_a, i4_samp_16x4_3, i4_coeff_3);

    vst1_s16(pi2_tmp, i4_rslt_16x4_a);

    /* ------------------------------------------------------------------ */
    /* Horizontal interpolation over the 12x16 intermediate buffer         */
    /* ------------------------------------------------------------------ */
    pi2_tmp = pi2_tmp_filt_buf;

    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        /* Nine overlapping 4-lane windows covering intermediate cols 0..11 */
        i4_samp_horz_16x4_0 = vld1_s16(pi2_tmp);
        i4_samp_horz_16x4_1 = vld1_s16(pi2_tmp + 1);
        i4_samp_horz_16x4_2 = vld1_s16(pi2_tmp + 2);
        i4_samp_horz_16x4_3 = vld1_s16(pi2_tmp + 3);
        i4_samp_horz_16x4_4 = vld1_s16(pi2_tmp + 4);
        i4_samp_horz_16x4_5 = vld1_s16(pi2_tmp + 5);
        i4_samp_horz_16x4_6 = vld1_s16(pi2_tmp + 6);
        i4_samp_horz_16x4_7 = vld1_s16(pi2_tmp + 7);
        i4_samp_horz_16x4_8 = vld1_s16(pi2_tmp + 8);

        /* Even outputs 0,2,4,6 (phase 12: mirrored taps on cols 0..3) */
        i4_rslt_horz_r0_1 = vmull_n_s16(i4_samp_horz_16x4_0, i4_coeff_3);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_1, i4_coeff_2);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_2, i4_coeff_1);
        i4_rslt_horz_r0_1 = vmlal_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x4_3, i4_coeff_0);

        /* Odd outputs 1,3,5,7 (phase 4: direct taps on cols 1..4) */
        i4_rslt_horz_r1_1 = vmull_n_s16(i4_samp_horz_16x4_1, i4_coeff_0);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_2, i4_coeff_1);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_3, i4_coeff_2);
        i4_rslt_horz_r1_1 = vmlal_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x4_4, i4_coeff_3);

        /* Even outputs 8,10,12,14 (cols 4..7) */
        i4_rslt_horz_r0_2 = vmull_n_s16(i4_samp_horz_16x4_4, i4_coeff_3);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_5, i4_coeff_2);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_6, i4_coeff_1);
        i4_rslt_horz_r0_2 = vmlal_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x4_7, i4_coeff_0);

        /* Odd outputs 9,11,13,15 (cols 5..8) */
        i4_rslt_horz_r1_2 = vmull_n_s16(i4_samp_horz_16x4_5, i4_coeff_0);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_6, i4_coeff_1);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_7, i4_coeff_2);
        i4_rslt_horz_r1_2 = vmlal_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x4_8, i4_coeff_3);

        /* Interleave even/odd results back into raster order, then round */
        i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_1, i4_rslt_horz_r1_1);
        i4_rslt_horz_r0_1 = vaddq_s32(i4_rslt_horz_32x4x2_t.val[0], const_512_32x4); /* 0..3 */
        i4_rslt_horz_r1_1 = vaddq_s32(i4_rslt_horz_32x4x2_t.val[1], const_512_32x4); /* 4..7 */

        i4_rslt_horz_32x4x2_t = vzipq_s32(i4_rslt_horz_r0_2, i4_rslt_horz_r1_2);
        i4_rslt_horz_r0_2 = vaddq_s32(i4_rslt_horz_32x4x2_t.val[0], const_512_32x4); /* 8..11 */
        i4_rslt_horz_r1_2 = vaddq_s32(i4_rslt_horz_32x4x2_t.val[1], const_512_32x4); /* 12..15 */

        /* (sum + 512) >> 10 with unsigned saturation to 16 bits */
        i4_out_16x4_0 = vqshrun_n_s32(i4_rslt_horz_r0_1, 10);
        i4_out_16x4_1 = vqshrun_n_s32(i4_rslt_horz_r1_1, 10);
        i4_out_16x4_2 = vqshrun_n_s32(i4_rslt_horz_r0_2, 10);
        i4_out_16x4_3 = vqshrun_n_s32(i4_rslt_horz_r1_2, 10);

        /* Saturate to 8 bits and store the 16 output pixels of this row */
        vst1_u8(pu1_out, vqmovn_u16(vcombine_u16(i4_out_16x4_0, i4_out_16x4_1)));
        vst1_u8(pu1_out + 8, vqmovn_u16(vcombine_u16(i4_out_16x4_2, i4_out_16x4_3)));

        pu1_out += i4_out_stride;
        pi2_tmp += i4_filt_stride;
    }
}
349
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_intra_base_neonintr                    */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  interpolation of a component to find the intra           */
/*                  resampled value                                          */
/*  Inputs        : pv_intra_samp_ctxt : intra sampling context              */
/*                  pu1_out : output buffer pointer                          */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_refarray_wd : reference array width                   */
/*                  i4_x_offset : offset in reference layer in horz direction*/
/*                  ps_coord : current mb co-ordinate                        */
/*                  i4_chroma_flag : chroma processing flag                  */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction followed */
/*                  by horizontal direction                                  */
/*  Outputs       : resampled pixels                                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         26 06 2009   vijayakumar     creation                             */
/*                                                                           */
/*****************************************************************************/
isvcd_interpolate_intra_base_neonintr(void * pv_intra_samp_ctxt,UWORD8 * pu1_out,WORD32 i4_out_stride,WORD32 i4_refarray_wd,WORD32 i4_mb_x,WORD32 i4_mb_y,WORD32 i4_chroma_flag,WORD32 i4_refarray_flag)378 void isvcd_interpolate_intra_base_neonintr(void *pv_intra_samp_ctxt, UWORD8 *pu1_out,
379 WORD32 i4_out_stride, WORD32 i4_refarray_wd,
380 WORD32 i4_mb_x, WORD32 i4_mb_y, WORD32 i4_chroma_flag,
381 WORD32 i4_refarray_flag)
382 {
383 /* --------------------------------------------------------------------- */
384 /* Index Parameters */
385 /* --------------------------------------------------------------------- */
386 intra_sampling_ctxt_t *ps_ctxt;
387 intra_samp_map_ctxt_t *ps_map_ctxt;
388 intra_samp_lyr_ctxt *ps_lyr_ctxt;
389 WORD32 i4_x, i4_y;
390 WORD32 i4_frm_mb_x, i4_frm_mb_y;
391 UWORD8 *pu1_refarray = NULL;
392 ref_pixel_map_t *ps_x_pos_phase;
393 ref_pixel_map_t *ps_y_pos_phase;
394 WORD32 i4_temp_array_ht;
395 WORD32 *pi4_interp_buff;
396
397 UWORD8 arr_y_ref_pos_luma[16] = {0};
398 UWORD8 arr_x_ref_pos_luma[16] = {0};
399 UWORD8 arr_x_ref_pos_luma_low[16] = {0};
400 UWORD8 arr_x_ref_pos_luma_high[16] = {0};
401 UWORD8 arr_phase_luma[16] = {0};
402 UWORD8 *pi4_y_ref_pos_luma;
403 UWORD8 *pi4_x_ref_pos_luma_low;
404 UWORD8 *pi4_x_ref_pos_luma_high;
405 UWORD8 *pi4_phase_luma;
406 WORD16 *pi2_interp_buff_temp;
407 WORD32 i4_mb_wd;
408 WORD32 i4_mb_ht;
409 WORD32 i4_x_min;
410 ref_min_max_map_t *ps_x_min_max;
411 UWORD8 *pu1_refarray_temp;
412
413 ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt;
414 ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
415
416 if(0 == i4_refarray_flag)
417 {
418 pu1_refarray = ps_ctxt->pu1_refarray_buffer;
419 }
420 else if(1 == i4_refarray_flag)
421 {
422 pu1_refarray = ps_ctxt->pu1_refarray_cb;
423 }
424
425 /* --------------------------------------------------------------------- */
426 /* LUMA or CHROMA */
427 /* --------------------------------------------------------------------- */
428
429 if(1 == i4_chroma_flag)
430 ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt);
431 else
432 ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt);
433
434 i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
435 i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;
436
437 ps_x_min_max = ps_map_ctxt->ps_x_min_max;
438
439 i4_frm_mb_y = i4_mb_y * i4_mb_ht;
440 i4_frm_mb_x = i4_mb_x * i4_mb_wd;
441 /* get the min and max positions */
442 i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos;
443
444 /* --------------------------------------------------------------------- */
445 /* Projected frame level pointers */
446 /* --------------------------------------------------------------------- */
447 ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
448 ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;
449
450 /* --------------------------------------------------------------------- */
451 /* Pointers and Dimenstion of the temporary buffer */
452 /* --------------------------------------------------------------------- */
453 i4_temp_array_ht = i4_mb_ht;
454 pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer;
455 pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
456
457 /* --------------------------------------------------------------------- */
458 /* Loop for interpolation in vertical direction */
459 /* --------------------------------------------------------------------- */
460 if(i4_chroma_flag == 0)
461 {
462 {
463 uint8x8_t inp_8x8_r0, inp_8x8_r0_1;
464 uint8x8_t inp_8x8_r1, inp_8x8_r1_1;
465 uint8x8_t inp_8x8_r2, inp_8x8_r2_1;
466 uint8x8_t inp_8x8_r3, inp_8x8_r3_1;
467 int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1;
468
469 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
470 {
471 arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
472 arr_y_ref_pos_luma[i4_y] = (UWORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
473 }
474 pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
475 pi4_phase_luma = arr_phase_luma;
476
477 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
478 {
479 pu1_refarray_temp =
480 pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
481 inp_8x8_r0 = vld1_u8((pu1_refarray_temp - i4_refarray_wd));
482 inp_8x8_r1 = vld1_u8((pu1_refarray_temp));
483 inp_8x8_r2 = vld1_u8((pu1_refarray_temp + i4_refarray_wd));
484 inp_8x8_r3 = vld1_u8((pu1_refarray_temp + 2 * i4_refarray_wd));
485
486 inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8 - i4_refarray_wd));
487 inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8));
488 inp_8x8_r2_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd));
489 inp_8x8_r3_1 = vld1_u8((pu1_refarray_temp + 8 + 2 * i4_refarray_wd));
490
491 out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)),
492 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
493 out_res_16x8_r0_0 =
494 vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)),
495 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
496 out_res_16x8_r0_0 =
497 vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2)),
498 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
499 out_res_16x8_r0_0 =
500 vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3)),
501 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);
502
503 out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)),
504 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
505 out_res_16x8_r0_1 =
506 vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)),
507 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
508 out_res_16x8_r0_1 =
509 vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r2_1)),
510 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
511 out_res_16x8_r0_1 =
512 vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r3_1)),
513 g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);
514
515 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
516 out_res_16x8_r0_0);
517 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8),
518 out_res_16x8_r0_1);
519 }
520 }
521 /*Horizontal Interpolation*/
522 {
523 WORD32 strt_indx = 10;
524
525 uint8x16_t phs_mask_8x8_0;
526 uint8x16_t x_ref_pos_luma_mask_r0_0;
527 uint8x16_t x_ref_pos_luma_mask_r0_1;
528 uint8x16_t x_ref_pos_luma_mask_r1_0;
529 uint8x16_t x_ref_pos_luma_mask_r1_1;
530 uint8x16_t x_ref_pos_luma_mask_r2_0;
531 uint8x16_t x_ref_pos_luma_mask_r2_1;
532 uint8x16_t x_ref_pos_luma_mask_r3_0;
533 uint8x16_t x_ref_pos_luma_mask_r3_1;
534
535 WORD32 strt_indx_h = 0, i4_x2 = 0;
536 WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1);
537 uint8x16_t twos = vdupq_n_u8(2);
538 strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1;
539 strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1);
540 for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
541 {
542 arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
543 arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
544 arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1;
545 }
546
547 for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++)
548 {
549 i4_x2 = i4_x << 1;
550 arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
551 arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
552 }
553 for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++)
554 {
555 i4_x2 = (i4_x - i4_mb_wd_hlf) << 1;
556 arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1);
557 arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1;
558 }
559 pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
560 pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high;
561 pi4_phase_luma = arr_phase_luma;
562
563 phs_mask_8x8_0 = vld1q_u8((const uint8_t *) pi4_phase_luma);
564
565 x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low);
566 x_ref_pos_luma_mask_r0_1 = vld1q_u8(pi4_x_ref_pos_luma_high);
567 x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos);
568 x_ref_pos_luma_mask_r1_1 = vaddq_u8(x_ref_pos_luma_mask_r0_1, twos);
569 x_ref_pos_luma_mask_r2_0 = vaddq_u8(x_ref_pos_luma_mask_r1_0, twos);
570 x_ref_pos_luma_mask_r2_1 = vaddq_u8(x_ref_pos_luma_mask_r1_1, twos);
571 x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0;
572 x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1;
573
574 {
575 int8x16_t ip_filt_8x16_r0;
576 int8x16_t ip_filt_8x16_r1;
577 int8x16_t ip_filt_8x16_r2;
578 int8x16_t ip_filt_8x16_r3;
579
580 int16x8_t ip_filt_16x8_r0_0, ip_filt_16x8_r0_1;
581 int16x8_t ip_filt_16x8_r1_0, ip_filt_16x8_r1_1;
582 int16x8_t ip_filt_16x8_r2_0, ip_filt_16x8_r2_1;
583 int16x8_t ip_filt_16x8_r3_0, ip_filt_16x8_r3_1;
584
585 int16x8_t inp_16x8_0;
586 int16x8_t inp_16x8_1;
587 int16x8_t inp_16x8_2;
588 int16x8_t inp_16x8_3;
589
590 int16x8_t inp_16x8_r0_0, inp_16x8_r2_0;
591 int16x8_t inp_16x8_r0_1, inp_16x8_r2_1;
592 int16x8_t inp_16x8_r1_0, inp_16x8_r3_0;
593 int16x8_t inp_16x8_r1_1, inp_16x8_r3_1;
594
595 int16x4_t inp_16x4_r0_0, inp_16x4_r2_0;
596 int16x4_t inp_16x4_r0_1, inp_16x4_r2_1;
597 int16x4_t inp_16x4_r1_0, inp_16x4_r3_0;
598 int16x4_t inp_16x4_r1_1, inp_16x4_r3_1;
599
600 int32x4_t out_res_32x4_r0_l_0;
601 int32x4_t out_res_32x4_r0_l_1;
602 int32x4_t out_res_32x4_r0_h_0;
603 int32x4_t out_res_32x4_r0_h_1;
604
605 uint16x4_t out_res_16x4_r0_l_0;
606 uint16x4_t out_res_16x4_r0_l_1;
607 uint16x4_t out_res_16x4_r0_h_0;
608 uint16x4_t out_res_16x4_r0_h_1;
609
610 uint8x8_t out_res_8x8_r0_l, out_res_8x8_r0_h;
611 uint8x8x2_t u1_temp_8x8x2_t;
612 uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
613
614 ip_filt_8x16_r0 = vld1q_s8((g_ai1_interp_filter_luma));
615 ip_filt_8x16_r1 = vld1q_s8((g_ai1_interp_filter_luma + 16));
616 ip_filt_8x16_r2 = vld1q_s8((g_ai1_interp_filter_luma + 32));
617 ip_filt_8x16_r3 = vld1q_s8((g_ai1_interp_filter_luma + 48));
618
619 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r0));
620 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r0));
621 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
622 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
623 ip_filt_8x16_r0 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
624 vreinterpret_s8_u8(u1_temp_8x8_t1));
625
626 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r1));
627 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r1));
628 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
629 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
630 ip_filt_8x16_r1 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
631 vreinterpret_s8_u8(u1_temp_8x8_t1));
632 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r2));
633 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r2));
634 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
635 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
636 ip_filt_8x16_r2 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
637 vreinterpret_s8_u8(u1_temp_8x8_t1));
638 u1_temp_8x8x2_t.val[0] = vreinterpret_u8_s8(vget_low_s8(ip_filt_8x16_r3));
639 u1_temp_8x8x2_t.val[1] = vreinterpret_u8_s8(vget_high_s8(ip_filt_8x16_r3));
640 u1_temp_8x8_t0 = vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
641 u1_temp_8x8_t1 = vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
642 ip_filt_8x16_r3 = vcombine_s8(vreinterpret_s8_u8(u1_temp_8x8_t0),
643 vreinterpret_s8_u8(u1_temp_8x8_t1));
644 ip_filt_16x8_r0_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r0));
645 ip_filt_16x8_r1_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r1));
646 ip_filt_16x8_r2_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r2));
647 ip_filt_16x8_r3_0 = vmovl_s8(vget_low_s8(ip_filt_8x16_r3));
648 ip_filt_16x8_r0_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r0));
649 ip_filt_16x8_r1_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r1));
650 ip_filt_16x8_r2_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r2));
651 ip_filt_16x8_r3_1 = vmovl_s8(vget_high_s8(ip_filt_8x16_r3));
652
653 for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
654 {
655 inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx));
656 inp_16x8_1 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h));
657 inp_16x8_2 = vld1q_s16((pi2_interp_buff_temp + strt_indx + 3));
658 inp_16x8_3 = vld1q_s16((pi2_interp_buff_temp + strt_indx + strt_indx_h + 3));
659 pi2_interp_buff_temp += i4_refarray_wd;
660 u1_temp_8x8x2_t.val[0] =
661 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
662 u1_temp_8x8x2_t.val[1] =
663 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
664 u1_temp_8x8_t0 =
665 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0));
666 u1_temp_8x8_t1 =
667 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0));
668 inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
669 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
670
671 u1_temp_8x8x2_t.val[0] =
672 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
673 u1_temp_8x8x2_t.val[1] =
674 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
675 u1_temp_8x8_t0 =
676 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_1));
677 u1_temp_8x8_t1 =
678 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_1));
679 inp_16x8_r0_1 = vreinterpretq_s16_s8(vcombine_s8(
680 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
681
682 u1_temp_8x8x2_t.val[0] =
683 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
684 u1_temp_8x8x2_t.val[1] =
685 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
686 u1_temp_8x8_t0 =
687 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0));
688 u1_temp_8x8_t1 =
689 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0));
690 inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
691 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
692
693 u1_temp_8x8x2_t.val[0] =
694 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
695 u1_temp_8x8x2_t.val[1] =
696 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
697 u1_temp_8x8_t0 =
698 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_1));
699 u1_temp_8x8_t1 =
700 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_1));
701 inp_16x8_r1_1 = vreinterpretq_s16_s8(vcombine_s8(
702 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
703
704 u1_temp_8x8x2_t.val[0] =
705 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
706 u1_temp_8x8x2_t.val[1] =
707 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
708 u1_temp_8x8_t0 =
709 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_0));
710 u1_temp_8x8_t1 =
711 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_0));
712 inp_16x8_r2_0 = vreinterpretq_s16_s8(vcombine_s8(
713 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
714
715 u1_temp_8x8x2_t.val[0] =
716 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_1)));
717 u1_temp_8x8x2_t.val[1] =
718 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_1)));
719 u1_temp_8x8_t0 =
720 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r2_1));
721 u1_temp_8x8_t1 =
722 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r2_1));
723 inp_16x8_r2_1 = vreinterpretq_s16_s8(vcombine_s8(
724 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
725
726 u1_temp_8x8x2_t.val[0] =
727 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_2)));
728 u1_temp_8x8x2_t.val[1] =
729 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_2)));
730 u1_temp_8x8_t0 =
731 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_0));
732 u1_temp_8x8_t1 =
733 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_0));
734 inp_16x8_r3_0 = vreinterpretq_s16_s8(vcombine_s8(
735 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
736
737 u1_temp_8x8x2_t.val[0] =
738 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_3)));
739 u1_temp_8x8x2_t.val[1] =
740 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_3)));
741 u1_temp_8x8_t0 =
742 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r3_1));
743 u1_temp_8x8_t1 =
744 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r3_1));
745 inp_16x8_r3_1 = vreinterpretq_s16_s8(vcombine_s8(
746 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
747
748 inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0);
749 inp_16x4_r0_1 = vget_low_s16(inp_16x8_r0_1);
750 inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0);
751 inp_16x4_r1_1 = vget_low_s16(inp_16x8_r1_1);
752
753 inp_16x4_r2_0 = vget_low_s16(inp_16x8_r2_0);
754 inp_16x4_r2_1 = vget_low_s16(inp_16x8_r2_1);
755 inp_16x4_r3_0 = vget_low_s16(inp_16x8_r3_0);
756 inp_16x4_r3_1 = vget_low_s16(inp_16x8_r3_1);
757
758 out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0));
759 out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0,
760 vget_low_s16(ip_filt_16x8_r1_0));
761 out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r2_0,
762 vget_low_s16(ip_filt_16x8_r2_0));
763 out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r3_0,
764 vget_low_s16(ip_filt_16x8_r3_0));
765 out_res_32x4_r0_l_1 =
766 vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0));
767 out_res_32x4_r0_l_1 =
768 vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0),
769 vget_high_s16(ip_filt_16x8_r1_0));
770 out_res_32x4_r0_l_1 =
771 vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r2_0),
772 vget_high_s16(ip_filt_16x8_r2_0));
773 out_res_32x4_r0_l_1 =
774 vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r3_0),
775 vget_high_s16(ip_filt_16x8_r3_0));
776
777 out_res_32x4_r0_h_0 = vmull_s16(inp_16x4_r0_1, vget_low_s16(ip_filt_16x8_r0_1));
778 out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r1_1,
779 vget_low_s16(ip_filt_16x8_r1_1));
780 out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r2_1,
781 vget_low_s16(ip_filt_16x8_r2_1));
782 out_res_32x4_r0_h_0 = vmlal_s16(out_res_32x4_r0_h_0, inp_16x4_r3_1,
783 vget_low_s16(ip_filt_16x8_r3_1));
784
785 out_res_32x4_r0_h_1 =
786 vmull_s16(vget_high_s16(inp_16x8_r0_1), vget_high_s16(ip_filt_16x8_r0_1));
787 out_res_32x4_r0_h_1 =
788 vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r1_1),
789 vget_high_s16(ip_filt_16x8_r1_1));
790 out_res_32x4_r0_h_1 =
791 vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r2_1),
792 vget_high_s16(ip_filt_16x8_r2_1));
793 out_res_32x4_r0_h_1 =
794 vmlal_s16(out_res_32x4_r0_h_1, vget_high_s16(inp_16x8_r3_1),
795 vget_high_s16(ip_filt_16x8_r3_1));
796
797 out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10);
798 out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10);
799 out_res_16x4_r0_h_0 = vqrshrun_n_s32(out_res_32x4_r0_h_0, 10);
800 out_res_16x4_r0_h_1 = vqrshrun_n_s32(out_res_32x4_r0_h_1, 10);
801
802 out_res_8x8_r0_l =
803 vqmovn_u16(vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1));
804 out_res_8x8_r0_h =
805 vqmovn_u16(vcombine_u16(out_res_16x4_r0_h_0, out_res_16x4_r0_h_1));
806 vst1q_u8((pu1_out + (i4_y * i4_out_stride)),
807 vcombine_u8(out_res_8x8_r0_l, out_res_8x8_r0_h));
808 }
809 }
810 }
811 }
812 else
813 {
814 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
815 {
816 arr_y_ref_pos_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos;
817 arr_phase_luma[i4_y] = (UWORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
818 }
819 pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
820 pi4_phase_luma = arr_phase_luma;
821
822 {
823 uint8x8_t inp_8x8_r0, inp_8x8_r0_1;
824 uint8x8_t inp_8x8_r1, inp_8x8_r1_1;
825 int16x8_t out_res_16x8_r0_0, out_res_16x8_r0_1;
826
827 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
828 {
829 pu1_refarray_temp =
830 pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
831 inp_8x8_r0 = vld1_u8((pu1_refarray_temp));
832 inp_8x8_r1 = vld1_u8((pu1_refarray_temp + i4_refarray_wd));
833
834 inp_8x8_r0_1 = vld1_u8((pu1_refarray_temp + 8));
835 inp_8x8_r1_1 = vld1_u8((pu1_refarray_temp + 8 + i4_refarray_wd));
836
837 out_res_16x8_r0_0 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0)),
838 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
839 out_res_16x8_r0_0 =
840 vmlaq_n_s16(out_res_16x8_r0_0, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1)),
841 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
842
843 out_res_16x8_r0_1 = vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r0_1)),
844 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
845 out_res_16x8_r0_1 =
846 vmlaq_n_s16(out_res_16x8_r0_1, vreinterpretq_s16_u16(vmovl_u8(inp_8x8_r1_1)),
847 g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
848
849 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
850 out_res_16x8_r0_0);
851 vst1q_s16((pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1) + 8),
852 out_res_16x8_r0_1);
853 }
854 }
855
856 {
857 WORD32 strt_indx = 10;
858
859 uint8x16_t phs_mask_8x8_0;
860 uint8x16_t x_ref_pos_luma_mask_r0_0;
861 uint8x16_t x_ref_pos_luma_mask_r1_0;
862
863 WORD32 i4_x2 = 0;
864 uint8x16_t twos = vdupq_n_u8(2);
865 strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos;
866
867 for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
868 {
869 arr_x_ref_pos_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
870 arr_phase_luma[i4_x] = ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
871 arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx;
872 i4_x2 = i4_x << 1;
873 arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
874 arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
875 }
876
877 pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
878 pi4_phase_luma = arr_phase_luma;
879
880 phs_mask_8x8_0 = vld1q_u8(pi4_phase_luma);
881 x_ref_pos_luma_mask_r0_0 = vld1q_u8(pi4_x_ref_pos_luma_low);
882 x_ref_pos_luma_mask_r1_0 = vaddq_u8(x_ref_pos_luma_mask_r0_0, twos);
883
884 {
885 uint8x16_t ip_filt_8x16_r0;
886 uint8x16_t ip_filt_8x16_r1;
887 int16x8_t ip_filt_16x8_r0_0;
888 int16x8_t ip_filt_16x8_r1_0;
889 int16x8_t inp_16x8_0;
890 int16x8_t inp_16x8_r0_0;
891 int16x8_t inp_16x8_r1_0;
892 int16x4_t inp_16x4_r0_0;
893 int16x4_t inp_16x4_r1_0;
894 int32x4_t out_res_32x4_r0_l_0;
895 int32x4_t out_res_32x4_r0_l_1;
896 uint16x4_t out_res_16x4_r0_l_0;
897 uint16x4_t out_res_16x4_r0_l_1;
898 uint16x8_t out_res_16x8_r0_l;
899 uint8x16_t out_8x16_r0;
900 uint8x8x2_t u1_incr_8x8x2_t;
901 uint8x8_t u1_incr_8x8_t0, u1_incr_8x8_t1;
902 uint8x8x2_t u1_temp_8x8x2_t;
903 uint8x8_t u1_temp_8x8_t0, u1_temp_8x8_t1;
904 uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));
905
906 ip_filt_8x16_r0 = vld1q_u8((g_au1_interp_filter_chroma));
907 ip_filt_8x16_r1 = vld1q_u8((g_au1_interp_filter_chroma + 16));
908
909 u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r0);
910 u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r0);
911 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
912 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
913 ip_filt_8x16_r0 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
914
915 u1_incr_8x8x2_t.val[0] = vget_low_u8(ip_filt_8x16_r1);
916 u1_incr_8x8x2_t.val[1] = vget_high_u8(ip_filt_8x16_r1);
917 u1_incr_8x8_t0 = vtbl2_u8(u1_incr_8x8x2_t, vget_low_u8(phs_mask_8x8_0));
918 u1_incr_8x8_t1 = vtbl2_u8(u1_incr_8x8x2_t, vget_high_u8(phs_mask_8x8_0));
919 ip_filt_8x16_r1 = vcombine_u8(u1_incr_8x8_t0, u1_incr_8x8_t1);
920
921 ip_filt_16x8_r0_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r0)));
922 ip_filt_16x8_r1_0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ip_filt_8x16_r1)));
923
924 for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
925 {
926 inp_16x8_0 = vld1q_s16((pi2_interp_buff_temp + strt_indx));
927 pi2_interp_buff_temp += i4_refarray_wd;
928 u1_temp_8x8x2_t.val[0] =
929 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
930 u1_temp_8x8x2_t.val[1] =
931 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
932 u1_temp_8x8_t0 =
933 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r0_0));
934 u1_temp_8x8_t1 =
935 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r0_0));
936 inp_16x8_r0_0 = vreinterpretq_s16_s8(vcombine_s8(
937 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
938
939 u1_temp_8x8x2_t.val[0] =
940 vreinterpret_u8_s8(vget_low_s8(vreinterpretq_s8_s16(inp_16x8_0)));
941 u1_temp_8x8x2_t.val[1] =
942 vreinterpret_u8_s8(vget_high_s8(vreinterpretq_s8_s16(inp_16x8_0)));
943 u1_temp_8x8_t0 =
944 vtbl2_u8(u1_temp_8x8x2_t, vget_low_u8(x_ref_pos_luma_mask_r1_0));
945 u1_temp_8x8_t1 =
946 vtbl2_u8(u1_temp_8x8x2_t, vget_high_u8(x_ref_pos_luma_mask_r1_0));
947 inp_16x8_r1_0 = vreinterpretq_s16_s8(vcombine_s8(
948 vreinterpret_s8_u8(u1_temp_8x8_t0), vreinterpret_s8_u8(u1_temp_8x8_t1)));
949 inp_16x4_r0_0 = vget_low_s16(inp_16x8_r0_0);
950 inp_16x4_r1_0 = vget_low_s16(inp_16x8_r1_0);
951
952 out_res_32x4_r0_l_0 = vmull_s16(inp_16x4_r0_0, vget_low_s16(ip_filt_16x8_r0_0));
953 out_res_32x4_r0_l_0 = vmlal_s16(out_res_32x4_r0_l_0, inp_16x4_r1_0,
954 vget_low_s16(ip_filt_16x8_r1_0));
955 out_res_32x4_r0_l_1 =
956 vmull_s16(vget_high_s16(inp_16x8_r0_0), vget_high_s16(ip_filt_16x8_r0_0));
957 out_res_32x4_r0_l_1 =
958 vmlal_s16(out_res_32x4_r0_l_1, vget_high_s16(inp_16x8_r1_0),
959 vget_high_s16(ip_filt_16x8_r1_0));
960
961 out_res_16x4_r0_l_0 = vqrshrun_n_s32(out_res_32x4_r0_l_0, 10);
962 out_res_16x4_r0_l_1 = vqrshrun_n_s32(out_res_32x4_r0_l_1, 10);
963 out_res_16x8_r0_l = vcombine_u16(out_res_16x4_r0_l_0, out_res_16x4_r0_l_1);
964 out_8x16_r0 = vld1q_u8(pu1_out + (i4_y * i4_out_stride));
965 out_8x16_r0 = vbslq_u8(chroma_mask_8x16,
966 vreinterpretq_u8_u16(out_res_16x8_r0_l), out_8x16_r0);
967 vst1q_u8((pu1_out + (i4_y * i4_out_stride)), out_8x16_r0);
968 }
969 }
970 }
971 }
972 return;
973 } /* End of Interpolation Function */
974
975 /*****************************************************************************/
976 /* */
977 /* Function Name : isvcd_horz_interpol_chroma_dyadic_1_neonintr */
978 /* */
979 /* Description : This function takes the reference array buffer & performs*/
980 /* interpolation of a component to find the intra */
981 /* resampled value */
/*  Inputs         : pi2_tmp_filt_buf : ptr to the vertically interpolated */
/*                                      6x8 intermediate sample buffer     */
/*                   pu1_out_buf : output buffer pointer                   */
/*                   i4_out_stride : output buffer stride                  */
/*                   i4_phase_0 : x phase for even values of x             */
/*                   i4_phase_1 : x phase for odd values of x              */
989 /* Globals : none */
990 /* Processing : it does the interpolation on horizontal direction */
991 /* Outputs : resampled pixels */
992 /* Returns : none */
993 /* */
994 /* Issues : none */
995 /* */
996 /* Revision History: */
997 /* */
998 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
999 /* 26 06 2009 vijayakumar creation */
1000 /* */
1001 /*****************************************************************************/
1002
isvcd_horz_interpol_chroma_dyadic_1_neonintr(WORD16 * pi2_tmp_filt_buf,UWORD8 * pu1_out_buf,WORD32 i4_out_stride,WORD32 i4_phase_0,WORD32 i4_phase_1)1003 void isvcd_horz_interpol_chroma_dyadic_1_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
1004 WORD32 i4_out_stride, WORD32 i4_phase_0,
1005 WORD32 i4_phase_1)
1006 {
1007 WORD32 i4_y;
1008 WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1009 WORD32 i4_filt_stride, i4_dst_stride;
1010 UWORD8 *pu1_out;
1011 WORD16 *pi2_tmp;
1012
1013 int16x8_t i4_samp_horz_16x8_r0_0, i4_samp_horz_16x8_r0_1, i4_samp_horz_16x8_r0_2;
1014 int16x8_t i4_samp_horz_16x8_r1_0, i4_samp_horz_16x8_r1_1, i4_samp_horz_16x8_r1_2;
1015 int16x8_t i4_rslt_horz_r0_1, i4_rslt_horz_r0_2;
1016 int16x8_t i4_rslt_horz_r1_1, i4_rslt_horz_r1_2;
1017
1018 int16x8_t final_horz_16x8_r0_1;
1019 int16x8_t final_horz_16x8_r1_1;
1020
1021 uint8x16_t i4_out_horz_8x16_r0, i4_out_horz_8x16_r1;
1022 uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));
1023
1024 i4_coeff_0 = 8 - i4_phase_0;
1025 i4_coeff_1 = i4_phase_0;
1026 i4_coeff_2 = 8 - i4_phase_1;
1027 i4_coeff_3 = i4_phase_1;
1028
1029 pu1_out = pu1_out_buf;
1030 pi2_tmp = pi2_tmp_filt_buf;
1031 i4_filt_stride = 6;
1032 i4_dst_stride = i4_out_stride;
1033
1034 /* Horizontal interpolation */
1035 for(i4_y = 0; i4_y < 8; i4_y += 2)
1036 {
1037 i4_samp_horz_16x8_r0_0 = vld1q_s16(pi2_tmp); // a0 a1 a2 a3 a4 a5 a6 a7
1038 i4_samp_horz_16x8_r0_1 = vld1q_s16(pi2_tmp + 1); // a1 a2 a3 a4
1039 i4_samp_horz_16x8_r0_2 = vld1q_s16(pi2_tmp + 2); // a2 a3 a4 a5
1040
1041 i4_samp_horz_16x8_r1_0 = vld1q_s16(pi2_tmp + i4_filt_stride);
1042 i4_samp_horz_16x8_r1_1 = vld1q_s16(pi2_tmp + i4_filt_stride + 1);
1043 i4_samp_horz_16x8_r1_2 = vld1q_s16(pi2_tmp + (i4_filt_stride + 2));
1044
1045 i4_rslt_horz_r0_1 =
1046 vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_0); // a0c0 a1c0 a2c0 a3c0
1047
1048 i4_rslt_horz_r0_2 =
1049 vmulq_n_s16(i4_samp_horz_16x8_r0_1, i4_coeff_2); // a1c2 a2c2 a3c2 a4c2
1050 i4_rslt_horz_r0_1 = vmlaq_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x8_r0_1,
1051 i4_coeff_1); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
1052
1053 i4_rslt_horz_r0_2 = vmlaq_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x8_r0_2,
1054 i4_coeff_3); // a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3
1055
1056 i4_rslt_horz_r1_1 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_0);
1057 i4_rslt_horz_r1_2 = vmulq_n_s16(i4_samp_horz_16x8_r1_1, i4_coeff_2);
1058
1059 i4_rslt_horz_r1_1 = vmlaq_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x8_r1_1, i4_coeff_1);
1060 i4_rslt_horz_r1_2 = vmlaq_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x8_r1_2, i4_coeff_3);
1061
1062 final_horz_16x8_r0_1 = vzipq_s16(i4_rslt_horz_r0_1, i4_rslt_horz_r0_2).val[0];
1063 final_horz_16x8_r1_1 = vzipq_s16(i4_rslt_horz_r1_1, i4_rslt_horz_r1_2).val[0];
1064
1065 final_horz_16x8_r0_1 = vrshrq_n_s16(final_horz_16x8_r0_1, 6);
1066
1067 final_horz_16x8_r1_1 = vrshrq_n_s16(final_horz_16x8_r1_1, 6);
1068
1069 i4_out_horz_8x16_r0 = vld1q_u8(pu1_out);
1070 i4_out_horz_8x16_r1 = vld1q_u8(pu1_out + i4_dst_stride);
1071
1072 i4_out_horz_8x16_r0 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r0_1),
1073 i4_out_horz_8x16_r0);
1074 i4_out_horz_8x16_r1 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r1_1),
1075 i4_out_horz_8x16_r1);
1076
1077 vst1q_u8(pu1_out, i4_out_horz_8x16_r0);
1078 vst1q_u8(pu1_out + i4_dst_stride, i4_out_horz_8x16_r1);
1079
1080 /* Incrementing ptr */
1081 pi2_tmp += (i4_filt_stride << 1);
1082 pu1_out += (i4_dst_stride << 1);
1083
1084 } /* End of loop over y */
1085 }
1086
1087 /*****************************************************************************/
1088 /* */
1089 /* Function Name : isvcd_horz_interpol_chroma_dyadic_2_neonintr */
1090 /* */
/*  Description   : This function takes the vertically interpolated data  */
/*                  buffer & performs horizontal intra resampling for     */
/*                  dyadic scaling ratios for chroma for the following    */
/*                  ref_lyr_chroma_phase_x_plus1_flag and                 */
/*                  chroma_phase_x_plus1_flag:                            */
/*                      ref_lyr        cur_lyr                            */
/*                         0              1                               */
/*                         0              2                               */
/*  Inputs        : pi2_tmp_filt_buf : ptr to the vertically interpolated */
/*                                     6x8 intermediate sample buffer     */
/*                  pu1_out_buf : output buffer pointer                   */
/*                  i4_out_stride : output buffer stride                  */
/*                  i4_phase_0 : x phase for even values of x             */
/*                  i4_phase_1 : x phase for odd values of x              */
/*  Globals       : none                                                  */
/*  Processing    : it does the interpolation in horizontal direction     */
/*  Outputs       : horizontally resampled samples                        */
1106 /* Returns : none */
1107 /* */
1108 /* Issues : none */
1109 /* */
1110 /* Revision History: */
1111 /* */
1112 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1113 /* 21 05 2021 Dolan creation */
1114 /* */
1115 /*****************************************************************************/
isvcd_horz_interpol_chroma_dyadic_2_neonintr(WORD16 * pi2_tmp_filt_buf,UWORD8 * pu1_out_buf,WORD32 i4_out_stride,WORD32 i4_phase_0,WORD32 i4_phase_1)1116 void isvcd_horz_interpol_chroma_dyadic_2_neonintr(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
1117 WORD32 i4_out_stride, WORD32 i4_phase_0,
1118 WORD32 i4_phase_1)
1119 {
1120 WORD32 i4_y;
1121 WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1122 WORD32 i4_filt_stride, i4_dst_stride;
1123 UWORD8 *pu1_out;
1124 WORD16 *pi2_tmp;
1125
1126 int16x8_t i4_samp_horz_16x8_r0_0, i4_samp_horz_16x8_r0_1;
1127 int16x8_t i4_samp_horz_16x8_r1_0, i4_samp_horz_16x8_r1_1;
1128 int16x8_t i4_rslt_horz_r0_1, i4_rslt_horz_r0_2;
1129 int16x8_t i4_rslt_horz_r1_1, i4_rslt_horz_r1_2;
1130
1131 int16x8_t final_horz_16x8_r0_1;
1132 int16x8_t final_horz_16x8_r1_1;
1133
1134 uint8x16_t i4_out_horz_8x16_r0, i4_out_horz_8x16_r1;
1135 uint8x16_t chroma_mask_8x16 = vreinterpretq_u8_u16(vdupq_n_u16(0x00ff));
1136
1137 i4_coeff_0 = 8 - i4_phase_0;
1138 i4_coeff_1 = i4_phase_0;
1139 i4_coeff_2 = 8 - i4_phase_1;
1140 i4_coeff_3 = i4_phase_1;
1141
1142 pu1_out = pu1_out_buf;
1143 pi2_tmp = pi2_tmp_filt_buf + 1;
1144 i4_filt_stride = 6;
1145 i4_dst_stride = i4_out_stride;
1146
1147 /* Horizontal interpolation */
1148 for(i4_y = 0; i4_y < 8; i4_y += 2)
1149 {
1150 i4_samp_horz_16x8_r0_0 = vld1q_s16(pi2_tmp); // a0 a1 a2 a3 a4 a5 a6 a7
1151 i4_samp_horz_16x8_r0_1 = vld1q_s16(pi2_tmp + 1); // a1 a2 a3 a4
1152
1153 i4_samp_horz_16x8_r1_0 = vld1q_s16(pi2_tmp + i4_filt_stride);
1154 i4_samp_horz_16x8_r1_1 = vld1q_s16(pi2_tmp + i4_filt_stride + 1);
1155
1156 i4_rslt_horz_r0_1 =
1157 vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_0); // a0c0 a1c0 a2c0 a3c0
1158
1159 i4_rslt_horz_r0_2 =
1160 vmulq_n_s16(i4_samp_horz_16x8_r0_0, i4_coeff_2); // a1c2 a2c2 a3c2 a4c2
1161 i4_rslt_horz_r0_1 = vmlaq_n_s16(i4_rslt_horz_r0_1, i4_samp_horz_16x8_r0_1,
1162 i4_coeff_1); // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
1163
1164 i4_rslt_horz_r0_2 = vmlaq_n_s16(i4_rslt_horz_r0_2, i4_samp_horz_16x8_r0_1,
1165 i4_coeff_3); // a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3
1166
1167 i4_rslt_horz_r1_1 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_0);
1168 i4_rslt_horz_r1_2 = vmulq_n_s16(i4_samp_horz_16x8_r1_0, i4_coeff_2);
1169
1170 i4_rslt_horz_r1_1 = vmlaq_n_s16(i4_rslt_horz_r1_1, i4_samp_horz_16x8_r1_1, i4_coeff_1);
1171 i4_rslt_horz_r1_2 = vmlaq_n_s16(i4_rslt_horz_r1_2, i4_samp_horz_16x8_r1_1, i4_coeff_3);
1172
1173 final_horz_16x8_r0_1 = vzipq_s16(i4_rslt_horz_r0_1, i4_rslt_horz_r0_2).val[0];
1174 final_horz_16x8_r1_1 = vzipq_s16(i4_rslt_horz_r1_1, i4_rslt_horz_r1_2).val[0];
1175
1176 final_horz_16x8_r0_1 = vrshrq_n_s16(final_horz_16x8_r0_1, 6);
1177
1178 final_horz_16x8_r1_1 = vrshrq_n_s16(final_horz_16x8_r1_1, 6);
1179
1180 i4_out_horz_8x16_r0 = vld1q_u8(pu1_out);
1181 i4_out_horz_8x16_r1 = vld1q_u8(pu1_out + i4_dst_stride);
1182
1183 i4_out_horz_8x16_r0 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r0_1),
1184 i4_out_horz_8x16_r0);
1185 i4_out_horz_8x16_r1 = vbslq_u8(chroma_mask_8x16, vreinterpretq_u8_s16(final_horz_16x8_r1_1),
1186 i4_out_horz_8x16_r1);
1187
1188 vst1q_u8(pu1_out, i4_out_horz_8x16_r0);
1189 vst1q_u8(pu1_out + i4_dst_stride, i4_out_horz_8x16_r1);
1190
1191 /* Incrementing ptr */
1192 pi2_tmp += (i4_filt_stride << 1);
1193 pu1_out += (i4_dst_stride << 1);
1194
1195 } /* End of loop over y */
1196 }
1197
1198 /*****************************************************************************/
1199 /* */
1200 /* Function Name : isvcd_vert_interpol_chroma_dyadic_1_neonintr */
1201 /* */
1202 /* Description : This function takes the reference array buffer & performs*/
1203 /* vertical intra resampling for dyadic scaling ratios for */
1204 /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1205 /* chroma_phase_y_plus1: */
1206 /* ref_lyr cur_lyr */
1207 /* 2 0 */
1208 /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
1209 /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold */
1210 /* vertically interpolated data */
1211 /* i4_phase_0 : y phase for even values of y */
1212 /* i4_phase_1 : y phase for odd values of y */
1213 /* Globals : none */
1214 /* Processing : it does the interpolation in vertical direction */
1215 /* Outputs : vertically resampled samples */
1216 /* Returns : none */
1217 /* */
1218 /* Issues : none */
1219 /* */
1220 /* Revision History: */
1221 /* */
1222 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1223 /* 21 05 2021 Dolan creation */
1224 /* */
1225 /*****************************************************************************/
isvcd_vert_interpol_chroma_dyadic_1_neonintr(UWORD8 * pu1_inp_buf,WORD16 * pi2_tmp_filt_buf,WORD32 i4_phase_0,WORD32 i4_phase_1)1226 void isvcd_vert_interpol_chroma_dyadic_1_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
1227 WORD32 i4_phase_0, WORD32 i4_phase_1)
1228 {
1229 WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1230 WORD32 i4_src_stride;
1231 UWORD8 *pu1_inp;
1232 WORD16 *pi2_tmp;
1233
1234 uint8x8_t i4_samp_vert_8x8_r0, i4_samp_vert_8x8_r1, i4_samp_vert_8x8_r2;
1235 uint8x8_t i4_samp_vert_8x8_r3, i4_samp_vert_8x8_r4, i4_samp_vert_8x8_r5;
1236 int16x8_t i4_rslt_vert_16x8_r0, i4_rslt_vert_16x8_r1, i4_rslt_vert_16x8_r2,
1237 i4_rslt_vert_16x8_r3;
1238 int16x8_t i4_rslt_vert_16x8_r4, i4_rslt_vert_16x8_r5, i4_rslt_vert_16x8_r6,
1239 i4_rslt_vert_16x8_r7;
1240
1241 i4_coeff_0 = 8 - i4_phase_0;
1242 i4_coeff_1 = i4_phase_0;
1243 i4_coeff_2 = 8 - i4_phase_1;
1244 i4_coeff_3 = i4_phase_1;
1245
1246 pu1_inp = pu1_inp_buf;
1247 pi2_tmp = pi2_tmp_filt_buf;
1248 i4_src_stride = DYADIC_REF_W_C;
1249
1250 /* Vertical interpolation */
1251 i4_samp_vert_8x8_r0 = vld1_u8(pu1_inp);
1252 pu1_inp += i4_src_stride;
1253 i4_samp_vert_8x8_r1 = vld1_u8(pu1_inp);
1254 pu1_inp += i4_src_stride;
1255 i4_samp_vert_8x8_r2 = vld1_u8(pu1_inp);
1256 pu1_inp += i4_src_stride;
1257 i4_samp_vert_8x8_r3 = vld1_u8(pu1_inp);
1258 pu1_inp += i4_src_stride;
1259 i4_samp_vert_8x8_r4 = vld1_u8(pu1_inp);
1260 pu1_inp += i4_src_stride;
1261 i4_samp_vert_8x8_r5 = vld1_u8(pu1_inp);
1262 pu1_inp += i4_src_stride;
1263
1264 i4_rslt_vert_16x8_r0 =
1265 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_0);
1266 i4_rslt_vert_16x8_r0 = vmlaq_n_s16(
1267 i4_rslt_vert_16x8_r0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_1);
1268 vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_r0);
1269
1270 i4_rslt_vert_16x8_r1 =
1271 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_2);
1272 i4_rslt_vert_16x8_r1 = vmlaq_n_s16(
1273 i4_rslt_vert_16x8_r1, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_3);
1274 vst1q_s16(pi2_tmp + 6, i4_rslt_vert_16x8_r1);
1275
1276 i4_rslt_vert_16x8_r2 =
1277 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_0);
1278 i4_rslt_vert_16x8_r2 = vmlaq_n_s16(
1279 i4_rslt_vert_16x8_r2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_1);
1280 vst1q_s16(pi2_tmp + 12, i4_rslt_vert_16x8_r2);
1281
1282 i4_rslt_vert_16x8_r3 =
1283 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_2);
1284 i4_rslt_vert_16x8_r3 = vmlaq_n_s16(
1285 i4_rslt_vert_16x8_r3, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_3);
1286 vst1q_s16(pi2_tmp + 18, i4_rslt_vert_16x8_r3);
1287
1288 i4_rslt_vert_16x8_r4 =
1289 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_0);
1290 i4_rslt_vert_16x8_r4 = vmlaq_n_s16(
1291 i4_rslt_vert_16x8_r4, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_1);
1292 vst1q_s16(pi2_tmp + 24, i4_rslt_vert_16x8_r4);
1293
1294 i4_rslt_vert_16x8_r5 =
1295 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_2);
1296 i4_rslt_vert_16x8_r5 = vmlaq_n_s16(
1297 i4_rslt_vert_16x8_r5, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_3);
1298 vst1q_s16(pi2_tmp + 30, i4_rslt_vert_16x8_r5);
1299
1300 i4_rslt_vert_16x8_r6 =
1301 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_0);
1302 i4_rslt_vert_16x8_r6 = vmlaq_n_s16(
1303 i4_rslt_vert_16x8_r6, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_1);
1304 vst1q_s16(pi2_tmp + 36, i4_rslt_vert_16x8_r6);
1305
1306 i4_rslt_vert_16x8_r7 =
1307 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_2);
1308 i4_rslt_vert_16x8_r7 = vmlaq_n_s16(
1309 i4_rslt_vert_16x8_r7, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r5)), i4_coeff_3);
1310 vst1_s16(pi2_tmp + 42, vget_low_s16(i4_rslt_vert_16x8_r7));
1311 vst1q_lane_s16(pi2_tmp + 46, i4_rslt_vert_16x8_r7, 4);
1312 vst1q_lane_s16(pi2_tmp + 47, i4_rslt_vert_16x8_r7, 5);
1313 }
1314
1315 /*****************************************************************************/
1316 /* */
1317 /* Function Name : isvcd_vert_interpol_chroma_dyadic_2_neonintr */
1318 /* */
1319 /* Description : This function takes the reference array buffer & performs*/
1320 /* vertical intra resampling for dyadic scaling ratios for */
1321 /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1322 /* chroma_phase_y_plus1: */
1323 /* ref_lyr cur_lyr */
1324 /* 2 0 */
1325 /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
1326 /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
1327 /* vertically interpolated data */
1328 /* i4_phase_0 : y phase for even values of y */
1329 /* i4_phase_1 : y phase for odd values of y */
1330 /* Globals : none */
1331 /* Processing : it does the interpolation in vertical direction */
1332 /* Outputs : vertically resampled samples */
1333 /* Returns : none */
1334 /* */
1335 /* Issues : none */
1336 /* */
1337 /* Revision History: */
1338 /* */
1339 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1340 /* 21 05 2021 Dolan creation */
1341 /* */
1342 /*****************************************************************************/
isvcd_vert_interpol_chroma_dyadic_2_neonintr(UWORD8 * pu1_inp_buf,WORD16 * pi2_tmp_filt_buf,WORD32 i4_phase_0,WORD32 i4_phase_1)1343 void isvcd_vert_interpol_chroma_dyadic_2_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
1344 WORD32 i4_phase_0, WORD32 i4_phase_1)
1345 {
1346 WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1347 WORD32 i4_src_stride;
1348 UWORD8 *pu1_inp;
1349 WORD16 *pi2_tmp;
1350
1351 uint8x8_t i4_samp_vert_8x8_r0, i4_samp_vert_8x8_r1, i4_samp_vert_8x8_r2, i4_samp_vert_8x8_r3;
1352 uint8x8_t i4_samp_vert_8x8_r4;
1353 int16x8_t i4_rslt_vert_16x8_r0, i4_rslt_vert_16x8_r1, i4_rslt_vert_16x8_r2,
1354 i4_rslt_vert_16x8_r3;
1355 int16x8_t i4_rslt_vert_16x8_r4, i4_rslt_vert_16x8_r5, i4_rslt_vert_16x8_r6,
1356 i4_rslt_vert_16x8_r7;
1357
1358 i4_coeff_0 = 8 - i4_phase_0;
1359 i4_coeff_1 = i4_phase_0;
1360 i4_coeff_2 = 8 - i4_phase_1;
1361 i4_coeff_3 = i4_phase_1;
1362
1363 pi2_tmp = pi2_tmp_filt_buf;
1364 i4_src_stride = DYADIC_REF_W_C;
1365 pu1_inp = pu1_inp_buf + i4_src_stride;
1366
1367 /* Vertical interpolation */
1368 i4_samp_vert_8x8_r0 = vld1_u8(pu1_inp);
1369 pu1_inp += i4_src_stride;
1370 i4_samp_vert_8x8_r1 = vld1_u8(pu1_inp);
1371 pu1_inp += i4_src_stride;
1372 i4_samp_vert_8x8_r2 = vld1_u8(pu1_inp);
1373 pu1_inp += i4_src_stride;
1374 i4_samp_vert_8x8_r3 = vld1_u8(pu1_inp);
1375 pu1_inp += i4_src_stride;
1376 i4_samp_vert_8x8_r4 = vld1_u8(pu1_inp);
1377 pu1_inp += i4_src_stride;
1378
1379 /* since y_phase = phase_0 for y = 0 */
1380 i4_rslt_vert_16x8_r0 =
1381 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_0);
1382 i4_rslt_vert_16x8_r0 = vmlaq_n_s16(
1383 i4_rslt_vert_16x8_r0, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_1);
1384 vst1q_s16(pi2_tmp, i4_rslt_vert_16x8_r0);
1385
1386 i4_rslt_vert_16x8_r1 =
1387 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r0)), i4_coeff_2);
1388 i4_rslt_vert_16x8_r1 = vmlaq_n_s16(
1389 i4_rslt_vert_16x8_r1, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_3);
1390 vst1q_s16(pi2_tmp + 6, i4_rslt_vert_16x8_r1);
1391
1392 i4_rslt_vert_16x8_r2 =
1393 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_0);
1394 i4_rslt_vert_16x8_r2 = vmlaq_n_s16(
1395 i4_rslt_vert_16x8_r2, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_1);
1396 vst1q_s16(pi2_tmp + 12, i4_rslt_vert_16x8_r2);
1397
1398 i4_rslt_vert_16x8_r3 =
1399 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r1)), i4_coeff_2);
1400 i4_rslt_vert_16x8_r3 = vmlaq_n_s16(
1401 i4_rslt_vert_16x8_r3, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_3);
1402 vst1q_s16(pi2_tmp + 18, i4_rslt_vert_16x8_r3);
1403
1404 i4_rslt_vert_16x8_r4 =
1405 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_0);
1406 i4_rslt_vert_16x8_r4 = vmlaq_n_s16(
1407 i4_rslt_vert_16x8_r4, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_1);
1408 vst1q_s16(pi2_tmp + 24, i4_rslt_vert_16x8_r4);
1409
1410 i4_rslt_vert_16x8_r5 =
1411 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r2)), i4_coeff_2);
1412 i4_rslt_vert_16x8_r5 = vmlaq_n_s16(
1413 i4_rslt_vert_16x8_r5, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_3);
1414 vst1q_s16(pi2_tmp + 30, i4_rslt_vert_16x8_r5);
1415
1416 i4_rslt_vert_16x8_r6 =
1417 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_0);
1418 i4_rslt_vert_16x8_r6 = vmlaq_n_s16(
1419 i4_rslt_vert_16x8_r6, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_1);
1420 vst1q_s16(pi2_tmp + 36, i4_rslt_vert_16x8_r6);
1421
1422 i4_rslt_vert_16x8_r7 =
1423 vmulq_n_s16(vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r3)), i4_coeff_2);
1424 i4_rslt_vert_16x8_r7 = vmlaq_n_s16(
1425 i4_rslt_vert_16x8_r7, vreinterpretq_s16_u16(vmovl_u8(i4_samp_vert_8x8_r4)), i4_coeff_3);
1426 vst1_s16(pi2_tmp + 42, vget_low_s16(i4_rslt_vert_16x8_r7));
1427
1428 vst1q_lane_s16(pi2_tmp + 46, i4_rslt_vert_16x8_r7, 4);
1429 vst1q_lane_s16(pi2_tmp + 47, i4_rslt_vert_16x8_r7, 5);
1430 }
1431
1432 /*****************************************************************************/
1433 /* */
1434 /* Function Name : isvcd_vert_interpol_chroma_dyadic_3_neonintr */
1435 /* */
1436 /* Description : This function takes the reference array buffer & performs*/
1437 /* vertical intra resampling for dyadic scaling ratios for */
1438 /* chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1439 /* chroma_phase_y_plus1: */
1440 /* ref_lyr cur_lyr */
1441 /* 2 0 */
1442 /* Inputs : pu1_inp_buf : ptr to the 6x6 reference sample buffer */
1443 /* pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the */
1444 /* vertically interpolated data */
1445 /* i4_phase_0 : y phase for even values of y */
1446 /* i4_phase_1 : y phase for odd values of y */
1447 /* Globals : none */
1448 /* Processing : it does the interpolation in vertical direction */
1449 /* Outputs : vertically resampled samples */
1450 /* Returns : none */
1451 /* */
1452 /* Issues : none */
1453 /* */
1454 /* Revision History: */
1455 /* */
1456 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1457 /* 21 05 2021 Dolan creation */
1458 /* */
1459 /*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_3_neonintr(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                                  WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_y;
    UWORD8 *pu1_src;
    WORD16 *pi2_out;
    int16x8_t i4_row_above_16x8, i4_row_below_16x8;
    int16x8_t i4_rslt_even_16x8, i4_rslt_odd_16x8;

    /* Two-tap filter weights: (8 - phase, phase) for even and odd output rows */
    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;

    pu1_src = pu1_inp_buf;
    pi2_out = pi2_tmp_filt_buf;

    /* Vertical interpolation: 5 reference rows produce 8 output rows.     */
    /* Widen the first reference row once; each loop iteration widens the  */
    /* next row and filters the (above, below) pair with both phases.      */
    i4_row_above_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_src)));
    pu1_src += DYADIC_REF_W_C;

    for(i4_y = 0; i4_y < 4; i4_y++)
    {
        i4_row_below_16x8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_src)));
        pu1_src += DYADIC_REF_W_C;

        /* Even output row: y_phase = phase_0 */
        i4_rslt_even_16x8 = vmulq_n_s16(i4_row_above_16x8, i4_coeff_0);
        i4_rslt_even_16x8 = vmlaq_n_s16(i4_rslt_even_16x8, i4_row_below_16x8, i4_coeff_1);
        /* 8-lane store: lanes 6-7 spill into the next row's start and are */
        /* overwritten by the following in-order store.                    */
        vst1q_s16(pi2_out, i4_rslt_even_16x8);
        pi2_out += 6;

        /* Odd output row: y_phase = phase_1 */
        i4_rslt_odd_16x8 = vmulq_n_s16(i4_row_above_16x8, i4_coeff_2);
        i4_rslt_odd_16x8 = vmlaq_n_s16(i4_rslt_odd_16x8, i4_row_below_16x8, i4_coeff_3);
        if(i4_y < 3)
        {
            vst1q_s16(pi2_out, i4_rslt_odd_16x8);
        }
        else
        {
            /* Final row of the 6x8 buffer: a full 8-lane store would run  */
            /* past index 47, so emit exactly 6 values (4 + 2 lanes).      */
            vst1_s16(pi2_out, vget_low_s16(i4_rslt_odd_16x8));
            vst1q_lane_s16(pi2_out + 4, i4_rslt_odd_16x8, 4);
            vst1q_lane_s16(pi2_out + 5, i4_rslt_odd_16x8, 5);
        }
        pi2_out += 6;

        /* Slide the row window down by one reference line */
        i4_row_above_16x8 = i4_row_below_16x8;
    }
}
1549