/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_iquant_itrans_residual_recon_neonintr.c
 *
 * @brief
 *  Contains definitions of functions for H.264 inverse quantization, inverse
 *  transformation and reconstruction
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_iquant_itrans_residual_recon_4x4_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_chroma_4x4_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_chroma_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_8x8_neonintr()
 *  - isvcd_iquant_itrans_residual_recon_8x8_dc_neonintr()
 *
 * @remarks
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_iquant_itrans_residual_recon.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_4x4_dc_neonintr       */
/*                                                                           */
/*  Description   : This function computes the recon output from the        */
/*                  IQ+IT+RESD                                               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : reconstructed samples in pu1_out                         */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_recon_4x4_dc_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t dup_val_1, dup_val_2, dup_abs;
    int16x8_t resd01, resd23, dup_max, dup_min;
    WORD32 i4_nnz;

    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    UNUSED(pi2_tmp);
    if(iq_start_idx == 0)
    {
        i4_iq_out_temp = pi2_src[0];

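        /*
         * INV_QUANT (ih264_trans_macros.h) dequantizes the DC coefficient.
         * Conceptually (a sketch of the expected semantics, not the literal
         * macro body):
         *   out = ((out * iscal * weight + rnd_fact) << u4_qp_div_6) >> 4
         */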
        INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        i4_iq_out_temp = pi2_dc_ld_addr[0];
    }

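    /* DC-only path: with a single nonzero (DC) coefficient the 4x4 inverse
       transform reduces to a constant, so (dc + 32) >> 6 is broadcast and
       added to all four residual rows below. */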
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred1_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred2_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred3_in = vld1_u8((uint8_t *) pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    resd0_in = vaddq_s16(resd0_in, temp_0);
    resd1_in = vaddq_s16(resd1_in, temp_0);
    resd2_in = vaddq_s16(resd2_in, temp_0);
    resd3_in = vaddq_s16(resd3_in, temp_0);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);

    resd01 = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd0_in)),
                                                vget_low_s64(vreinterpretq_s64_s16(resd1_in))));

    resd23 = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(resd2_in)),
                                                vget_low_s64(vreinterpretq_s64_s16(resd3_in))));

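    /* nnz check: resd01/resd23 pack the four valid samples of each row (the
       low 64 bits); a saturating add of their absolute values leaves a
       nonzero lane iff the clamped residual block has any nonzero sample. */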
    dup_val_1 = vabsq_s16(resd01);
    dup_val_2 = vabsq_s16(resd23);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred0_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd), vreinterpret_u32_u8(pred1_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 2), vreinterpret_u32_u8(pred2_in), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd * 3), vreinterpret_u32_u8(pred3_in), 0);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_chroma_4x4_dc_neonintr */
/*                                                                           */
/*  Description   : This function computes the recon output for the chroma  */
/*                  from IQ+IT+RESD                                          */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : reconstructed samples in pu1_out                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_residual_recon_chroma_4x4_dc_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in, dup_min, dup_max;

    uint8x8_t i4_out_horz_8x8_r0, i4_out_horz_8x8_r1, i4_out_horz_8x8_r2, i4_out_horz_8x8_r3;
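    /* Chroma samples are stored interleaved (Cb and Cr alternate byte-wise),
       so each 8-byte row holds the four samples of one component in its even
       bytes; the 0x00ff mask below selects exactly those bytes. */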
    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

    i4_iq_out_temp = pi2_dc_src[0];
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred1_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred2_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred3_in = vld1_u8((uint8_t *) pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    resd0_in = vaddq_s16(resd0_in, temp_0);
    resd1_in = vaddq_s16(resd1_in, temp_0);
    resd2_in = vaddq_s16(resd2_in, temp_0);
    resd3_in = vaddq_s16(resd3_in, temp_0);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);

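    /* Read-modify-write: load the existing output rows, bit-select the recon
       bytes into the masked (even) lanes only, and store back, leaving the
       other chroma component's bytes untouched. */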
    i4_out_horz_8x8_r0 = vld1_u8(pu1_out);
    i4_out_horz_8x8_r1 = vld1_u8(pu1_out + out_strd);
    i4_out_horz_8x8_r2 = vld1_u8(pu1_out + out_strd * 2);
    i4_out_horz_8x8_r3 = vld1_u8(pu1_out + out_strd * 3);

    i4_out_horz_8x8_r0 = vbsl_u8(chroma_mask_8x8, pred0_in, i4_out_horz_8x8_r0);
    i4_out_horz_8x8_r1 = vbsl_u8(chroma_mask_8x8, pred1_in, i4_out_horz_8x8_r1);
    i4_out_horz_8x8_r2 = vbsl_u8(chroma_mask_8x8, pred2_in, i4_out_horz_8x8_r2);
    i4_out_horz_8x8_r3 = vbsl_u8(chroma_mask_8x8, pred3_in, i4_out_horz_8x8_r3);

    vst1_u8((uint8_t *) (pu1_out), i4_out_horz_8x8_r0);
    vst1_u8((uint8_t *) (pu1_out + out_strd), i4_out_horz_8x8_r1);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), i4_out_horz_8x8_r2);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), i4_out_horz_8x8_r3);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_4x4_neonintr          */
/*                                                                           */
/*  Description   : This function computes the recon output from the        */
/*                  IQ+IT+RESD                                               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : reconstructed samples in pu1_out                         */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_recon_4x4_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x4_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t resd01_in, resd23_in, dup_min, dup_max;
    int16x8_t pred01_in, pred23_in;
    uint8x8_t pred01_un, pred23_un;
    WORD32 i4_nnz;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

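    /* vld4 de-interleaves the 16 coefficients: val[0]..val[3] hold columns
       0..3 of the 4x4 block, so the dequantization below and the first
       transform pass operate column-wise. */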
    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

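    /* For intra 16x16 (iq_start_idx == 1) the DC coefficient is decoded in a
       separate Hadamard-transformed DC block, so lane 0 is overwritten with
       the pre-computed DC value from pi2_dc_ld_addr. */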
    if(iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_ld_addr[0], q0_16x4, 0);
    }

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);       // q3 >> 1

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);    // x0 = q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);    // x1 = q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);   // x2 = q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);   // x3 = q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);   // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);   // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);   // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);   // x0-x3

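    /* Transpose the 4x4 block in two stages (16-bit then 32-bit transposes)
       so the vertical pass below can reuse the same row-wise butterfly. */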
    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);       // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 = q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 = q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 = q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    resd0_in = vld1_s16((int16_t *) pi2_rsd);
    resd1_in = vld1_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    resd0_in = vadd_s16(resd0_in, x0_16x4);
    resd1_in = vadd_s16(resd1_in, x1_16x4);
    resd2_in = vadd_s16(resd2_in, x2_16x4);
    resd3_in = vadd_s16(resd3_in, x3_16x4);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    resd01_in = vcombine_s16(resd0_in, resd1_in);
    resd23_in = vcombine_s16(resd2_in, resd3_in);

    resd01_in = vminq_s16(resd01_in, dup_max);
    resd01_in = vmaxq_s16(resd01_in, dup_min);
    resd23_in = vminq_s16(resd23_in, dup_max);
    resd23_in = vmaxq_s16(resd23_in, dup_min);

    dup_val_1 = vabsq_s16(resd01_in);
    dup_val_2 = vabsq_s16(resd23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    pred01_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(pred0)),
                                                   vget_low_s64(vreinterpretq_s64_s16(pred1))));

    pred23_in = vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s16(pred2)),
                                                   vget_low_s64(vreinterpretq_s64_s16(pred3))));

    pred01_in = vaddq_s16(pred01_in, resd01_in);
    pred23_in = vaddq_s16(pred23_in, resd23_in);

    pred01_un = vqmovun_s16(pred01_in);
    pred23_un = vqmovun_s16(pred23_in);

    vst1_lane_u32((uint32_t *) (pu1_out), vreinterpret_u32_u8(pred01_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + out_strd), vreinterpret_u32_u8(pred01_un), 1);
    vst1_lane_u32((uint32_t *) (pu1_out + (out_strd << 1)), vreinterpret_u32_u8(pred23_un), 0);
    vst1_lane_u32((uint32_t *) (pu1_out + ((out_strd << 1) + out_strd)),
                  vreinterpret_u32_u8(pred23_un), 1);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_chroma_4x4_neonintr   */
/*                                                                           */
/*  Description   : This function computes the recon output for the chroma  */
/*                  from IQ+IT+RESD                                          */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : reconstructed samples in pu1_out                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_residual_recon_chroma_4x4_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;

    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t rec0, rec1, rec2, rec3;
    uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in, dup_min, dup_max;
    uint8x8_t out0, out1, out2, out3;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2;
    int32x2x2_t x0_32x2_2, x1_32x2_2;

    uint8x8_t chroma_mask_8x8 = vreinterpret_u8_u16(vdup_n_u16(0x00ff));
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

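    /* The chroma DC coefficient is always taken from pi2_dc_src, where the
       separately decoded 2x2 Hadamard DC block has already been dequantized,
       hence the unconditional lane overwrite below. */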
    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);       // q3 >> 1

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);    // x0 = q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);    // x1 = q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);   // x2 = q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);   // x3 = q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);   // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);   // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);   // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);   // x0-x3

    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);       // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 = q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 = q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 = q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    x0_16x8 = vreinterpretq_s16_s32(vmovl_s16(x0_16x4));
    x1_16x8 = vreinterpretq_s16_s32(vmovl_s16(x1_16x4));
    x2_16x8 = vreinterpretq_s16_s32(vmovl_s16(x2_16x4));
    x3_16x8 = vreinterpretq_s16_s32(vmovl_s16(x3_16x4));
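    /* The widen-and-reinterpret above places each transform sample in the
       even 16-bit lanes, matching the interleaved CbCr residual layout; the
       odd lanes hold only sign-extension bits and are discarded by the final
       byte select on store. */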

    resd0_in = vaddq_s16(resd0_in, x0_16x8);
    resd1_in = vaddq_s16(resd1_in, x1_16x8);
    resd2_in = vaddq_s16(resd2_in, x2_16x8);
    resd3_in = vaddq_s16(resd3_in, x3_16x8);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pred1_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd));
    pred2_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd << 1));
    pred3_in = vld1_u8((uint8_t *) pu1_pred + (pred_strd * 3));

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));

    rec0 = vaddq_s16(pred0, resd0_in);
    rec1 = vaddq_s16(pred1, resd1_in);
    rec2 = vaddq_s16(pred2, resd2_in);
    rec3 = vaddq_s16(pred3, resd3_in);

    out0 = vld1_u8(pu1_out);
    out1 = vld1_u8(pu1_out + out_strd);
    out2 = vld1_u8(pu1_out + out_strd * 2);
    out3 = vld1_u8(pu1_out + out_strd * 3);

    rec0_un = vqmovun_s16(rec0);
    rec1_un = vqmovun_s16(rec1);
    rec2_un = vqmovun_s16(rec2);
    rec3_un = vqmovun_s16(rec3);

    out0 = vbsl_u8(chroma_mask_8x8, rec0_un, out0);
    out1 = vbsl_u8(chroma_mask_8x8, rec1_un, out1);
    out2 = vbsl_u8(chroma_mask_8x8, rec2_un, out2);
    out3 = vbsl_u8(chroma_mask_8x8, rec3_un, out3);

    vst1_u8((pu1_out), out0);
    vst1_u8((pu1_out + out_strd), out1);
    vst1_u8((pu1_out + (out_strd << 1)), out2);
    vst1_u8((pu1_out + ((out_strd << 1) + out_strd)), out3);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_8x8_neonintr          */
/*                                                                           */
/*  Description   : This function computes the recon output from the        */
/*                  IQ+IT+RESD                                               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : reconstructed samples in pu1_out                         */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_recon_8x8_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd_ptr, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    int32x4_t qp_div_32x4 = vdupq_n_s32(qp_div);
    int16x8_t iscal_16x8_0, iscal_16x8_1, iscal_16x8_2, iscal_16x8_3, iscal_16x8_4, iscal_16x8_5,
        iscal_16x8_6, iscal_16x8_7;
    int16x8_t weigh_16x8_0, weigh_16x8_1, weigh_16x8_2, weigh_16x8_3, weigh_16x8_4, weigh_16x8_5,
        weigh_16x8_6, weigh_16x8_7;

    int16x8_t src_16x8_0, src_16x8_1, src_16x8_2, src_16x8_3, src_16x8_4, src_16x8_5, src_16x8_6,
        src_16x8_7;
    int16x8_t coeff_mul_16x8_0, coeff_mul_16x8_1, coeff_mul_16x8_2, coeff_mul_16x8_3,
        coeff_mul_16x8_4, coeff_mul_16x8_5, coeff_mul_16x8_6, coeff_mul_16x8_7;

    int32x4_t quant_res_32x4_l_0, quant_res_32x4_l_1, quant_res_32x4_l_2, quant_res_32x4_l_3,
        quant_res_32x4_l_4, quant_res_32x4_l_5, quant_res_32x4_l_6, quant_res_32x4_l_7;
    int32x4_t quant_res_32x4_h_0, quant_res_32x4_h_1, quant_res_32x4_h_2, quant_res_32x4_h_3,
        quant_res_32x4_h_4, quant_res_32x4_h_5, quant_res_32x4_h_6, quant_res_32x4_h_7;
    int16x4_t quant_res_16x4_l_0, quant_res_16x4_l_1, quant_res_16x4_l_2, quant_res_16x4_l_3,
        quant_res_16x4_l_4, quant_res_16x4_l_5, quant_res_16x4_l_6, quant_res_16x4_l_7;
    int16x4_t quant_res_16x4_h_0, quant_res_16x4_h_1, quant_res_16x4_h_2, quant_res_16x4_h_3,
        quant_res_16x4_h_4, quant_res_16x4_h_5, quant_res_16x4_h_6, quant_res_16x4_h_7;

    int16x8_t quant_res_16x8_0, quant_res_16x8_1, quant_res_16x8_2, quant_res_16x8_3,
        quant_res_16x8_4, quant_res_16x8_5, quant_res_16x8_6, quant_res_16x8_7;

    int16x8_t trans_16x8_0, trans_16x8_1, trans_16x8_2, trans_16x8_3, trans_16x8_4, trans_16x8_5,
        trans_16x8_6, trans_16x8_7;
    int32x4_t trans_32x4_0, trans_32x4_1, trans_32x4_2, trans_32x4_3, trans_32x4_4, trans_32x4_5,
        trans_32x4_6, trans_32x4_7;
    int64x2_t trans_64x2_0, trans_64x2_1, trans_64x2_2, trans_64x2_3, trans_64x2_4, trans_64x2_5,
        trans_64x2_6, trans_64x2_7;
    int16x4_t trans_16x4_1_l, trans_16x4_3_l, trans_16x4_5_l, trans_16x4_7_l;
    int16x8_t rs_trans_16x8_1, rs_trans_16x8_2, rs_trans_16x8_3, rs_trans_16x8_5, rs_trans_16x8_6,
        rs_trans_16x8_7;
    int32x4_t sub_3_5_l, sub_3_5_h;
    int32x4_t add_3_5_l, add_3_5_h;
    int32x4_t sub_1_7_l, sub_1_7_h;
    int32x4_t add_1_7_l, add_1_7_h;
    int32x4_t sub_357_l, sub_357_h;
    int32x4_t add_351_l, add_351_h;
    int32x4_t add_175_l, add_175_h;
    int32x4_t sub_173_l, sub_173_h;
    int32x4_t y1_32x4_l, y1_32x4_h;
    int32x4_t y3_32x4_l, y3_32x4_h;
    int32x4_t y5_32x4_l, y5_32x4_h;
    int32x4_t y7_32x4_l, y7_32x4_h;
    int16x4_t y1_16x4_l, y3_16x4_l, y5_16x4_l, y7_16x4_l;
    int16x4_t y1_16x4_h, y3_16x4_h, y5_16x4_h, y7_16x4_h;

    int16x8_t y0_16x8, y1_16x8, y2_16x8, y3_16x8, y4_16x8, y5_16x8, y6_16x8, y7_16x8;
    int16x8_t rs_y1_16x8, rs_y3_16x8, rs_y5_16x8, rs_y7_16x8;
    int16x8_t z0_16x8, z1_16x8, z2_16x8, z3_16x8, z4_16x8, z5_16x8, z6_16x8, z7_16x8;

    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in, resd4_in, resd5_in, resd6_in, resd7_in;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in, pred4_in, pred5_in, pred6_in, pred7_in;
    int16x8_t pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
    int16x8_t rec0, rec1, rec2, rec3, rec4, rec5, rec6, rec7;
    uint8x8_t rec0_un, rec1_un, rec2_un, rec3_un, rec4_un, rec5_un, rec6_un, rec7_un;

    int64x2_t resd0_64x2, resd1_64x2, resd2_64x2, resd3_64x2, resd4_64x2, resd5_64x2, resd6_64x2,
        resd7_64x2;
    int16x8x2_t trans_16x8_0_1, trans_16x8_2_3, trans_16x8_4_5, trans_16x8_6_7;
    int32x4x2_t trans_32x4_0_2, trans_32x4_1_3, trans_32x4_4_6, trans_32x4_5_7;

    int16x8_t resd_b0_r01;
    int16x8_t resd_b0_r23;
    int16x8_t resd_b1_r01;
    int16x8_t resd_b1_r23;
    int16x8_t resd_b2_r45;
    int16x8_t resd_b2_r67;
    int16x8_t resd_b3_r45;
    int16x8_t resd_b3_r67;

    int16x8_t dup_min, dup_max;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
    WORD32 i;
    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    iscal_16x8_0 = vld1q_s16((const int16_t *) pu2_iscale_mat);
    iscal_16x8_1 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 8));
    iscal_16x8_2 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 16));
    iscal_16x8_3 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 24));
    iscal_16x8_4 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 32));
    iscal_16x8_5 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 40));
    iscal_16x8_6 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 48));
    iscal_16x8_7 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 56));

    weigh_16x8_0 = vld1q_s16((const int16_t *) pu2_weigh_mat);
    weigh_16x8_1 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 8));
    weigh_16x8_2 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 16));
    weigh_16x8_3 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 24));
    weigh_16x8_4 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 32));
    weigh_16x8_5 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 40));
    weigh_16x8_6 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 48));
    weigh_16x8_7 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 56));

    src_16x8_0 = vld1q_s16((const int16_t *) pi2_src);        // a0 a1 a2 a3 a4 a5 a6 a7
    src_16x8_1 = vld1q_s16((const int16_t *) (pi2_src + 8));  // b0 b1 b2 b3 b4 b5 b6 b7
    src_16x8_2 = vld1q_s16((const int16_t *) (pi2_src + 16));
    src_16x8_3 = vld1q_s16((const int16_t *) (pi2_src + 24));
    src_16x8_4 = vld1q_s16((const int16_t *) (pi2_src + 32));
    src_16x8_5 = vld1q_s16((const int16_t *) (pi2_src + 40));
    src_16x8_6 = vld1q_s16((const int16_t *) (pi2_src + 48));
    src_16x8_7 = vld1q_s16((const int16_t *) (pi2_src + 56));

    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    coeff_mul_16x8_0 = vmulq_s16(iscal_16x8_0, weigh_16x8_0);
    coeff_mul_16x8_1 = vmulq_s16(iscal_16x8_1, weigh_16x8_1);
    coeff_mul_16x8_2 = vmulq_s16(iscal_16x8_2, weigh_16x8_2);
    coeff_mul_16x8_3 = vmulq_s16(iscal_16x8_3, weigh_16x8_3);
    coeff_mul_16x8_4 = vmulq_s16(iscal_16x8_4, weigh_16x8_4);
    coeff_mul_16x8_5 = vmulq_s16(iscal_16x8_5, weigh_16x8_5);
    coeff_mul_16x8_6 = vmulq_s16(iscal_16x8_6, weigh_16x8_6);
    coeff_mul_16x8_7 = vmulq_s16(iscal_16x8_7, weigh_16x8_7);

    quant_res_32x4_l_0 = vmull_s16(vget_low_s16(coeff_mul_16x8_0), vget_low_s16(src_16x8_0));
    quant_res_32x4_l_1 = vmull_s16(vget_low_s16(coeff_mul_16x8_1), vget_low_s16(src_16x8_1));
    quant_res_32x4_l_2 = vmull_s16(vget_low_s16(coeff_mul_16x8_2), vget_low_s16(src_16x8_2));
    quant_res_32x4_l_3 = vmull_s16(vget_low_s16(coeff_mul_16x8_3), vget_low_s16(src_16x8_3));
    quant_res_32x4_l_4 = vmull_s16(vget_low_s16(coeff_mul_16x8_4), vget_low_s16(src_16x8_4));
    quant_res_32x4_l_5 = vmull_s16(vget_low_s16(coeff_mul_16x8_5), vget_low_s16(src_16x8_5));
    quant_res_32x4_l_6 = vmull_s16(vget_low_s16(coeff_mul_16x8_6), vget_low_s16(src_16x8_6));
    quant_res_32x4_l_7 = vmull_s16(vget_low_s16(coeff_mul_16x8_7), vget_low_s16(src_16x8_7));

    quant_res_32x4_h_0 = vmull_s16(vget_high_s16(coeff_mul_16x8_0), vget_high_s16(src_16x8_0));
    quant_res_32x4_h_1 = vmull_s16(vget_high_s16(coeff_mul_16x8_1), vget_high_s16(src_16x8_1));
    quant_res_32x4_h_2 = vmull_s16(vget_high_s16(coeff_mul_16x8_2), vget_high_s16(src_16x8_2));
    quant_res_32x4_h_3 = vmull_s16(vget_high_s16(coeff_mul_16x8_3), vget_high_s16(src_16x8_3));
    quant_res_32x4_h_4 = vmull_s16(vget_high_s16(coeff_mul_16x8_4), vget_high_s16(src_16x8_4));
    quant_res_32x4_h_5 = vmull_s16(vget_high_s16(coeff_mul_16x8_5), vget_high_s16(src_16x8_5));
    quant_res_32x4_h_6 = vmull_s16(vget_high_s16(coeff_mul_16x8_6), vget_high_s16(src_16x8_6));
    quant_res_32x4_h_7 = vmull_s16(vget_high_s16(coeff_mul_16x8_7), vget_high_s16(src_16x8_7));

    quant_res_32x4_l_0 = vshlq_s32(quant_res_32x4_l_0, qp_div_32x4);
    quant_res_32x4_l_1 = vshlq_s32(quant_res_32x4_l_1, qp_div_32x4);
    quant_res_32x4_l_2 = vshlq_s32(quant_res_32x4_l_2, qp_div_32x4);
    quant_res_32x4_l_3 = vshlq_s32(quant_res_32x4_l_3, qp_div_32x4);
    quant_res_32x4_l_4 = vshlq_s32(quant_res_32x4_l_4, qp_div_32x4);
    quant_res_32x4_l_5 = vshlq_s32(quant_res_32x4_l_5, qp_div_32x4);
    quant_res_32x4_l_6 = vshlq_s32(quant_res_32x4_l_6, qp_div_32x4);
    quant_res_32x4_l_7 = vshlq_s32(quant_res_32x4_l_7, qp_div_32x4);

    quant_res_32x4_h_0 = vshlq_s32(quant_res_32x4_h_0, qp_div_32x4);
    quant_res_32x4_h_1 = vshlq_s32(quant_res_32x4_h_1, qp_div_32x4);
    quant_res_32x4_h_2 = vshlq_s32(quant_res_32x4_h_2, qp_div_32x4);
    quant_res_32x4_h_3 = vshlq_s32(quant_res_32x4_h_3, qp_div_32x4);
    quant_res_32x4_h_4 = vshlq_s32(quant_res_32x4_h_4, qp_div_32x4);
    quant_res_32x4_h_5 = vshlq_s32(quant_res_32x4_h_5, qp_div_32x4);
    quant_res_32x4_h_6 = vshlq_s32(quant_res_32x4_h_6, qp_div_32x4);
    quant_res_32x4_h_7 = vshlq_s32(quant_res_32x4_h_7, qp_div_32x4);

    quant_res_16x4_l_0 = vqrshrn_n_s32(quant_res_32x4_l_0, 6);
    quant_res_16x4_l_1 = vqrshrn_n_s32(quant_res_32x4_l_1, 6);
    quant_res_16x4_l_2 = vqrshrn_n_s32(quant_res_32x4_l_2, 6);
    quant_res_16x4_l_3 = vqrshrn_n_s32(quant_res_32x4_l_3, 6);
    quant_res_16x4_l_4 = vqrshrn_n_s32(quant_res_32x4_l_4, 6);
    quant_res_16x4_l_5 = vqrshrn_n_s32(quant_res_32x4_l_5, 6);
    quant_res_16x4_l_6 = vqrshrn_n_s32(quant_res_32x4_l_6, 6);
    quant_res_16x4_l_7 = vqrshrn_n_s32(quant_res_32x4_l_7, 6);

    quant_res_16x4_h_0 = vqrshrn_n_s32(quant_res_32x4_h_0, 6);
    quant_res_16x4_h_1 = vqrshrn_n_s32(quant_res_32x4_h_1, 6);
    quant_res_16x4_h_2 = vqrshrn_n_s32(quant_res_32x4_h_2, 6);
    quant_res_16x4_h_3 = vqrshrn_n_s32(quant_res_32x4_h_3, 6);
    quant_res_16x4_h_4 = vqrshrn_n_s32(quant_res_32x4_h_4, 6);
    quant_res_16x4_h_5 = vqrshrn_n_s32(quant_res_32x4_h_5, 6);
    quant_res_16x4_h_6 = vqrshrn_n_s32(quant_res_32x4_h_6, 6);
    quant_res_16x4_h_7 = vqrshrn_n_s32(quant_res_32x4_h_7, 6);

    quant_res_16x8_0 = vcombine_s16(quant_res_16x4_l_0, quant_res_16x4_h_0);
    quant_res_16x8_1 = vcombine_s16(quant_res_16x4_l_1, quant_res_16x4_h_1);
    quant_res_16x8_2 = vcombine_s16(quant_res_16x4_l_2, quant_res_16x4_h_2);
    quant_res_16x8_3 = vcombine_s16(quant_res_16x4_l_3, quant_res_16x4_h_3);
    quant_res_16x8_4 = vcombine_s16(quant_res_16x4_l_4, quant_res_16x4_h_4);
    quant_res_16x8_5 = vcombine_s16(quant_res_16x4_l_5, quant_res_16x4_h_5);
    quant_res_16x8_6 = vcombine_s16(quant_res_16x4_l_6, quant_res_16x4_h_6);
    quant_res_16x8_7 = vcombine_s16(quant_res_16x4_l_7, quant_res_16x4_h_7);

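    /* Two passes over the coefficient block: each iteration transposes the
       8x8 matrix and applies the 1-D 8-point inverse transform, so pass 0
       processes rows and pass 1 processes columns. */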
    for(i = 0; i < 2; i++)
    {
        trans_16x8_0_1 = vtrnq_s16(quant_res_16x8_0, quant_res_16x8_1);
        trans_16x8_0 = trans_16x8_0_1.val[0];
        trans_16x8_1 = trans_16x8_0_1.val[1];

        trans_16x8_2_3 = vtrnq_s16(quant_res_16x8_2, quant_res_16x8_3);
        trans_16x8_2 = trans_16x8_2_3.val[0];
        trans_16x8_3 = trans_16x8_2_3.val[1];

        trans_16x8_4_5 = vtrnq_s16(quant_res_16x8_4, quant_res_16x8_5);
        trans_16x8_4 = trans_16x8_4_5.val[0];
        trans_16x8_5 = trans_16x8_4_5.val[1];

        trans_16x8_6_7 = vtrnq_s16(quant_res_16x8_6, quant_res_16x8_7);
        trans_16x8_6 = trans_16x8_6_7.val[0];
        trans_16x8_7 = trans_16x8_6_7.val[1];

        trans_32x4_0_2 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_0), vreinterpretq_s32_s16(trans_16x8_2));
        trans_32x4_0 = trans_32x4_0_2.val[0];
        trans_32x4_2 = trans_32x4_0_2.val[1];

        trans_32x4_1_3 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_1), vreinterpretq_s32_s16(trans_16x8_3));
        trans_32x4_1 = trans_32x4_1_3.val[0];
        trans_32x4_3 = trans_32x4_1_3.val[1];

        trans_32x4_4_6 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_4), vreinterpretq_s32_s16(trans_16x8_6));
        trans_32x4_4 = trans_32x4_4_6.val[0];
        trans_32x4_6 = trans_32x4_4_6.val[1];

        trans_32x4_5_7 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_5), vreinterpretq_s32_s16(trans_16x8_7));
        trans_32x4_5 = trans_32x4_5_7.val[0];
        trans_32x4_7 = trans_32x4_5_7.val[1];

        trans_64x2_0 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_4)));
        trans_64x2_4 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_4)));

        trans_64x2_1 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_5)));
        trans_64x2_5 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_5)));

        trans_64x2_2 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_6)));
        trans_64x2_6 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_6)));

        trans_64x2_3 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_7)));
        trans_64x2_7 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_7)));

        trans_16x8_0 = vreinterpretq_s16_s64(trans_64x2_0);
        trans_16x8_1 = vreinterpretq_s16_s64(trans_64x2_1);
        trans_16x8_2 = vreinterpretq_s16_s64(trans_64x2_2);
        trans_16x8_3 = vreinterpretq_s16_s64(trans_64x2_3);
        trans_16x8_4 = vreinterpretq_s16_s64(trans_64x2_4);
        trans_16x8_5 = vreinterpretq_s16_s64(trans_64x2_5);
        trans_16x8_6 = vreinterpretq_s16_s64(trans_64x2_6);
        trans_16x8_7 = vreinterpretq_s16_s64(trans_64x2_7);

        rs_trans_16x8_1 = vshrq_n_s16(trans_16x8_1, 1);
        rs_trans_16x8_2 = vshrq_n_s16(trans_16x8_2, 1);
        rs_trans_16x8_3 = vshrq_n_s16(trans_16x8_3, 1);
        rs_trans_16x8_5 = vshrq_n_s16(trans_16x8_5, 1);
        rs_trans_16x8_6 = vshrq_n_s16(trans_16x8_6, 1);
        rs_trans_16x8_7 = vshrq_n_s16(trans_16x8_7, 1);

        y0_16x8 = vaddq_s16(trans_16x8_0,
                            trans_16x8_4);  // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
        y2_16x8 = vsubq_s16(trans_16x8_0,
                            trans_16x8_4);  // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
        y4_16x8 = vsubq_s16(rs_trans_16x8_2,
                            trans_16x8_6);  // i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
        y6_16x8 = vaddq_s16(trans_16x8_2,
                            rs_trans_16x8_6);  // i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));

        trans_16x4_3_l = vget_low_s16(trans_16x8_3);
        trans_16x4_5_l = vget_low_s16(trans_16x8_5);

        //-w3 + w5
        sub_3_5_l = vsubl_s16(vget_low_s16(trans_16x8_5), vget_low_s16(trans_16x8_3));
        sub_3_5_h = vsubl_s16(vget_high_s16(trans_16x8_5), vget_high_s16(trans_16x8_3));

        // w3 + w5
        add_3_5_l = vaddl_s16(trans_16x4_3_l, trans_16x4_5_l);
        add_3_5_h = vaddl_s16(vget_high_s16(trans_16x8_3), vget_high_s16(trans_16x8_5));

        trans_16x4_1_l = vget_low_s16(trans_16x8_1);
        trans_16x4_7_l = vget_low_s16(trans_16x8_7);

        //-w1 + w7
        sub_1_7_l = vsubl_s16(trans_16x4_7_l, trans_16x4_1_l);
        sub_1_7_h = vsubl_s16(vget_high_s16(trans_16x8_7), vget_high_s16(trans_16x8_1));

        // w1 + w7
        add_1_7_l = vaddl_s16(trans_16x4_1_l, trans_16x4_7_l);
        add_1_7_h = vaddl_s16(vget_high_s16(trans_16x8_1), vget_high_s16(trans_16x8_7));

        //-w3 + w5 - w7
        sub_357_l = vsubw_s16(sub_3_5_l, trans_16x4_7_l);
        sub_357_h = vsubw_s16(sub_3_5_h, vget_high_s16(trans_16x8_7));

        // w3 + w5 + w1
        add_351_l = vaddw_s16(add_3_5_l, trans_16x4_1_l);
        add_351_h = vaddw_s16(add_3_5_h, vget_high_s16(trans_16x8_1));

        //-w1 + w7 + w5
        add_175_l = vaddw_s16(sub_1_7_l, trans_16x4_5_l);
        add_175_h = vaddw_s16(sub_1_7_h, vget_high_s16(trans_16x8_5));

        // w1 + w7 - w3
        sub_173_l = vsubw_s16(add_1_7_l, trans_16x4_3_l);
        sub_173_h = vsubw_s16(add_1_7_h, vget_high_s16(trans_16x8_3));

        //-w3 + w5 - w7 - (w7 >> 1)
        y1_32x4_l = vsubw_s16(sub_357_l, vget_low_s16(rs_trans_16x8_7));
        y1_32x4_h = vsubw_s16(sub_357_h, vget_high_s16(rs_trans_16x8_7));

        // w1 + w7 - w3 - (w3 >> 1)
        y3_32x4_l = vsubw_s16(sub_173_l, vget_low_s16(rs_trans_16x8_3));
        y3_32x4_h = vsubw_s16(sub_173_h, vget_high_s16(rs_trans_16x8_3));

        //-w1 + w7 + w5 + (w5 >> 1)
        y5_32x4_l = vaddw_s16(add_175_l, vget_low_s16(rs_trans_16x8_5));
        y5_32x4_h = vaddw_s16(add_175_h, vget_high_s16(rs_trans_16x8_5));

        // w3 + w5 + w1 + (w1 >> 1)
        y7_32x4_l = vaddw_s16(add_351_l, vget_low_s16(rs_trans_16x8_1));
        y7_32x4_h = vaddw_s16(add_351_h, vget_high_s16(rs_trans_16x8_1));

        y1_16x4_l = vmovn_s32(y1_32x4_l);
        y1_16x4_h = vmovn_s32(y1_32x4_h);
        y1_16x8 = vcombine_s16(y1_16x4_l, y1_16x4_h);
        y3_16x4_l = vmovn_s32(y3_32x4_l);
        y3_16x4_h = vmovn_s32(y3_32x4_h);
        y3_16x8 = vcombine_s16(y3_16x4_l, y3_16x4_h);
        y5_16x4_l = vmovn_s32(y5_32x4_l);
        y5_16x4_h = vmovn_s32(y5_32x4_h);
        y5_16x8 = vcombine_s16(y5_16x4_l, y5_16x4_h);
        y7_16x4_l = vmovn_s32(y7_32x4_l);
        y7_16x4_h = vmovn_s32(y7_32x4_h);
        y7_16x8 = vcombine_s16(y7_16x4_l, y7_16x4_h);

        rs_y1_16x8 = vshrq_n_s16(y1_16x8, 2);
        rs_y3_16x8 = vshrq_n_s16(y3_16x8, 2);
        rs_y5_16x8 = vshrq_n_s16(y5_16x8, 2);
        rs_y7_16x8 = vshrq_n_s16(y7_16x8, 2);

        z0_16x8 = vaddq_s16(y0_16x8, y6_16x8);     // z0 = y0 + y6
        z1_16x8 = vaddq_s16(y1_16x8, rs_y7_16x8);  // z1 = y1 + (y7 >> 2)
        z2_16x8 = vaddq_s16(y2_16x8, y4_16x8);     // z2 = y2 + y4
        z3_16x8 = vaddq_s16(y3_16x8, rs_y5_16x8);  // z3 = y3 + (y5 >> 2)
        z4_16x8 = vsubq_s16(y2_16x8, y4_16x8);     // z4 = y2 - y4
        z5_16x8 = vsubq_s16(rs_y3_16x8, y5_16x8);  // z5 = (y3 >> 2) - y5
        z6_16x8 = vsubq_s16(y0_16x8, y6_16x8);     // z6 = y0 - y6
        z7_16x8 = vsubq_s16(y7_16x8, rs_y1_16x8);  // z7 = y7 - (y1 >> 2)

        quant_res_16x8_0 = vaddq_s16(z0_16x8, z7_16x8);  // x0 = z0 + z7
        quant_res_16x8_1 = vaddq_s16(z2_16x8, z5_16x8);  // x1 = z2 + z5
        quant_res_16x8_2 = vaddq_s16(z4_16x8, z3_16x8);  // x2 = z4 + z3
        quant_res_16x8_3 = vaddq_s16(z6_16x8, z1_16x8);  // x3 = z6 + z1
        quant_res_16x8_4 = vsubq_s16(z6_16x8, z1_16x8);  // x4 = z6 - z1
        quant_res_16x8_5 = vsubq_s16(z4_16x8, z3_16x8);  // x5 = z4 - z3
        quant_res_16x8_6 = vsubq_s16(z2_16x8, z5_16x8);  // x6 = z2 - z5
        quant_res_16x8_7 = vsubq_s16(z0_16x8, z7_16x8);  // x7 = z0 - z7
    }

    resd0_in = vld1q_s16((int16_t *) pi2_rsd_ptr);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd_ptr + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd_ptr + (rsd_strd * 7));

    quant_res_16x8_0 = vrshrq_n_s16(quant_res_16x8_0, 6);
    quant_res_16x8_1 = vrshrq_n_s16(quant_res_16x8_1, 6);
    quant_res_16x8_2 = vrshrq_n_s16(quant_res_16x8_2, 6);
    quant_res_16x8_3 = vrshrq_n_s16(quant_res_16x8_3, 6);
    quant_res_16x8_4 = vrshrq_n_s16(quant_res_16x8_4, 6);
    quant_res_16x8_5 = vrshrq_n_s16(quant_res_16x8_5, 6);
    quant_res_16x8_6 = vrshrq_n_s16(quant_res_16x8_6, 6);
    quant_res_16x8_7 = vrshrq_n_s16(quant_res_16x8_7, 6);

    resd0_in = vaddq_s16(quant_res_16x8_0, resd0_in);
    resd1_in = vaddq_s16(quant_res_16x8_1, resd1_in);
    resd2_in = vaddq_s16(quant_res_16x8_2, resd2_in);
    resd3_in = vaddq_s16(quant_res_16x8_3, resd3_in);
    resd4_in = vaddq_s16(quant_res_16x8_4, resd4_in);
    resd5_in = vaddq_s16(quant_res_16x8_5, resd5_in);
    resd6_in = vaddq_s16(quant_res_16x8_6, resd6_in);
    resd7_in = vaddq_s16(quant_res_16x8_7, resd7_in);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);
    resd4_in = vminq_s16(resd4_in, dup_max);
    resd4_in = vmaxq_s16(resd4_in, dup_min);
    resd5_in = vminq_s16(resd5_in, dup_max);
    resd5_in = vmaxq_s16(resd5_in, dup_min);
    resd6_in = vminq_s16(resd6_in, dup_max);
    resd6_in = vmaxq_s16(resd6_in, dup_min);
    resd7_in = vminq_s16(resd7_in, dup_max);
    resd7_in = vmaxq_s16(resd7_in, dup_min);

    resd0_64x2 = vreinterpretq_s64_s16(resd0_in);
    resd1_64x2 = vreinterpretq_s64_s16(resd1_in);
    resd2_64x2 = vreinterpretq_s64_s16(resd2_in);
    resd3_64x2 = vreinterpretq_s64_s16(resd3_in);
    resd4_64x2 = vreinterpretq_s64_s16(resd4_in);
    resd5_64x2 = vreinterpretq_s64_s16(resd5_in);
    resd6_64x2 = vreinterpretq_s64_s16(resd6_in);
    resd7_64x2 = vreinterpretq_s64_s16(resd7_in);

    resd_b0_r01 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd0_64x2), vget_low_s64(resd1_64x2)));
    resd_b0_r23 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd2_64x2), vget_low_s64(resd3_64x2)));
    resd_b1_r01 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd0_64x2), vget_high_s64(resd1_64x2)));
    resd_b1_r23 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd2_64x2), vget_high_s64(resd3_64x2)));
    resd_b2_r45 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd4_64x2), vget_low_s64(resd5_64x2)));
    resd_b2_r67 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(resd6_64x2), vget_low_s64(resd7_64x2)));
    resd_b3_r45 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd4_64x2), vget_high_s64(resd5_64x2)));
    resd_b3_r67 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(resd6_64x2), vget_high_s64(resd7_64x2)));

    dup_val_1 = vabsq_s16(resd_b0_r01);
    dup_val_2 = vabsq_s16(resd_b0_r23);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b1_r01);
    dup_val_2 = vabsq_s16(resd_b1_r23);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b2_r45);
    dup_val_2 = vabsq_s16(resd_b2_r67);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(resd_b3_r45);
    dup_val_2 = vabsq_s16(resd_b3_r67);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

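    /* Pack the per-4x4 nnz flags into the decoder's sub-block layout: bits
       0, 1, 4 and 5 correspond to the top-left, top-right, bottom-left and
       bottom-right 4x4 blocks of this 8x8 block. */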
    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred1_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred2_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred3_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred4_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred5_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred6_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred7_in = vld1_u8((uint8_t *) pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    rec0 = vaddq_s16(pred0, resd0_in);
    rec1 = vaddq_s16(pred1, resd1_in);
    rec2 = vaddq_s16(pred2, resd2_in);
    rec3 = vaddq_s16(pred3, resd3_in);
    rec4 = vaddq_s16(pred4, resd4_in);
    rec5 = vaddq_s16(pred5, resd5_in);
    rec6 = vaddq_s16(pred6, resd6_in);
    rec7 = vaddq_s16(pred7, resd7_in);

    rec0_un = vqmovun_s16(rec0);
    rec1_un = vqmovun_s16(rec1);
    rec2_un = vqmovun_s16(rec2);
    rec3_un = vqmovun_s16(rec3);
    rec4_un = vqmovun_s16(rec4);
    rec5_un = vqmovun_s16(rec5);
    rec6_un = vqmovun_s16(rec6);
    rec7_un = vqmovun_s16(rec7);

    vst1_u8(pu1_out, rec0_un);
    vst1_u8(pu1_out + out_strd, rec1_un);
    vst1_u8(pu1_out + out_strd * 2, rec2_un);
    vst1_u8(pu1_out + out_strd * 3, rec3_un);
    vst1_u8(pu1_out + out_strd * 4, rec4_un);
    vst1_u8(pu1_out + out_strd * 5, rec5_un);
    vst1_u8(pu1_out + out_strd * 6, rec6_un);
    vst1_u8(pu1_out + out_strd * 7, rec7_un);

    return nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_recon_8x8_dc_neonintr       */
/*                                                                           */
/*  Description   : This function computes the recon output from the        */
/*                  IQ+IT+RESD for the DC-only case                          */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : reconstructed samples in pu1_out                         */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_recon_8x8_dc_neonintr(
    WORD16 *pi2_src, UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out, WORD32 pred_strd,
    WORD32 rsd_strd, WORD32 out_strd, const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
    UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    uint8x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    uint8x8_t pred4_in, pred5_in, pred6_in, pred7_in;
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t resd0_in, resd1_in, resd2_in, resd3_in;
    int16x8_t pred4, pred5, pred6, pred7, dup_min, dup_max;
    int16x8_t resd4_in, resd5_in, resd6_in, resd7_in;

    int64x2_t pred0_64x2, pred1_64x2, pred2_64x2, pred3_64x2, pred4_64x2, pred5_64x2, pred6_64x2,
        pred7_64x2;

    int16x8_t pred_b0_r01;
    int16x8_t pred_b0_r23;
    int16x8_t pred_b1_r01;
    int16x8_t pred_b1_r23;
    int16x8_t pred_b2_r45;
    int16x8_t pred_b2_r67;
    int16x8_t pred_b3_r45;
    int16x8_t pred_b3_r67;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;
    WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);
    i4_iq_out_temp = pi2_src[0];

    INV_QUANT(i4_iq_out_temp, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);

    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred1_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred2_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred3_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred4_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred5_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred6_in = vld1_u8((uint8_t *) pu1_pred);
    pu1_pred = pu1_pred + pred_strd;
    pred7_in = vld1_u8((uint8_t *) pu1_pred);

    pred0 = vreinterpretq_s16_u16(vmovl_u8(pred0_in));
    pred1 = vreinterpretq_s16_u16(vmovl_u8(pred1_in));
    pred2 = vreinterpretq_s16_u16(vmovl_u8(pred2_in));
    pred3 = vreinterpretq_s16_u16(vmovl_u8(pred3_in));
    pred4 = vreinterpretq_s16_u16(vmovl_u8(pred4_in));
    pred5 = vreinterpretq_s16_u16(vmovl_u8(pred5_in));
    pred6 = vreinterpretq_s16_u16(vmovl_u8(pred6_in));
    pred7 = vreinterpretq_s16_u16(vmovl_u8(pred7_in));

    resd0_in = vld1q_s16((int16_t *) pi2_rsd);
    resd1_in = vld1q_s16((int16_t *) pi2_rsd + rsd_strd);
    resd2_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 2));
    resd3_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 3));
    resd4_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 4));
    resd5_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 5));
    resd6_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 6));
    resd7_in = vld1q_s16((int16_t *) pi2_rsd + (rsd_strd * 7));

    resd0_in = vaddq_s16(resd0_in, temp_0);
    resd1_in = vaddq_s16(resd1_in, temp_0);
    resd2_in = vaddq_s16(resd2_in, temp_0);
    resd3_in = vaddq_s16(resd3_in, temp_0);
    resd4_in = vaddq_s16(resd4_in, temp_0);
    resd5_in = vaddq_s16(resd5_in, temp_0);
    resd6_in = vaddq_s16(resd6_in, temp_0);
    resd7_in = vaddq_s16(resd7_in, temp_0);

    resd0_in = vminq_s16(resd0_in, dup_max);
    resd0_in = vmaxq_s16(resd0_in, dup_min);
    resd1_in = vminq_s16(resd1_in, dup_max);
    resd1_in = vmaxq_s16(resd1_in, dup_min);
    resd2_in = vminq_s16(resd2_in, dup_max);
    resd2_in = vmaxq_s16(resd2_in, dup_min);
    resd3_in = vminq_s16(resd3_in, dup_max);
    resd3_in = vmaxq_s16(resd3_in, dup_min);
    resd4_in = vminq_s16(resd4_in, dup_max);
    resd4_in = vmaxq_s16(resd4_in, dup_min);
    resd5_in = vminq_s16(resd5_in, dup_max);
    resd5_in = vmaxq_s16(resd5_in, dup_min);
    resd6_in = vminq_s16(resd6_in, dup_max);
    resd6_in = vmaxq_s16(resd6_in, dup_min);
    resd7_in = vminq_s16(resd7_in, dup_max);
    resd7_in = vmaxq_s16(resd7_in, dup_min);

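    /* Note: the pred*_64x2 / pred_b*_r* names below are reused to hold the
       clamped residual rows while deriving the per-4x4 nnz flags; they do
       not contain prediction samples at this point. */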
    pred0_64x2 = vreinterpretq_s64_s16(resd0_in);
    pred1_64x2 = vreinterpretq_s64_s16(resd1_in);
    pred2_64x2 = vreinterpretq_s64_s16(resd2_in);
    pred3_64x2 = vreinterpretq_s64_s16(resd3_in);
    pred4_64x2 = vreinterpretq_s64_s16(resd4_in);
    pred5_64x2 = vreinterpretq_s64_s16(resd5_in);
    pred6_64x2 = vreinterpretq_s64_s16(resd6_in);
    pred7_64x2 = vreinterpretq_s64_s16(resd7_in);

    pred_b0_r01 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred0_64x2), vget_low_s64(pred1_64x2)));
    pred_b0_r23 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred2_64x2), vget_low_s64(pred3_64x2)));
    pred_b1_r01 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred0_64x2), vget_high_s64(pred1_64x2)));
    pred_b1_r23 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred2_64x2), vget_high_s64(pred3_64x2)));
    pred_b2_r45 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred4_64x2), vget_low_s64(pred5_64x2)));
    pred_b2_r67 =
        vreinterpretq_s16_s64(vcombine_s64(vget_low_s64(pred6_64x2), vget_low_s64(pred7_64x2)));
    pred_b3_r45 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred4_64x2), vget_high_s64(pred5_64x2)));
    pred_b3_r67 =
        vreinterpretq_s16_s64(vcombine_s64(vget_high_s64(pred6_64x2), vget_high_s64(pred7_64x2)));

    dup_val_1 = vabsq_s16(pred_b0_r01);
    dup_val_2 = vabsq_s16(pred_b0_r23);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b1_r01);
    dup_val_2 = vabsq_s16(pred_b1_r23);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b2_r45);
    dup_val_2 = vabsq_s16(pred_b2_r67);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b3_r45);
    dup_val_2 = vabsq_s16(pred_b3_r67);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    pred0 = vaddq_s16(pred0, resd0_in);
    pred1 = vaddq_s16(pred1, resd1_in);
    pred2 = vaddq_s16(pred2, resd2_in);
    pred3 = vaddq_s16(pred3, resd3_in);
    pred4 = vaddq_s16(pred4, resd4_in);
    pred5 = vaddq_s16(pred5, resd5_in);
    pred6 = vaddq_s16(pred6, resd6_in);
    pred7 = vaddq_s16(pred7, resd7_in);

    pred0_in = vqmovun_s16(pred0);
    pred1_in = vqmovun_s16(pred1);
    pred2_in = vqmovun_s16(pred2);
    pred3_in = vqmovun_s16(pred3);
    pred4_in = vqmovun_s16(pred4);
    pred5_in = vqmovun_s16(pred5);
    pred6_in = vqmovun_s16(pred6);
    pred7_in = vqmovun_s16(pred7);

    vst1_u8((uint8_t *) (pu1_out), pred0_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd), pred1_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 2), pred2_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 3), pred3_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 4), pred4_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 5), pred5_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 6), pred6_in);
    vst1_u8((uint8_t *) (pu1_out + out_strd * 7), pred7_in);

    return nnz;
}