/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_iquant_itrans_residual_neonintr.c
 *
 * @brief
 *  Contains definitions of functions for H.264 inverse quantization, inverse
 *  transformation and residual computation
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_iquant_itrans_residual_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_8x8_neonintr()
 *  - isvcd_iquant_itrans_residual_4x4_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_8x8_dc_neonintr()
 *  - isvcd_iquant_itrans_residual_chroma_4x4_neonintr()
 *  - isvcd_iquant_itrans_residual_chroma_4x4_dc_neonintr()
 *
 * @remarks
 *
 *******************************************************************************
 */

/*****************************************************************************/
/*                             File Includes                                 */
/*****************************************************************************/

#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_iquant_itrans_residual.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_4x4_neonintr                */
/*                                                                           */
/*  Description   : this function computes the 4x4 residual output from     */
/*                  inverse quantization and inverse transform (IQ+IT)      */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_4x4_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out,
                                                 WORD32 pred_strd, WORD32 out_strd,
                                                 const UWORD16 *pu2_iscal_mat,
                                                 const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                                 WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                                 WORD16 *pi2_dc_ld_addr)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);

    int16x4_t pred0, pred1, pred2, pred3;
    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4, dup_min, dup_max;
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);

    int16x8_t pred01_in, pred23_in;
    WORD32 i4_nnz;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t dup_val_1, dup_val_2, dup_abs;
    UNUSED(pi2_tmp);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    dup_min = vdup_n_s16(RSD_MIN);
    dup_max = vdup_n_s16(RSD_MAX);

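    /*
     * Inverse quantization: q = ((src * iscal * weigh) + rnd_fact) << (qp / 6),
     * then a saturating narrow with >> 4 removes the scaling headroom. This is
     * the vector counterpart of the scalar INV_QUANT macro used elsewhere in
     * this file.
     */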
    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

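    /*
     * iq_start_idx == 1 indicates the DC coefficient was inverse quantized
     * separately (presumably the Intra 16x16 path in the caller); pick it up
     * from pi2_dc_ld_addr instead of the dequantized block.
     */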
    if(iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_ld_addr[0], q0_16x4, 0);
    }
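
    /* horizontal inverse transform */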
    rq1_16x4 = vshr_n_s16(q1_16x4, 1);  // q1 >> 1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);  // q3 >> 1

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 = q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 = q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 = q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0 + x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1 + x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1 - x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0 - x3
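
    /*
     * Transpose the 4x4 block: two 16-bit vtrn passes and a 32-bit vtrn pass
     * swap rows and columns so the same butterfly can be reused vertically.
     */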
    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);  // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);  // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 = q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 = q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 = q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    pred0 = vld1_s16((int16_t *) pi2_pred);
    pred1 = vld1_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1_s16((int16_t *) pi2_pred + (pred_strd * 3));

    x0_16x4 = vadd_s16(pred0, x0_16x4);
    x1_16x4 = vadd_s16(pred1, x1_16x4);
    x2_16x4 = vadd_s16(pred2, x2_16x4);
    x3_16x4 = vadd_s16(pred3, x3_16x4);
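
    /* saturate the accumulated residual to [RSD_MIN, RSD_MAX] */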
    x0_16x4 = vmin_s16(x0_16x4, dup_max);
    x0_16x4 = vmax_s16(x0_16x4, dup_min);
    x1_16x4 = vmin_s16(x1_16x4, dup_max);
    x1_16x4 = vmax_s16(x1_16x4, dup_min);
    x2_16x4 = vmin_s16(x2_16x4, dup_max);
    x2_16x4 = vmax_s16(x2_16x4, dup_min);
    x3_16x4 = vmin_s16(x3_16x4, dup_max);
    x3_16x4 = vmax_s16(x3_16x4, dup_min);

    pred01_in = vcombine_s16(x0_16x4, x1_16x4);
    pred23_in = vcombine_s16(x2_16x4, x3_16x4);
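
    /*
     * nnz: saturating-add the absolute values of all 16 results; the block is
     * flagged as non-zero if any lane is non-zero.
     */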
    dup_val_1 = vabsq_s16(pred01_in);
    dup_val_2 = vabsq_s16(pred23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    vst1_s16(pi2_out, x0_16x4);
    vst1_s16(pi2_out + out_strd, x1_16x4);
    vst1_s16(pi2_out + (out_strd * 2), x2_16x4);
    vst1_s16(pi2_out + (out_strd * 3), x3_16x4);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_4x4_dc_neonintr             */
/*                                                                           */
/*  Description   : this function computes the 4x4 residual output from     */
/*                  IQ+IT when only the DC coefficient is non-zero          */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/
WORD32 isvcd_iquant_itrans_residual_4x4_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred,
                                                    WORD16 *pi2_out, WORD32 pred_strd,
                                                    WORD32 out_strd, const UWORD16 *pu2_iscal_mat,
                                                    const UWORD16 *pu2_weigh_mat,
                                                    UWORD32 u4_qp_div_6, WORD16 *pi2_tmp,
                                                    WORD32 iq_start_idx, WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x4_t temp_0;
    int16x4_t pred0_in, pred1_in, pred2_in, pred3_in, dup_min, dup_max;
    int16x8_t pred01_in, pred23_in;
    WORD32 i4_nnz;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    UNUSED(pi2_tmp);

    if(iq_start_idx == 0)
    {
        i4_iq_out_temp = pi2_src[0];
        INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        i4_iq_out_temp = pi2_dc_ld_addr[0];
    }
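
    /*
     * DC-only path: the single inverse-quantized coefficient is rounded with
     * (dc + 32) >> 6 and broadcast, then added to every prediction sample.
     */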
    /* inv transform and residual comp */
    pred0_in = vld1_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred1_in = vld1_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred2_in = vld1_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred3_in = vld1_s16((int16_t *) pi2_pred);

    temp_0 = vdup_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdup_n_s16(RSD_MIN);
    dup_max = vdup_n_s16(RSD_MAX);

    pred0_in = vadd_s16(pred0_in, temp_0);
    pred1_in = vadd_s16(pred1_in, temp_0);
    pred2_in = vadd_s16(pred2_in, temp_0);
    pred3_in = vadd_s16(pred3_in, temp_0);

    pred0_in = vmin_s16(pred0_in, dup_max);
    pred0_in = vmax_s16(pred0_in, dup_min);
    pred1_in = vmin_s16(pred1_in, dup_max);
    pred1_in = vmax_s16(pred1_in, dup_min);
    pred2_in = vmin_s16(pred2_in, dup_max);
    pred2_in = vmax_s16(pred2_in, dup_min);
    pred3_in = vmin_s16(pred3_in, dup_max);
    pred3_in = vmax_s16(pred3_in, dup_min);

    pred01_in = vcombine_s16(pred0_in, pred1_in);
    pred23_in = vcombine_s16(pred2_in, pred3_in);

    dup_val_1 = vabsq_s16(pred01_in);
    dup_val_2 = vabsq_s16(pred23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    vst1_s16((int16_t *) (pi2_out), pred0_in);
    vst1_s16((int16_t *) (pi2_out + out_strd), pred1_in);
    vst1_s16((int16_t *) (pi2_out + (out_strd * 2)), pred2_in);
    vst1_s16((int16_t *) (pi2_out + (out_strd * 3)), pred3_in);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_chroma_4x4_neonintr         */
/*                                                                           */
/*  Description   : this function computes the chroma 4x4 residual output   */
/*                  from IQ+IT on an interleaved Cb/Cr buffer               */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_chroma_4x4_neonintr(
    WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out, WORD32 pred_strd, WORD32 out_strd,
    const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
    WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x8_t pred0, pred1, pred2, pred3;
    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    int16x4_t zero_16x4 = vdup_n_s16(0);
    int16x4_t x_16x4_low, x_16x4_high;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x8_t dup_min, dup_max;
    WORD32 i4_nnz;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2, x_16x4x2_t;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t dup_val_1, dup_val_2, dup_val_3, dup_val_4, dup_val_5, dup_abs;
    int16x8_t i4_out_horz_16x8_r0, i4_out_horz_16x8_r1, i4_out_horz_16x8_r2, i4_out_horz_16x8_r3;
    uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));
    int16x8_t chroma_mask_16x8_2 = vreinterpretq_s16_s32(vdupq_n_s32(0x0000ffff));
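
    /*
     * The prediction/output buffers hold interleaved Cb/Cr samples. The
     * 0x0000ffff masks select the even 16-bit lanes (the component being
     * processed); vbslq_s16 later preserves the other component in pi2_out.
     */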
    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);

    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);  // q1 >> 1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);  // q3 >> 1

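    /* the chroma DC coefficient is always supplied separately, via pi2_dc_src */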
    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 = q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 = q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 = q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0 + x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1 + x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1 - x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0 - x3

    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);  // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);  // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 = q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 = q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 = q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);
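
    /*
     * Widen each 4-sample row to 8 lanes by zipping with zero so it lines up
     * with the interleaved chroma layout of the prediction buffer.
     */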
    x_16x4x2_t = vzip_s16(x0_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x0_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x1_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x1_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x2_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x2_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x3_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x3_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    pred0 = vld1q_s16((int16_t *) pi2_pred);
    pred1 = vld1q_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 3));

    x0_16x8 = vaddq_s16(pred0, x0_16x8);
    x1_16x8 = vaddq_s16(pred1, x1_16x8);
    x2_16x8 = vaddq_s16(pred2, x2_16x8);
    x3_16x8 = vaddq_s16(pred3, x3_16x8);

    x0_16x8 = vminq_s16(x0_16x8, dup_max);
    x0_16x8 = vmaxq_s16(x0_16x8, dup_min);
    x1_16x8 = vminq_s16(x1_16x8, dup_max);
    x1_16x8 = vmaxq_s16(x1_16x8, dup_min);
    x2_16x8 = vminq_s16(x2_16x8, dup_max);
    x2_16x8 = vmaxq_s16(x2_16x8, dup_min);
    x3_16x8 = vminq_s16(x3_16x8, dup_max);
    x3_16x8 = vmaxq_s16(x3_16x8, dup_min);

    x0_16x8 = vandq_s16(x0_16x8, chroma_mask_16x8_2);
    x1_16x8 = vandq_s16(x1_16x8, chroma_mask_16x8_2);
    x2_16x8 = vandq_s16(x2_16x8, chroma_mask_16x8_2);
    x3_16x8 = vandq_s16(x3_16x8, chroma_mask_16x8_2);

    dup_val_1 = vabsq_s16(x0_16x8);
    dup_val_2 = vabsq_s16(x1_16x8);
    dup_val_3 = vabsq_s16(x2_16x8);
    dup_val_4 = vabsq_s16(x3_16x8);
    dup_val_5 = vqaddq_s16(dup_val_1, dup_val_2);
    dup_val_1 = vqaddq_s16(dup_val_3, dup_val_4);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_5);

    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

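    /* read-modify-write: merge this component into the interleaved output rows */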
    i4_out_horz_16x8_r0 = vld1q_s16(pi2_out);
    i4_out_horz_16x8_r1 = vld1q_s16(pi2_out + out_strd);
    i4_out_horz_16x8_r2 = vld1q_s16(pi2_out + out_strd * 2);
    i4_out_horz_16x8_r3 = vld1q_s16(pi2_out + out_strd * 3);

    i4_out_horz_16x8_r0 = vbslq_s16(chroma_mask_16x8, x0_16x8, i4_out_horz_16x8_r0);
    i4_out_horz_16x8_r1 = vbslq_s16(chroma_mask_16x8, x1_16x8, i4_out_horz_16x8_r1);
    i4_out_horz_16x8_r2 = vbslq_s16(chroma_mask_16x8, x2_16x8, i4_out_horz_16x8_r2);
    i4_out_horz_16x8_r3 = vbslq_s16(chroma_mask_16x8, x3_16x8, i4_out_horz_16x8_r3);

    vst1q_s16((int16_t *) (pi2_out), i4_out_horz_16x8_r0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), i4_out_horz_16x8_r1);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 2), i4_out_horz_16x8_r2);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 3), i4_out_horz_16x8_r3);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_chroma_4x4_dc_neonintr      */
/*                                                                           */
/*  Description   : this function computes the chroma 4x4 residual output   */
/*                  from IQ+IT when only the DC coefficient is non-zero     */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : i4_nnz                                                   */
/*  Returns       : i4_nnz                                                   */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_chroma_4x4_dc_neonintr(
    WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out, WORD32 pred_strd, WORD32 out_strd,
    const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
    WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x8_t pred0, pred1, pred2, pred3;
    int16x8_t temp_0, dup_min, dup_max;
    WORD32 i4_iq_out_temp;
    int16x8_t dup_val_1, dup_val_2, dup_val_3, dup_val_4, dup_val_5, dup_abs;
    int16x8_t i4_out_horz_16x8_r0, i4_out_horz_16x8_r1, i4_out_horz_16x8_r2, i4_out_horz_16x8_r3;
    uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));
    int16x8_t chroma_mask_16x8_2 = vreinterpretq_s16_s32(vdupq_n_s32(0x0000ffff));

    WORD32 i4_nnz;
    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

    i4_iq_out_temp = pi2_dc_src[0];
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0 = vld1q_s16((int16_t *) pi2_pred);
    pred1 = vld1q_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 3));

    pred0 = vaddq_s16(pred0, temp_0);
    pred1 = vaddq_s16(pred1, temp_0);
    pred2 = vaddq_s16(pred2, temp_0);
    pred3 = vaddq_s16(pred3, temp_0);

    pred0 = vminq_s16(pred0, dup_max);
    pred0 = vmaxq_s16(pred0, dup_min);
    pred1 = vminq_s16(pred1, dup_max);
    pred1 = vmaxq_s16(pred1, dup_min);
    pred2 = vminq_s16(pred2, dup_max);
    pred2 = vmaxq_s16(pred2, dup_min);
    pred3 = vminq_s16(pred3, dup_max);
    pred3 = vmaxq_s16(pred3, dup_min);

    pred0 = vandq_s16(pred0, chroma_mask_16x8_2);
    pred1 = vandq_s16(pred1, chroma_mask_16x8_2);
    pred2 = vandq_s16(pred2, chroma_mask_16x8_2);
    pred3 = vandq_s16(pred3, chroma_mask_16x8_2);
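
    /* the masking above zeroed the other component's lanes, so only this
       component contributes to the nnz flag below */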
    dup_val_1 = vabsq_s16(pred0);
    dup_val_2 = vabsq_s16(pred1);
    dup_val_3 = vabsq_s16(pred2);
    dup_val_4 = vabsq_s16(pred3);
    dup_val_5 = vqaddq_s16(dup_val_1, dup_val_2);
    dup_val_1 = vqaddq_s16(dup_val_3, dup_val_4);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_5);

    i4_nnz = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    i4_out_horz_16x8_r0 = vld1q_s16(pi2_out);
    i4_out_horz_16x8_r1 = vld1q_s16(pi2_out + out_strd);
    i4_out_horz_16x8_r2 = vld1q_s16(pi2_out + (out_strd * 2));
    i4_out_horz_16x8_r3 = vld1q_s16(pi2_out + (out_strd * 3));

    i4_out_horz_16x8_r0 = vbslq_s16(chroma_mask_16x8, pred0, i4_out_horz_16x8_r0);
    i4_out_horz_16x8_r1 = vbslq_s16(chroma_mask_16x8, pred1, i4_out_horz_16x8_r1);
    i4_out_horz_16x8_r2 = vbslq_s16(chroma_mask_16x8, pred2, i4_out_horz_16x8_r2);
    i4_out_horz_16x8_r3 = vbslq_s16(chroma_mask_16x8, pred3, i4_out_horz_16x8_r3);

    vst1q_s16((int16_t *) (pi2_out), i4_out_horz_16x8_r0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), i4_out_horz_16x8_r1);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 2)), i4_out_horz_16x8_r2);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 3)), i4_out_horz_16x8_r3);

    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_8x8_neonintr                */
/*                                                                           */
/*  Description   : this function computes the 8x8 residual output from     */
/*                  inverse quantization and inverse transform (IQ+IT)      */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : nnz                                                      */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_8x8_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred, WORD16 *pi2_out,
                                                 WORD32 pred_strd, WORD32 out_strd,
                                                 const UWORD16 *pu2_iscale_mat,
                                                 const UWORD16 *pu2_weigh_mat, UWORD32 qp_div,
                                                 WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                                 WORD16 *pi2_dc_ld_addr)
{
    int16x8_t iscal_16x8_0, iscal_16x8_1, iscal_16x8_2, iscal_16x8_3, iscal_16x8_4, iscal_16x8_5,
        iscal_16x8_6, iscal_16x8_7;

    int16x8_t weigh_16x8_0, weigh_16x8_1, weigh_16x8_2, weigh_16x8_3, weigh_16x8_4, weigh_16x8_5,
        weigh_16x8_6, weigh_16x8_7;

    int16x8_t src_16x8_0, src_16x8_1, src_16x8_2, src_16x8_3, src_16x8_4, src_16x8_5, src_16x8_6,
        src_16x8_7;
    int16x8_t coeff_mul_16x8_0, coeff_mul_16x8_1, coeff_mul_16x8_2, coeff_mul_16x8_3,
        coeff_mul_16x8_4, coeff_mul_16x8_5, coeff_mul_16x8_6, coeff_mul_16x8_7;

    int32x4_t quant_res_32x4_l_0, quant_res_32x4_l_1, quant_res_32x4_l_2, quant_res_32x4_l_3,
        quant_res_32x4_l_4, quant_res_32x4_l_5, quant_res_32x4_l_6, quant_res_32x4_l_7;
    int32x4_t quant_res_32x4_h_0, quant_res_32x4_h_1, quant_res_32x4_h_2, quant_res_32x4_h_3,
        quant_res_32x4_h_4, quant_res_32x4_h_5, quant_res_32x4_h_6, quant_res_32x4_h_7;
    int16x4_t quant_res_16x4_l_0, quant_res_16x4_l_1, quant_res_16x4_l_2, quant_res_16x4_l_3,
        quant_res_16x4_l_4, quant_res_16x4_l_5, quant_res_16x4_l_6, quant_res_16x4_l_7;
    int16x4_t quant_res_16x4_h_0, quant_res_16x4_h_1, quant_res_16x4_h_2, quant_res_16x4_h_3,
        quant_res_16x4_h_4, quant_res_16x4_h_5, quant_res_16x4_h_6, quant_res_16x4_h_7;

    int16x8_t quant_res_16x8_0, quant_res_16x8_1, quant_res_16x8_2, quant_res_16x8_3,
        quant_res_16x8_4, quant_res_16x8_5, quant_res_16x8_6, quant_res_16x8_7;

    int16x8_t trans_16x8_0, trans_16x8_1, trans_16x8_2, trans_16x8_3, trans_16x8_4, trans_16x8_5,
        trans_16x8_6, trans_16x8_7;
    int32x4_t trans_32x4_0, trans_32x4_1, trans_32x4_2, trans_32x4_3, trans_32x4_4, trans_32x4_5,
        trans_32x4_6, trans_32x4_7;
    int64x2_t trans_64x2_0, trans_64x2_1, trans_64x2_2, trans_64x2_3, trans_64x2_4, trans_64x2_5,
        trans_64x2_6, trans_64x2_7;
    int16x4_t trans_16x4_1_l, trans_16x4_3_l, trans_16x4_5_l, trans_16x4_7_l;
    int16x8_t rs_trans_16x8_1, rs_trans_16x8_2, rs_trans_16x8_3, rs_trans_16x8_5, rs_trans_16x8_6,
        rs_trans_16x8_7;
    int32x4_t sub_3_5_l, sub_3_5_h;
    int32x4_t add_3_5_l, add_3_5_h;
    int32x4_t sub_1_7_l, sub_1_7_h;
    int32x4_t add_1_7_l, add_1_7_h;
    int32x4_t sub_357_l, sub_357_h;
    int32x4_t add_351_l, add_351_h;
    int32x4_t add_175_l, add_175_h;
    int32x4_t sub_173_l, sub_173_h;
    int32x4_t y1_32x4_l, y1_32x4_h;
    int32x4_t y3_32x4_l, y3_32x4_h;
    int32x4_t y5_32x4_l, y5_32x4_h;
    int32x4_t y7_32x4_l, y7_32x4_h;
    int16x4_t y1_16x4_l, y3_16x4_l, y5_16x4_l, y7_16x4_l;

    int16x8_t y0_16x8, y1_16x8, y2_16x8, y3_16x8, y4_16x8, y5_16x8, y6_16x8, y7_16x8;
    int16x8_t rs_y1_16x8, rs_y3_16x8, rs_y5_16x8, rs_y7_16x8;
    int16x8_t z0_16x8, z1_16x8, z2_16x8, z3_16x8, z4_16x8, z5_16x8, z6_16x8, z7_16x8;
    int16x8_t pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;

    int64x2_t pred0_in_64x2, pred1_in_64x2, pred2_in_64x2, pred3_in_64x2, pred4_in_64x2,
        pred5_in_64x2, pred6_in_64x2, pred7_in_64x2;

    int32x4_t qp_div_6_32x4 = vdupq_n_s32(qp_div);

    int16x8_t pred_b0_r01_in;
    int16x8_t pred_b0_r23_in;
    int16x8_t pred_b1_r01_in;
    int16x8_t pred_b1_r23_in;
    int16x8_t pred_b2_r45_in;
    int16x8_t pred_b2_r67_in;
    int16x8_t pred_b3_r45_in;
    int16x8_t pred_b3_r67_in;
    int16x8_t dup_min, dup_max;

    int16x8x2_t trans_16x8_0_1, trans_16x8_2_3, trans_16x8_4_5, trans_16x8_6_7;
    int32x4x2_t trans_32x4_0_2, trans_32x4_1_3, trans_32x4_4_6, trans_32x4_5_7;
    int16x4_t y1_16x4_h, y3_16x4_h, y5_16x4_h, y7_16x4_h;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3, i;
    int16x8_t dup_val_1, dup_val_2, dup_abs;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    iscal_16x8_0 = vld1q_s16((const int16_t *) pu2_iscale_mat);
    iscal_16x8_1 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 8));
    iscal_16x8_2 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 16));
    iscal_16x8_3 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 24));
    iscal_16x8_4 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 32));
    iscal_16x8_5 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 40));
    iscal_16x8_6 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 48));
    iscal_16x8_7 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 56));

    weigh_16x8_0 = vld1q_s16((const int16_t *) pu2_weigh_mat);
    weigh_16x8_1 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 8));
    weigh_16x8_2 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 16));
    weigh_16x8_3 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 24));
    weigh_16x8_4 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 32));
    weigh_16x8_5 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 40));
    weigh_16x8_6 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 48));
    weigh_16x8_7 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 56));

    src_16x8_0 = vld1q_s16((const int16_t *) pi2_src);        // a0 a1 a2 a3 a4 a5 a6 a7
    src_16x8_1 = vld1q_s16((const int16_t *) (pi2_src + 8));  // b0 b1 b2 b3 b4 b5 b6 b7
    src_16x8_2 = vld1q_s16((const int16_t *) (pi2_src + 16));
    src_16x8_3 = vld1q_s16((const int16_t *) (pi2_src + 24));
    src_16x8_4 = vld1q_s16((const int16_t *) (pi2_src + 32));
    src_16x8_5 = vld1q_s16((const int16_t *) (pi2_src + 40));
    src_16x8_6 = vld1q_s16((const int16_t *) (pi2_src + 48));
    src_16x8_7 = vld1q_s16((const int16_t *) (pi2_src + 56));

    coeff_mul_16x8_0 = vmulq_s16(iscal_16x8_0, weigh_16x8_0);
    coeff_mul_16x8_1 = vmulq_s16(iscal_16x8_1, weigh_16x8_1);
    coeff_mul_16x8_2 = vmulq_s16(iscal_16x8_2, weigh_16x8_2);
    coeff_mul_16x8_3 = vmulq_s16(iscal_16x8_3, weigh_16x8_3);
    coeff_mul_16x8_4 = vmulq_s16(iscal_16x8_4, weigh_16x8_4);
    coeff_mul_16x8_5 = vmulq_s16(iscal_16x8_5, weigh_16x8_5);
    coeff_mul_16x8_6 = vmulq_s16(iscal_16x8_6, weigh_16x8_6);
    coeff_mul_16x8_7 = vmulq_s16(iscal_16x8_7, weigh_16x8_7);

    quant_res_32x4_l_0 = vmull_s16(vget_low_s16(coeff_mul_16x8_0), vget_low_s16(src_16x8_0));
    quant_res_32x4_l_1 = vmull_s16(vget_low_s16(coeff_mul_16x8_1), vget_low_s16(src_16x8_1));
    quant_res_32x4_l_2 = vmull_s16(vget_low_s16(coeff_mul_16x8_2), vget_low_s16(src_16x8_2));
    quant_res_32x4_l_3 = vmull_s16(vget_low_s16(coeff_mul_16x8_3), vget_low_s16(src_16x8_3));
    quant_res_32x4_l_4 = vmull_s16(vget_low_s16(coeff_mul_16x8_4), vget_low_s16(src_16x8_4));
    quant_res_32x4_l_5 = vmull_s16(vget_low_s16(coeff_mul_16x8_5), vget_low_s16(src_16x8_5));
    quant_res_32x4_l_6 = vmull_s16(vget_low_s16(coeff_mul_16x8_6), vget_low_s16(src_16x8_6));
    quant_res_32x4_l_7 = vmull_s16(vget_low_s16(coeff_mul_16x8_7), vget_low_s16(src_16x8_7));

    quant_res_32x4_h_0 = vmull_s16(vget_high_s16(coeff_mul_16x8_0), vget_high_s16(src_16x8_0));
    quant_res_32x4_h_1 = vmull_s16(vget_high_s16(coeff_mul_16x8_1), vget_high_s16(src_16x8_1));
    quant_res_32x4_h_2 = vmull_s16(vget_high_s16(coeff_mul_16x8_2), vget_high_s16(src_16x8_2));
    quant_res_32x4_h_3 = vmull_s16(vget_high_s16(coeff_mul_16x8_3), vget_high_s16(src_16x8_3));
    quant_res_32x4_h_4 = vmull_s16(vget_high_s16(coeff_mul_16x8_4), vget_high_s16(src_16x8_4));
    quant_res_32x4_h_5 = vmull_s16(vget_high_s16(coeff_mul_16x8_5), vget_high_s16(src_16x8_5));
    quant_res_32x4_h_6 = vmull_s16(vget_high_s16(coeff_mul_16x8_6), vget_high_s16(src_16x8_6));
    quant_res_32x4_h_7 = vmull_s16(vget_high_s16(coeff_mul_16x8_7), vget_high_s16(src_16x8_7));

    quant_res_32x4_l_0 = vshlq_s32(quant_res_32x4_l_0, qp_div_6_32x4);
    quant_res_32x4_l_1 = vshlq_s32(quant_res_32x4_l_1, qp_div_6_32x4);
    quant_res_32x4_l_2 = vshlq_s32(quant_res_32x4_l_2, qp_div_6_32x4);
    quant_res_32x4_l_3 = vshlq_s32(quant_res_32x4_l_3, qp_div_6_32x4);
    quant_res_32x4_l_4 = vshlq_s32(quant_res_32x4_l_4, qp_div_6_32x4);
    quant_res_32x4_l_5 = vshlq_s32(quant_res_32x4_l_5, qp_div_6_32x4);
    quant_res_32x4_l_6 = vshlq_s32(quant_res_32x4_l_6, qp_div_6_32x4);
    quant_res_32x4_l_7 = vshlq_s32(quant_res_32x4_l_7, qp_div_6_32x4);

    quant_res_32x4_h_0 = vshlq_s32(quant_res_32x4_h_0, qp_div_6_32x4);
    quant_res_32x4_h_1 = vshlq_s32(quant_res_32x4_h_1, qp_div_6_32x4);
    quant_res_32x4_h_2 = vshlq_s32(quant_res_32x4_h_2, qp_div_6_32x4);
    quant_res_32x4_h_3 = vshlq_s32(quant_res_32x4_h_3, qp_div_6_32x4);
    quant_res_32x4_h_4 = vshlq_s32(quant_res_32x4_h_4, qp_div_6_32x4);
    quant_res_32x4_h_5 = vshlq_s32(quant_res_32x4_h_5, qp_div_6_32x4);
    quant_res_32x4_h_6 = vshlq_s32(quant_res_32x4_h_6, qp_div_6_32x4);
    quant_res_32x4_h_7 = vshlq_s32(quant_res_32x4_h_7, qp_div_6_32x4);

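    /*
     * Narrow back to 16 bits with a rounding >> 6, matching the 8x8 INV_QUANT
     * scaling (the 4x4 path uses a >> 4 instead).
     */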
    quant_res_16x4_l_0 = vqrshrn_n_s32(quant_res_32x4_l_0, 6);
    quant_res_16x4_l_1 = vqrshrn_n_s32(quant_res_32x4_l_1, 6);
    quant_res_16x4_l_2 = vqrshrn_n_s32(quant_res_32x4_l_2, 6);
    quant_res_16x4_l_3 = vqrshrn_n_s32(quant_res_32x4_l_3, 6);
    quant_res_16x4_l_4 = vqrshrn_n_s32(quant_res_32x4_l_4, 6);
    quant_res_16x4_l_5 = vqrshrn_n_s32(quant_res_32x4_l_5, 6);
    quant_res_16x4_l_6 = vqrshrn_n_s32(quant_res_32x4_l_6, 6);
    quant_res_16x4_l_7 = vqrshrn_n_s32(quant_res_32x4_l_7, 6);

    quant_res_16x4_h_0 = vqrshrn_n_s32(quant_res_32x4_h_0, 6);
    quant_res_16x4_h_1 = vqrshrn_n_s32(quant_res_32x4_h_1, 6);
    quant_res_16x4_h_2 = vqrshrn_n_s32(quant_res_32x4_h_2, 6);
    quant_res_16x4_h_3 = vqrshrn_n_s32(quant_res_32x4_h_3, 6);
    quant_res_16x4_h_4 = vqrshrn_n_s32(quant_res_32x4_h_4, 6);
    quant_res_16x4_h_5 = vqrshrn_n_s32(quant_res_32x4_h_5, 6);
    quant_res_16x4_h_6 = vqrshrn_n_s32(quant_res_32x4_h_6, 6);
    quant_res_16x4_h_7 = vqrshrn_n_s32(quant_res_32x4_h_7, 6);

    quant_res_16x8_0 = vcombine_s16(quant_res_16x4_l_0, quant_res_16x4_h_0);
    quant_res_16x8_1 = vcombine_s16(quant_res_16x4_l_1, quant_res_16x4_h_1);
    quant_res_16x8_2 = vcombine_s16(quant_res_16x4_l_2, quant_res_16x4_h_2);
    quant_res_16x8_3 = vcombine_s16(quant_res_16x4_l_3, quant_res_16x4_h_3);
    quant_res_16x8_4 = vcombine_s16(quant_res_16x4_l_4, quant_res_16x4_h_4);
    quant_res_16x8_5 = vcombine_s16(quant_res_16x4_l_5, quant_res_16x4_h_5);
    quant_res_16x8_6 = vcombine_s16(quant_res_16x4_l_6, quant_res_16x4_h_6);
    quant_res_16x8_7 = vcombine_s16(quant_res_16x4_l_7, quant_res_16x4_h_7);

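    /*
     * Two passes of the 1-D 8x8 inverse transform: each iteration first
     * transposes the block and then applies the butterfly, so pass 0 operates
     * on rows and pass 1 on columns.
     */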
    for(i = 0; i < 2; i++)
    {
        trans_16x8_0_1 = vtrnq_s16(quant_res_16x8_0, quant_res_16x8_1);
        trans_16x8_0 = trans_16x8_0_1.val[0];
        trans_16x8_1 = trans_16x8_0_1.val[1];

        trans_16x8_2_3 = vtrnq_s16(quant_res_16x8_2, quant_res_16x8_3);
        trans_16x8_2 = trans_16x8_2_3.val[0];
        trans_16x8_3 = trans_16x8_2_3.val[1];

        trans_16x8_4_5 = vtrnq_s16(quant_res_16x8_4, quant_res_16x8_5);
        trans_16x8_4 = trans_16x8_4_5.val[0];
        trans_16x8_5 = trans_16x8_4_5.val[1];

        trans_16x8_6_7 = vtrnq_s16(quant_res_16x8_6, quant_res_16x8_7);
        trans_16x8_6 = trans_16x8_6_7.val[0];
        trans_16x8_7 = trans_16x8_6_7.val[1];

        trans_32x4_0_2 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_0), vreinterpretq_s32_s16(trans_16x8_2));
        trans_32x4_0 = trans_32x4_0_2.val[0];
        trans_32x4_2 = trans_32x4_0_2.val[1];

        trans_32x4_1_3 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_1), vreinterpretq_s32_s16(trans_16x8_3));
        trans_32x4_1 = trans_32x4_1_3.val[0];
        trans_32x4_3 = trans_32x4_1_3.val[1];

        trans_32x4_4_6 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_4), vreinterpretq_s32_s16(trans_16x8_6));
        trans_32x4_4 = trans_32x4_4_6.val[0];
        trans_32x4_6 = trans_32x4_4_6.val[1];

        trans_32x4_5_7 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_5), vreinterpretq_s32_s16(trans_16x8_7));
        trans_32x4_5 = trans_32x4_5_7.val[0];
        trans_32x4_7 = trans_32x4_5_7.val[1];

        trans_64x2_0 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_4)));
        trans_64x2_4 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_4)));

        trans_64x2_1 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_5)));
        trans_64x2_5 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_5)));

        trans_64x2_2 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_6)));
        trans_64x2_6 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_6)));

        trans_64x2_3 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_7)));
        trans_64x2_7 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_7)));

        trans_16x8_0 = vreinterpretq_s16_s64(trans_64x2_0);
        trans_16x8_1 = vreinterpretq_s16_s64(trans_64x2_1);
        trans_16x8_2 = vreinterpretq_s16_s64(trans_64x2_2);
        trans_16x8_3 = vreinterpretq_s16_s64(trans_64x2_3);
        trans_16x8_4 = vreinterpretq_s16_s64(trans_64x2_4);
        trans_16x8_5 = vreinterpretq_s16_s64(trans_64x2_5);
        trans_16x8_6 = vreinterpretq_s16_s64(trans_64x2_6);
        trans_16x8_7 = vreinterpretq_s16_s64(trans_64x2_7);

        rs_trans_16x8_1 = vshrq_n_s16(trans_16x8_1, 1);  // (pi2_tmp_ptr[1] >> 1)
        rs_trans_16x8_2 = vshrq_n_s16(trans_16x8_2, 1);  // (pi2_tmp_ptr[2] >> 1)
        rs_trans_16x8_3 = vshrq_n_s16(trans_16x8_3, 1);  // (pi2_tmp_ptr[3] >> 1)
        rs_trans_16x8_5 = vshrq_n_s16(trans_16x8_5, 1);  // (pi2_tmp_ptr[5] >> 1)
        rs_trans_16x8_6 = vshrq_n_s16(trans_16x8_6, 1);  // (pi2_tmp_ptr[6] >> 1)
        rs_trans_16x8_7 = vshrq_n_s16(trans_16x8_7, 1);  // (pi2_tmp_ptr[7] >> 1)

        y0_16x8 = vaddq_s16(trans_16x8_0,
                            trans_16x8_4);  // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
        y2_16x8 = vsubq_s16(trans_16x8_0,
                            trans_16x8_4);  // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
        y4_16x8 = vsubq_s16(rs_trans_16x8_2,
                            trans_16x8_6);  // i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
        y6_16x8 = vaddq_s16(trans_16x8_2,
                            rs_trans_16x8_6);  // i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));

        trans_16x4_3_l = vget_low_s16(trans_16x8_3);
        trans_16x4_5_l = vget_low_s16(trans_16x8_5);

        //-w3 + w5
        sub_3_5_l = vsubl_s16(vget_low_s16(trans_16x8_5), vget_low_s16(trans_16x8_3));
        sub_3_5_h = vsubl_s16(vget_high_s16(trans_16x8_5), vget_high_s16(trans_16x8_3));

        // w3 + w5
        add_3_5_l = vaddl_s16(trans_16x4_3_l, trans_16x4_5_l);
        add_3_5_h = vaddl_s16(vget_high_s16(trans_16x8_3), vget_high_s16(trans_16x8_5));

        trans_16x4_1_l = vget_low_s16(trans_16x8_1);
        trans_16x4_7_l = vget_low_s16(trans_16x8_7);

        //-w1 + w7
        sub_1_7_l = vsubl_s16(trans_16x4_7_l, trans_16x4_1_l);
        sub_1_7_h = vsubl_s16(vget_high_s16(trans_16x8_7), vget_high_s16(trans_16x8_1));

        // w1 + w7
        add_1_7_l = vaddl_s16(trans_16x4_1_l, trans_16x4_7_l);
        add_1_7_h = vaddl_s16(vget_high_s16(trans_16x8_1), vget_high_s16(trans_16x8_7));

        //-w3 + w5 - w7
        sub_357_l = vsubw_s16(sub_3_5_l, trans_16x4_7_l);
        sub_357_h = vsubw_s16(sub_3_5_h, vget_high_s16(trans_16x8_7));

        // w3 + w5 + w1
        add_351_l = vaddw_s16(add_3_5_l, trans_16x4_1_l);
        add_351_h = vaddw_s16(add_3_5_h, vget_high_s16(trans_16x8_1));

        //-w1 + w7 + w5
        add_175_l = vaddw_s16(sub_1_7_l, trans_16x4_5_l);
        add_175_h = vaddw_s16(sub_1_7_h, vget_high_s16(trans_16x8_5));

        // w1 + w7 - w3
        sub_173_l = vsubw_s16(add_1_7_l, trans_16x4_3_l);
        sub_173_h = vsubw_s16(add_1_7_h, vget_high_s16(trans_16x8_3));

        //-w3 + w5 - w7 - (w7 >> 1)
        y1_32x4_l = vsubw_s16(sub_357_l, vget_low_s16(rs_trans_16x8_7));
        y1_32x4_h = vsubw_s16(sub_357_h, vget_high_s16(rs_trans_16x8_7));

        // w1 + w7 - w3 - (w3 >> 1)
        y3_32x4_l = vsubw_s16(sub_173_l, vget_low_s16(rs_trans_16x8_3));
        y3_32x4_h = vsubw_s16(sub_173_h, vget_high_s16(rs_trans_16x8_3));

        //-w1 + w7 + w5 + (w5 >> 1)
        y5_32x4_l = vaddw_s16(add_175_l, vget_low_s16(rs_trans_16x8_5));
        y5_32x4_h = vaddw_s16(add_175_h, vget_high_s16(rs_trans_16x8_5));

        // w3 + w5 + w1 + (w1 >> 1)
        y7_32x4_l = vaddw_s16(add_351_l, vget_low_s16(rs_trans_16x8_1));
        y7_32x4_h = vaddw_s16(add_351_h, vget_high_s16(rs_trans_16x8_1));

        y1_16x4_l = vmovn_s32(y1_32x4_l);
        y1_16x4_h = vmovn_s32(y1_32x4_h);
        y1_16x8 = vcombine_s16(y1_16x4_l, y1_16x4_h);
        y3_16x4_l = vmovn_s32(y3_32x4_l);
        y3_16x4_h = vmovn_s32(y3_32x4_h);
        y3_16x8 = vcombine_s16(y3_16x4_l, y3_16x4_h);
        y5_16x4_l = vmovn_s32(y5_32x4_l);
        y5_16x4_h = vmovn_s32(y5_32x4_h);
        y5_16x8 = vcombine_s16(y5_16x4_l, y5_16x4_h);
        y7_16x4_l = vmovn_s32(y7_32x4_l);
        y7_16x4_h = vmovn_s32(y7_32x4_h);
        y7_16x8 = vcombine_s16(y7_16x4_l, y7_16x4_h);
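
        /*
         * The odd-part terms above are accumulated at 32-bit width, which
         * appears intended to avoid intermediate overflow, then narrowed back
         * to 16 bits.
         */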
        rs_y1_16x8 = vshrq_n_s16(y1_16x8, 2);
        rs_y3_16x8 = vshrq_n_s16(y3_16x8, 2);
        rs_y5_16x8 = vshrq_n_s16(y5_16x8, 2);
        rs_y7_16x8 = vshrq_n_s16(y7_16x8, 2);

        z0_16x8 = vaddq_s16(y0_16x8, y6_16x8);     // z0 = y0 + y6
        z1_16x8 = vaddq_s16(y1_16x8, rs_y7_16x8);  // z1 = y1 + (y7 >> 2)
        z2_16x8 = vaddq_s16(y2_16x8, y4_16x8);     // z2 = y2 + y4
        z3_16x8 = vaddq_s16(y3_16x8, rs_y5_16x8);  // z3 = y3 + (y5 >> 2)
        z4_16x8 = vsubq_s16(y2_16x8, y4_16x8);     // z4 = y2 - y4
        z5_16x8 = vsubq_s16(rs_y3_16x8, y5_16x8);  // z5 = (y3 >> 2) - y5
        z6_16x8 = vsubq_s16(y0_16x8, y6_16x8);     // z6 = y0 - y6
        z7_16x8 = vsubq_s16(y7_16x8, rs_y1_16x8);  // z7 = y7 - (y1 >> 2)

        quant_res_16x8_0 = vaddq_s16(z0_16x8, z7_16x8);  // x0 = z0 + z7
        quant_res_16x8_1 = vaddq_s16(z2_16x8, z5_16x8);  // x1 = z2 + z5
        quant_res_16x8_2 = vaddq_s16(z4_16x8, z3_16x8);  // x2 = z4 + z3
        quant_res_16x8_3 = vaddq_s16(z6_16x8, z1_16x8);  // x3 = z6 + z1
        quant_res_16x8_4 = vsubq_s16(z6_16x8, z1_16x8);  // x4 = z6 - z1
        quant_res_16x8_5 = vsubq_s16(z4_16x8, z3_16x8);  // x5 = z4 - z3
        quant_res_16x8_6 = vsubq_s16(z2_16x8, z5_16x8);  // x6 = z2 - z5
        quant_res_16x8_7 = vsubq_s16(z0_16x8, z7_16x8);  // x7 = z0 - z7
    }

    quant_res_16x8_0 = vrshrq_n_s16(quant_res_16x8_0, 6);
    quant_res_16x8_1 = vrshrq_n_s16(quant_res_16x8_1, 6);
    quant_res_16x8_2 = vrshrq_n_s16(quant_res_16x8_2, 6);
    quant_res_16x8_3 = vrshrq_n_s16(quant_res_16x8_3, 6);
    quant_res_16x8_4 = vrshrq_n_s16(quant_res_16x8_4, 6);
    quant_res_16x8_5 = vrshrq_n_s16(quant_res_16x8_5, 6);
    quant_res_16x8_6 = vrshrq_n_s16(quant_res_16x8_6, 6);
    quant_res_16x8_7 = vrshrq_n_s16(quant_res_16x8_7, 6);

    pred0 = vld1q_s16((int16_t *) pi2_pred);
    pred1 = vld1q_s16((int16_t *) pi2_pred + pred_strd);
    pred2 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 2));
    pred3 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 3));
    pred4 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 4));
    pred5 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 5));
    pred6 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 6));
    pred7 = vld1q_s16((int16_t *) pi2_pred + (pred_strd * 7));

    quant_res_16x8_0 = vaddq_s16(pred0, quant_res_16x8_0);
    quant_res_16x8_1 = vaddq_s16(pred1, quant_res_16x8_1);
    quant_res_16x8_2 = vaddq_s16(pred2, quant_res_16x8_2);
    quant_res_16x8_3 = vaddq_s16(pred3, quant_res_16x8_3);
    quant_res_16x8_4 = vaddq_s16(pred4, quant_res_16x8_4);
    quant_res_16x8_5 = vaddq_s16(pred5, quant_res_16x8_5);
    quant_res_16x8_6 = vaddq_s16(pred6, quant_res_16x8_6);
    quant_res_16x8_7 = vaddq_s16(pred7, quant_res_16x8_7);

    quant_res_16x8_0 = vminq_s16(quant_res_16x8_0, dup_max);
    quant_res_16x8_0 = vmaxq_s16(quant_res_16x8_0, dup_min);
    quant_res_16x8_1 = vminq_s16(quant_res_16x8_1, dup_max);
    quant_res_16x8_1 = vmaxq_s16(quant_res_16x8_1, dup_min);
    quant_res_16x8_2 = vminq_s16(quant_res_16x8_2, dup_max);
    quant_res_16x8_2 = vmaxq_s16(quant_res_16x8_2, dup_min);
    quant_res_16x8_3 = vminq_s16(quant_res_16x8_3, dup_max);
    quant_res_16x8_3 = vmaxq_s16(quant_res_16x8_3, dup_min);
    quant_res_16x8_4 = vminq_s16(quant_res_16x8_4, dup_max);
    quant_res_16x8_4 = vmaxq_s16(quant_res_16x8_4, dup_min);
    quant_res_16x8_5 = vminq_s16(quant_res_16x8_5, dup_max);
    quant_res_16x8_5 = vmaxq_s16(quant_res_16x8_5, dup_min);
    quant_res_16x8_6 = vminq_s16(quant_res_16x8_6, dup_max);
    quant_res_16x8_6 = vmaxq_s16(quant_res_16x8_6, dup_min);
    quant_res_16x8_7 = vminq_s16(quant_res_16x8_7, dup_max);
    quant_res_16x8_7 = vmaxq_s16(quant_res_16x8_7, dup_min);

    pred0_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_0);
    pred1_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_1);
    pred2_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_2);
    pred3_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_3);
    pred4_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_4);
    pred5_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_5);
    pred6_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_6);
    pred7_in_64x2 = vreinterpretq_s64_s16(quant_res_16x8_7);

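    /*
     * Regroup the 8x8 result into its four 4x4 sub-blocks (b0..b3) so an nnz
     * flag can be derived per sub-block; the flags are packed into the return
     * value at bit positions 0, 1, 4 and 5.
     */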
    pred_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred0_in_64x2), vget_low_s64(pred1_in_64x2)));
    pred_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred2_in_64x2), vget_low_s64(pred3_in_64x2)));
    pred_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred0_in_64x2), vget_high_s64(pred1_in_64x2)));
    pred_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred2_in_64x2), vget_high_s64(pred3_in_64x2)));
    pred_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred4_in_64x2), vget_low_s64(pred5_in_64x2)));
    pred_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred6_in_64x2), vget_low_s64(pred7_in_64x2)));
    pred_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred4_in_64x2), vget_high_s64(pred5_in_64x2)));
    pred_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred6_in_64x2), vget_high_s64(pred7_in_64x2)));

    dup_val_1 = vabsq_s16(pred_b0_r01_in);
    dup_val_2 = vabsq_s16(pred_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b1_r01_in);
    dup_val_2 = vabsq_s16(pred_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b2_r45_in);
    dup_val_2 = vabsq_s16(pred_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b3_r45_in);
    dup_val_2 = vabsq_s16(pred_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    vst1q_s16(pi2_out, quant_res_16x8_0);
    vst1q_s16(pi2_out + out_strd, quant_res_16x8_1);
    vst1q_s16(pi2_out + out_strd * 2, quant_res_16x8_2);
    vst1q_s16(pi2_out + out_strd * 3, quant_res_16x8_3);
    vst1q_s16(pi2_out + out_strd * 4, quant_res_16x8_4);
    vst1q_s16(pi2_out + out_strd * 5, quant_res_16x8_5);
    vst1q_s16(pi2_out + out_strd * 6, quant_res_16x8_6);
    vst1q_s16(pi2_out + out_strd * 7, quant_res_16x8_7);

    return nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_residual_8x8_dc_neonintr             */
/*                                                                           */
/*  Description   : this function computes the 8x8 residual output from     */
/*                  IQ+IT when only the DC coefficient is non-zero          */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : nnz                                                      */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_iquant_itrans_residual_8x8_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_pred,
                                                    WORD16 *pi2_out, WORD32 pred_strd,
                                                    WORD32 out_strd, const UWORD16 *pu2_iscale_mat,
                                                    const UWORD16 *pu2_weigh_mat, UWORD32 qp_div,
                                                    WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                                    WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0, dup_min, dup_max;
    int16x8_t pred0_in, pred1_in, pred2_in, pred3_in;
    int16x8_t pred4_in, pred5_in, pred6_in, pred7_in;
    WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;

    int64x2_t pred0_in_64x2, pred1_in_64x2, pred2_in_64x2, pred3_in_64x2, pred4_in_64x2,
        pred5_in_64x2, pred6_in_64x2, pred7_in_64x2;

    int16x8_t pred_b0_r01_in;
    int16x8_t pred_b0_r23_in;
    int16x8_t pred_b1_r01_in;
    int16x8_t pred_b1_r23_in;
    int16x8_t pred_b2_r45_in;
    int16x8_t pred_b2_r67_in;
    int16x8_t pred_b3_r45_in;
    int16x8_t pred_b3_r67_in;

    int16x8_t dup_val_1, dup_val_2, dup_abs;
    WORD32 nnz, nnz_b0, nnz_b1, nnz_b2, nnz_b3;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    i4_iq_out_temp = pi2_src[0];
    INV_QUANT(i4_iq_out_temp, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);

    /* inv transform and residual comp */
    pred0_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred1_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred2_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred3_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred4_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred5_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred6_in = vld1q_s16((int16_t *) pi2_pred);
    pi2_pred = pi2_pred + pred_strd;
    pred7_in = vld1q_s16((int16_t *) pi2_pred);

    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    pred0_in = vaddq_s16(pred0_in, temp_0);
    pred1_in = vaddq_s16(pred1_in, temp_0);
    pred2_in = vaddq_s16(pred2_in, temp_0);
    pred3_in = vaddq_s16(pred3_in, temp_0);
    pred4_in = vaddq_s16(pred4_in, temp_0);
    pred5_in = vaddq_s16(pred5_in, temp_0);
    pred6_in = vaddq_s16(pred6_in, temp_0);
    pred7_in = vaddq_s16(pred7_in, temp_0);

    pred0_in = vminq_s16(pred0_in, dup_max);
    pred0_in = vmaxq_s16(pred0_in, dup_min);
    pred1_in = vminq_s16(pred1_in, dup_max);
    pred1_in = vmaxq_s16(pred1_in, dup_min);
    pred2_in = vminq_s16(pred2_in, dup_max);
    pred2_in = vmaxq_s16(pred2_in, dup_min);
    pred3_in = vminq_s16(pred3_in, dup_max);
    pred3_in = vmaxq_s16(pred3_in, dup_min);
    pred4_in = vminq_s16(pred4_in, dup_max);
    pred4_in = vmaxq_s16(pred4_in, dup_min);
    pred5_in = vminq_s16(pred5_in, dup_max);
    pred5_in = vmaxq_s16(pred5_in, dup_min);
    pred6_in = vminq_s16(pred6_in, dup_max);
    pred6_in = vmaxq_s16(pred6_in, dup_min);
    pred7_in = vminq_s16(pred7_in, dup_max);
    pred7_in = vmaxq_s16(pred7_in, dup_min);

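    /* derive per-4x4 sub-block nnz flags, packed at bit positions 0, 1, 4, 5 */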
    pred0_in_64x2 = vreinterpretq_s64_s16(pred0_in);
    pred1_in_64x2 = vreinterpretq_s64_s16(pred1_in);
    pred2_in_64x2 = vreinterpretq_s64_s16(pred2_in);
    pred3_in_64x2 = vreinterpretq_s64_s16(pred3_in);
    pred4_in_64x2 = vreinterpretq_s64_s16(pred4_in);
    pred5_in_64x2 = vreinterpretq_s64_s16(pred5_in);
    pred6_in_64x2 = vreinterpretq_s64_s16(pred6_in);
    pred7_in_64x2 = vreinterpretq_s64_s16(pred7_in);

    pred_b0_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred0_in_64x2), vget_low_s64(pred1_in_64x2)));
    pred_b0_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred2_in_64x2), vget_low_s64(pred3_in_64x2)));
    pred_b1_r01_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred0_in_64x2), vget_high_s64(pred1_in_64x2)));
    pred_b1_r23_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred2_in_64x2), vget_high_s64(pred3_in_64x2)));
    pred_b2_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred4_in_64x2), vget_low_s64(pred5_in_64x2)));
    pred_b2_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_low_s64(pred6_in_64x2), vget_low_s64(pred7_in_64x2)));
    pred_b3_r45_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred4_in_64x2), vget_high_s64(pred5_in_64x2)));
    pred_b3_r67_in = vreinterpretq_s16_s64(
        vcombine_s64(vget_high_s64(pred6_in_64x2), vget_high_s64(pred7_in_64x2)));

    dup_val_1 = vabsq_s16(pred_b0_r01_in);
    dup_val_2 = vabsq_s16(pred_b0_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b0 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b1_r01_in);
    dup_val_2 = vabsq_s16(pred_b1_r23_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b1 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b2_r45_in);
    dup_val_2 = vabsq_s16(pred_b2_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b2 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    dup_val_1 = vabsq_s16(pred_b3_r45_in);
    dup_val_2 = vabsq_s16(pred_b3_r67_in);
    dup_abs = vqaddq_s16(dup_val_1, dup_val_2);
    nnz_b3 = dup_abs[0] || dup_abs[1] || dup_abs[2] || dup_abs[3] || dup_abs[4] || dup_abs[5] ||
             dup_abs[6] || dup_abs[7];

    nnz = (nnz_b0 | (nnz_b1 << 1) | (nnz_b2 << 4) | (nnz_b3 << 5));

    vst1q_s16((int16_t *) (pi2_out), pred0_in);
    vst1q_s16((int16_t *) (pi2_out + out_strd), pred1_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 2)), pred2_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 3)), pred3_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 4)), pred4_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 5)), pred5_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 6)), pred6_in);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 7)), pred7_in);

    return nnz;
}