/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_iquant_itrans_neonintr.c
 *
 * @brief
 *  Contains definitions of functions for SVC inverse quantization, inverse
 *  transformation and residual computation
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_iquant_itrans_4x4_neonintr()
 *  - isvcd_iquant_itrans_8x8_neonintr()
 *  - isvcd_iquant_itrans_4x4_dc_neonintr()
 *  - isvcd_iquant_itrans_8x8_dc_neonintr()
 *  - isvcd_iquant_itrans_chroma_4x4_neonintr()
 *  - isvcd_iquant_itrans_chroma_4x4_dc_neonintr()
 *
 * @remarks
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_iquant_itrans.h"

/*****************************************************************************/
/* */
/* Function Name : isvcd_iquant_itrans_4x4_dc_neonintr */
/* */
/* Description : this function computes the inverse quantized and */
/* inverse transformed output */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/

void isvcd_iquant_itrans_4x4_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                         const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                         UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                         WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0, dup_min, dup_max;

    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);
    UNUSED(pi2_tmp);

    if(iq_start_idx == 0)
    {
        i4_iq_out_temp = pi2_src[0];
        INV_QUANT(i4_iq_out_temp, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        i4_iq_out_temp = pi2_dc_ld_addr[0];
    }

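    /* DC-only block: every residual sample is the rounded, scaled DC value
       ((dc + 32) >> 6), clamped to [RSD_MIN, RSD_MAX] and replicated over the
       4x4 block (all lanes of temp_0 hold the same value). */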
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    temp_0 = vminq_s16(temp_0, dup_max);
    temp_0 = vmaxq_s16(temp_0, dup_min);

    vst1_s16((int16_t *) (pi2_out), vget_low_s16(temp_0));
    vst1_s16((int16_t *) (pi2_out + out_strd), vget_high_s16(temp_0));
    vst1_s16((int16_t *) (pi2_out + (out_strd * 2)), vget_low_s16(temp_0));
    vst1_s16((int16_t *) (pi2_out + (out_strd * 3)), vget_high_s16(temp_0));
}

/*****************************************************************************/
/* */
/* Function Name : isvcd_iquant_itrans_chroma_4x4_dc_neonintr */
/* */
/* Description : this function computes the inverse quantized and */
/* inverse transformed output */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/

void isvcd_iquant_itrans_chroma_4x4_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                                const UWORD16 *pu2_iscal_mat,
                                                const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                                WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x8_t temp_0, dup_max, dup_min;
    WORD32 i4_iq_out_temp;

    int16x8_t i4_out_horz_16x8_r0, i4_out_horz_16x8_r1, i4_out_horz_16x8_r2, i4_out_horz_16x8_r3;
    uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

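    /* The chroma DC value is read directly from pi2_dc_src; the scale/weight
       matrices and qp are unused here because inverse quantization of the
       chroma DC is assumed to have been done by the caller. Only rounding,
       clamping and replication over the 4x4 block are performed below. */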
    i4_iq_out_temp = pi2_dc_src[0];
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);
    temp_0 = vminq_s16(temp_0, dup_max);
    temp_0 = vmaxq_s16(temp_0, dup_min);

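    /* pi2_out holds interleaved Cb/Cr samples: load each output row and use a
       bit-select with chroma_mask_16x8 (alternate 16-bit lanes set) so that
       only the current plane's samples are overwritten. */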
    i4_out_horz_16x8_r0 = vld1q_s16(pi2_out);
    i4_out_horz_16x8_r1 = vld1q_s16(pi2_out + out_strd);
    i4_out_horz_16x8_r2 = vld1q_s16(pi2_out + out_strd * 2);
    i4_out_horz_16x8_r3 = vld1q_s16(pi2_out + out_strd * 3);

    i4_out_horz_16x8_r0 = vbslq_s16(chroma_mask_16x8, temp_0, i4_out_horz_16x8_r0);
    i4_out_horz_16x8_r1 = vbslq_s16(chroma_mask_16x8, temp_0, i4_out_horz_16x8_r1);
    i4_out_horz_16x8_r2 = vbslq_s16(chroma_mask_16x8, temp_0, i4_out_horz_16x8_r2);
    i4_out_horz_16x8_r3 = vbslq_s16(chroma_mask_16x8, temp_0, i4_out_horz_16x8_r3);

    vst1q_s16((int16_t *) (pi2_out), i4_out_horz_16x8_r0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), i4_out_horz_16x8_r1);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 2), i4_out_horz_16x8_r2);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 3), i4_out_horz_16x8_r3);
}

/*****************************************************************************/
/* */
/* Function Name : isvcd_iquant_itrans_8x8_dc_neonintr */
/* */
/* Description : this function computes the inverse quantized and */
/* inverse transformed output */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/

void isvcd_iquant_itrans_8x8_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                         const UWORD16 *pu2_iscale_mat,
                                         const UWORD16 *pu2_weigh_mat, UWORD32 qp_div,
                                         WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                         WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_iq_out_temp;
    int16x8_t temp_0;
    int16x8_t dup_max, dup_min;

    WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;
    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);
    i4_iq_out_temp = pi2_src[0];

    INV_QUANT(i4_iq_out_temp, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);

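    /* DC-only 8x8 block: the single inverse-quantized DC value is rounded
       ((dc + 32) >> 6), clamped to [RSD_MIN, RSD_MAX] and written to all 64
       residual positions. */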
    temp_0 = vdupq_n_s16((i4_iq_out_temp + 32) >> 6);
    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);
    temp_0 = vminq_s16(temp_0, dup_max);
    temp_0 = vmaxq_s16(temp_0, dup_min);

    vst1q_s16((int16_t *) (pi2_out), temp_0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), temp_0);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 2)), temp_0);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 3)), temp_0);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 4)), temp_0);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 5)), temp_0);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 6)), temp_0);
    vst1q_s16((int16_t *) (pi2_out + (out_strd * 7)), temp_0);
}

/*****************************************************************************/
/* */
/* Function Name : isvcd_iquant_itrans_chroma_4x4_neonintr */
/* */
/* Description : this function computes the inverse quantized and */
/* inverse transformed output */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/

void isvcd_iquant_itrans_chroma_4x4_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                             const UWORD16 *pu2_iscal_mat,
                                             const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                             WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
    int16x4_t zero_16x4 = vdup_n_s16(0);
    int16x4_t x_16x4_low, x_16x4_high;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x4_t dup_max, dup_min;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2, x_16x4x2_t;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t i4_out_horz_16x8_r0, i4_out_horz_16x8_r1, i4_out_horz_16x8_r2, i4_out_horz_16x8_r3;
    uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    UNUSED(pi2_tmp);

    dup_min = vdup_n_s16(RSD_MIN);
    dup_max = vdup_n_s16(RSD_MAX);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

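    /* Inverse quantization per coefficient:
       (coeff * scale * weight + rnd_fact) << (qp / 6), followed by a
       saturating narrowing shift right by 4, leaving 16-bit dequantized
       values. */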
    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

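    /* horizontal inverse transform */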
    rq1_16x4 = vshr_n_s16(q1_16x4, 1);  // q1 >> 1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);  // q3 >> 1

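    /* Lane 0 carries the block DC, which is supplied already processed in
       pi2_dc_src; overwrite the dequantized lane 0 with it. */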
    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 = q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 = q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 = q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0-x3

    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);  // x1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);  // x3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // xx0 = x0 + x2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // xx1 = x0 - x2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // xx2 = x1>>1 - x3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // xx3 = x1 + x3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = xx0 + xx3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = xx1 + xx2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = xx1 - xx2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = xx0 - xx3

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    x0_16x4 = vmin_s16(x0_16x4, dup_max);
    x0_16x4 = vmax_s16(x0_16x4, dup_min);
    x1_16x4 = vmin_s16(x1_16x4, dup_max);
    x1_16x4 = vmax_s16(x1_16x4, dup_min);
    x2_16x4 = vmin_s16(x2_16x4, dup_max);
    x2_16x4 = vmax_s16(x2_16x4, dup_min);
    x3_16x4 = vmin_s16(x3_16x4, dup_max);
    x3_16x4 = vmax_s16(x3_16x4, dup_min);

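    /* Interleave each row of residuals with zeros so the four results occupy
       alternate 16-bit lanes, matching the interleaved Cb/Cr layout of
       pi2_out; the bit-select below then updates only this plane's samples. */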
    x_16x4x2_t = vzip_s16(x0_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x0_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x1_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x1_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x2_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x2_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x3_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x3_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    i4_out_horz_16x8_r0 = vld1q_s16(pi2_out);
    i4_out_horz_16x8_r1 = vld1q_s16(pi2_out + out_strd);
    i4_out_horz_16x8_r2 = vld1q_s16(pi2_out + out_strd * 2);
    i4_out_horz_16x8_r3 = vld1q_s16(pi2_out + out_strd * 3);

    i4_out_horz_16x8_r0 = vbslq_s16(chroma_mask_16x8, x0_16x8, i4_out_horz_16x8_r0);
    i4_out_horz_16x8_r1 = vbslq_s16(chroma_mask_16x8, x1_16x8, i4_out_horz_16x8_r1);
    i4_out_horz_16x8_r2 = vbslq_s16(chroma_mask_16x8, x2_16x8, i4_out_horz_16x8_r2);
    i4_out_horz_16x8_r3 = vbslq_s16(chroma_mask_16x8, x3_16x8, i4_out_horz_16x8_r3);

    vst1q_s16((int16_t *) (pi2_out), i4_out_horz_16x8_r0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), i4_out_horz_16x8_r1);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 2), i4_out_horz_16x8_r2);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 3), i4_out_horz_16x8_r3);
}

/*****************************************************************************/
/* */
/* Function Name : isvcd_iquant_itrans_8x8_neonintr */
/* */
/* Description : this function computes the inverse quantized and */
/* inverse transformed output */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/

void isvcd_iquant_itrans_8x8_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                      const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
                                      UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                      WORD16 *pi2_dc_ld_addr)
{
    int16x8_t iscal_16x8_0, iscal_16x8_1, iscal_16x8_2, iscal_16x8_3, iscal_16x8_4, iscal_16x8_5,
        iscal_16x8_6, iscal_16x8_7;

    int16x8_t weigh_16x8_0, weigh_16x8_1, weigh_16x8_2, weigh_16x8_3, weigh_16x8_4, weigh_16x8_5,
        weigh_16x8_6, weigh_16x8_7;

    int16x8_t src_16x8_0, src_16x8_1, src_16x8_2, src_16x8_3, src_16x8_4, src_16x8_5, src_16x8_6,
        src_16x8_7;
    int16x8_t coeff_mul_16x8_0, coeff_mul_16x8_1, coeff_mul_16x8_2, coeff_mul_16x8_3,
        coeff_mul_16x8_4, coeff_mul_16x8_5, coeff_mul_16x8_6, coeff_mul_16x8_7;

    int32x4_t quant_res_32x4_l_0, quant_res_32x4_l_1, quant_res_32x4_l_2, quant_res_32x4_l_3,
        quant_res_32x4_l_4, quant_res_32x4_l_5, quant_res_32x4_l_6, quant_res_32x4_l_7;
    int32x4_t quant_res_32x4_h_0, quant_res_32x4_h_1, quant_res_32x4_h_2, quant_res_32x4_h_3,
        quant_res_32x4_h_4, quant_res_32x4_h_5, quant_res_32x4_h_6, quant_res_32x4_h_7;
    int16x4_t quant_res_16x4_l_0, quant_res_16x4_l_1, quant_res_16x4_l_2, quant_res_16x4_l_3,
        quant_res_16x4_l_4, quant_res_16x4_l_5, quant_res_16x4_l_6, quant_res_16x4_l_7;
    int16x4_t quant_res_16x4_h_0, quant_res_16x4_h_1, quant_res_16x4_h_2, quant_res_16x4_h_3,
        quant_res_16x4_h_4, quant_res_16x4_h_5, quant_res_16x4_h_6, quant_res_16x4_h_7;

    int16x8_t quant_res_16x8_0, quant_res_16x8_1, quant_res_16x8_2, quant_res_16x8_3,
        quant_res_16x8_4, quant_res_16x8_5, quant_res_16x8_6, quant_res_16x8_7;

    int16x8_t trans_16x8_0, trans_16x8_1, trans_16x8_2, trans_16x8_3, trans_16x8_4, trans_16x8_5,
        trans_16x8_6, trans_16x8_7;
    int32x4_t trans_32x4_0, trans_32x4_1, trans_32x4_2, trans_32x4_3, trans_32x4_4, trans_32x4_5,
        trans_32x4_6, trans_32x4_7;
    int64x2_t trans_64x2_0, trans_64x2_1, trans_64x2_2, trans_64x2_3, trans_64x2_4, trans_64x2_5,
        trans_64x2_6, trans_64x2_7;
    int16x4_t trans_16x4_1_l, trans_16x4_3_l, trans_16x4_5_l, trans_16x4_7_l;
    int16x8_t rs_trans_16x8_1, rs_trans_16x8_2, rs_trans_16x8_3, rs_trans_16x8_5, rs_trans_16x8_6,
        rs_trans_16x8_7;
    int32x4_t sub_3_5_l, sub_3_5_h;
    int32x4_t add_3_5_l, add_3_5_h;
    int32x4_t sub_1_7_l, sub_1_7_h;
    int32x4_t add_1_7_l, add_1_7_h;
    int32x4_t sub_357_l, sub_357_h;
    int32x4_t add_351_l, add_351_h;
    int32x4_t add_175_l, add_175_h;
    int32x4_t sub_173_l, sub_173_h;
    int32x4_t y1_32x4_l, y1_32x4_h;
    int32x4_t y3_32x4_l, y3_32x4_h;
    int32x4_t y5_32x4_l, y5_32x4_h;
    int32x4_t y7_32x4_l, y7_32x4_h;
    int16x4_t y1_16x4_l, y3_16x4_l, y5_16x4_l, y7_16x4_l;
    int16x4_t y1_16x4_h, y3_16x4_h, y5_16x4_h, y7_16x4_h;

    int16x8_t y0_16x8, y1_16x8, y2_16x8, y3_16x8, y4_16x8, y5_16x8, y6_16x8, y7_16x8;
    int16x8_t rs_y1_16x8, rs_y3_16x8, rs_y5_16x8, rs_y7_16x8;
    int16x8_t z0_16x8, z1_16x8, z2_16x8, z3_16x8, z4_16x8, z5_16x8, z6_16x8, z7_16x8;
    int16x8_t dup_max, dup_min;
    int32x4_t qp_div_32x4 = vdupq_n_s32(qp_div);
    int16x8x2_t trans_16x8_0_1, trans_16x8_2_3, trans_16x8_4_5, trans_16x8_6_7;
    int32x4x2_t trans_32x4_0_2, trans_32x4_1_3, trans_32x4_4_6, trans_32x4_5_7;
    WORD32 i;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    dup_min = vdupq_n_s16(RSD_MIN);
    dup_max = vdupq_n_s16(RSD_MAX);

    iscal_16x8_0 = vld1q_s16((const int16_t *) pu2_iscale_mat);
    iscal_16x8_1 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 8));
    iscal_16x8_2 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 16));
    iscal_16x8_3 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 24));
    iscal_16x8_4 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 32));
    iscal_16x8_5 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 40));
    iscal_16x8_6 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 48));
    iscal_16x8_7 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 56));

    weigh_16x8_0 = vld1q_s16((const int16_t *) pu2_weigh_mat);
    weigh_16x8_1 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 8));
    weigh_16x8_2 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 16));
    weigh_16x8_3 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 24));
    weigh_16x8_4 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 32));
    weigh_16x8_5 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 40));
    weigh_16x8_6 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 48));
    weigh_16x8_7 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 56));

    src_16x8_0 = vld1q_s16((const int16_t *) pi2_src);        // a0 a1 a2 a3 a4 a5 a6 a7
    src_16x8_1 = vld1q_s16((const int16_t *) (pi2_src + 8));  // b0 b1 b2 b3 b4 b5 b6 b7
    src_16x8_2 = vld1q_s16((const int16_t *) (pi2_src + 16));
    src_16x8_3 = vld1q_s16((const int16_t *) (pi2_src + 24));
    src_16x8_4 = vld1q_s16((const int16_t *) (pi2_src + 32));
    src_16x8_5 = vld1q_s16((const int16_t *) (pi2_src + 40));
    src_16x8_6 = vld1q_s16((const int16_t *) (pi2_src + 48));
    src_16x8_7 = vld1q_s16((const int16_t *) (pi2_src + 56));

    coeff_mul_16x8_0 = vmulq_s16(iscal_16x8_0, weigh_16x8_0);
    coeff_mul_16x8_1 = vmulq_s16(iscal_16x8_1, weigh_16x8_1);
    coeff_mul_16x8_2 = vmulq_s16(iscal_16x8_2, weigh_16x8_2);
    coeff_mul_16x8_3 = vmulq_s16(iscal_16x8_3, weigh_16x8_3);
    coeff_mul_16x8_4 = vmulq_s16(iscal_16x8_4, weigh_16x8_4);
    coeff_mul_16x8_5 = vmulq_s16(iscal_16x8_5, weigh_16x8_5);
    coeff_mul_16x8_6 = vmulq_s16(iscal_16x8_6, weigh_16x8_6);
    coeff_mul_16x8_7 = vmulq_s16(iscal_16x8_7, weigh_16x8_7);

    quant_res_32x4_l_0 = vmull_s16(vget_low_s16(coeff_mul_16x8_0), vget_low_s16(src_16x8_0));
    quant_res_32x4_l_1 = vmull_s16(vget_low_s16(coeff_mul_16x8_1), vget_low_s16(src_16x8_1));
    quant_res_32x4_l_2 = vmull_s16(vget_low_s16(coeff_mul_16x8_2), vget_low_s16(src_16x8_2));
    quant_res_32x4_l_3 = vmull_s16(vget_low_s16(coeff_mul_16x8_3), vget_low_s16(src_16x8_3));
    quant_res_32x4_l_4 = vmull_s16(vget_low_s16(coeff_mul_16x8_4), vget_low_s16(src_16x8_4));
    quant_res_32x4_l_5 = vmull_s16(vget_low_s16(coeff_mul_16x8_5), vget_low_s16(src_16x8_5));
    quant_res_32x4_l_6 = vmull_s16(vget_low_s16(coeff_mul_16x8_6), vget_low_s16(src_16x8_6));
    quant_res_32x4_l_7 = vmull_s16(vget_low_s16(coeff_mul_16x8_7), vget_low_s16(src_16x8_7));

    quant_res_32x4_h_0 = vmull_s16(vget_high_s16(coeff_mul_16x8_0), vget_high_s16(src_16x8_0));
    quant_res_32x4_h_1 = vmull_s16(vget_high_s16(coeff_mul_16x8_1), vget_high_s16(src_16x8_1));
    quant_res_32x4_h_2 = vmull_s16(vget_high_s16(coeff_mul_16x8_2), vget_high_s16(src_16x8_2));
    quant_res_32x4_h_3 = vmull_s16(vget_high_s16(coeff_mul_16x8_3), vget_high_s16(src_16x8_3));
    quant_res_32x4_h_4 = vmull_s16(vget_high_s16(coeff_mul_16x8_4), vget_high_s16(src_16x8_4));
    quant_res_32x4_h_5 = vmull_s16(vget_high_s16(coeff_mul_16x8_5), vget_high_s16(src_16x8_5));
    quant_res_32x4_h_6 = vmull_s16(vget_high_s16(coeff_mul_16x8_6), vget_high_s16(src_16x8_6));
    quant_res_32x4_h_7 = vmull_s16(vget_high_s16(coeff_mul_16x8_7), vget_high_s16(src_16x8_7));

    quant_res_32x4_l_0 = vshlq_s32(quant_res_32x4_l_0, qp_div_32x4);
    quant_res_32x4_l_1 = vshlq_s32(quant_res_32x4_l_1, qp_div_32x4);
    quant_res_32x4_l_2 = vshlq_s32(quant_res_32x4_l_2, qp_div_32x4);
    quant_res_32x4_l_3 = vshlq_s32(quant_res_32x4_l_3, qp_div_32x4);
    quant_res_32x4_l_4 = vshlq_s32(quant_res_32x4_l_4, qp_div_32x4);
    quant_res_32x4_l_5 = vshlq_s32(quant_res_32x4_l_5, qp_div_32x4);
    quant_res_32x4_l_6 = vshlq_s32(quant_res_32x4_l_6, qp_div_32x4);
    quant_res_32x4_l_7 = vshlq_s32(quant_res_32x4_l_7, qp_div_32x4);

    quant_res_32x4_h_0 = vshlq_s32(quant_res_32x4_h_0, qp_div_32x4);
    quant_res_32x4_h_1 = vshlq_s32(quant_res_32x4_h_1, qp_div_32x4);
    quant_res_32x4_h_2 = vshlq_s32(quant_res_32x4_h_2, qp_div_32x4);
    quant_res_32x4_h_3 = vshlq_s32(quant_res_32x4_h_3, qp_div_32x4);
    quant_res_32x4_h_4 = vshlq_s32(quant_res_32x4_h_4, qp_div_32x4);
    quant_res_32x4_h_5 = vshlq_s32(quant_res_32x4_h_5, qp_div_32x4);
    quant_res_32x4_h_6 = vshlq_s32(quant_res_32x4_h_6, qp_div_32x4);
    quant_res_32x4_h_7 = vshlq_s32(quant_res_32x4_h_7, qp_div_32x4);

    quant_res_16x4_l_0 = vqrshrn_n_s32(quant_res_32x4_l_0, 6);
    quant_res_16x4_l_1 = vqrshrn_n_s32(quant_res_32x4_l_1, 6);
    quant_res_16x4_l_2 = vqrshrn_n_s32(quant_res_32x4_l_2, 6);
    quant_res_16x4_l_3 = vqrshrn_n_s32(quant_res_32x4_l_3, 6);
    quant_res_16x4_l_4 = vqrshrn_n_s32(quant_res_32x4_l_4, 6);
    quant_res_16x4_l_5 = vqrshrn_n_s32(quant_res_32x4_l_5, 6);
    quant_res_16x4_l_6 = vqrshrn_n_s32(quant_res_32x4_l_6, 6);
    quant_res_16x4_l_7 = vqrshrn_n_s32(quant_res_32x4_l_7, 6);

    quant_res_16x4_h_0 = vqrshrn_n_s32(quant_res_32x4_h_0, 6);
    quant_res_16x4_h_1 = vqrshrn_n_s32(quant_res_32x4_h_1, 6);
    quant_res_16x4_h_2 = vqrshrn_n_s32(quant_res_32x4_h_2, 6);
    quant_res_16x4_h_3 = vqrshrn_n_s32(quant_res_32x4_h_3, 6);
    quant_res_16x4_h_4 = vqrshrn_n_s32(quant_res_32x4_h_4, 6);
    quant_res_16x4_h_5 = vqrshrn_n_s32(quant_res_32x4_h_5, 6);
    quant_res_16x4_h_6 = vqrshrn_n_s32(quant_res_32x4_h_6, 6);
    quant_res_16x4_h_7 = vqrshrn_n_s32(quant_res_32x4_h_7, 6);

    quant_res_16x8_0 = vcombine_s16(quant_res_16x4_l_0, quant_res_16x4_h_0);
    quant_res_16x8_1 = vcombine_s16(quant_res_16x4_l_1, quant_res_16x4_h_1);
    quant_res_16x8_2 = vcombine_s16(quant_res_16x4_l_2, quant_res_16x4_h_2);
    quant_res_16x8_3 = vcombine_s16(quant_res_16x4_l_3, quant_res_16x4_h_3);
    quant_res_16x8_4 = vcombine_s16(quant_res_16x4_l_4, quant_res_16x4_h_4);
    quant_res_16x8_5 = vcombine_s16(quant_res_16x4_l_5, quant_res_16x4_h_5);
    quant_res_16x8_6 = vcombine_s16(quant_res_16x4_l_6, quant_res_16x4_h_6);
    quant_res_16x8_7 = vcombine_s16(quant_res_16x4_l_7, quant_res_16x4_h_7);

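    /* Two passes of the 1-D 8x8 inverse transform: each iteration first
       transposes the 8x8 block of 16-bit coefficients and then applies the
       butterfly, so the same code serves both the row and the column pass. */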
    for(i = 0; i < 2; i++)
    {
        trans_16x8_0_1 = vtrnq_s16(quant_res_16x8_0, quant_res_16x8_1);
        trans_16x8_0 = trans_16x8_0_1.val[0];
        trans_16x8_1 = trans_16x8_0_1.val[1];

        trans_16x8_2_3 = vtrnq_s16(quant_res_16x8_2, quant_res_16x8_3);
        trans_16x8_2 = trans_16x8_2_3.val[0];
        trans_16x8_3 = trans_16x8_2_3.val[1];

        trans_16x8_4_5 = vtrnq_s16(quant_res_16x8_4, quant_res_16x8_5);
        trans_16x8_4 = trans_16x8_4_5.val[0];
        trans_16x8_5 = trans_16x8_4_5.val[1];

        trans_16x8_6_7 = vtrnq_s16(quant_res_16x8_6, quant_res_16x8_7);
        trans_16x8_6 = trans_16x8_6_7.val[0];
        trans_16x8_7 = trans_16x8_6_7.val[1];

        trans_32x4_0_2 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_0), vreinterpretq_s32_s16(trans_16x8_2));
        trans_32x4_0 = trans_32x4_0_2.val[0];
        trans_32x4_2 = trans_32x4_0_2.val[1];

        trans_32x4_1_3 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_1), vreinterpretq_s32_s16(trans_16x8_3));
        trans_32x4_1 = trans_32x4_1_3.val[0];
        trans_32x4_3 = trans_32x4_1_3.val[1];

        trans_32x4_4_6 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_4), vreinterpretq_s32_s16(trans_16x8_6));
        trans_32x4_4 = trans_32x4_4_6.val[0];
        trans_32x4_6 = trans_32x4_4_6.val[1];

        trans_32x4_5_7 =
            vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_5), vreinterpretq_s32_s16(trans_16x8_7));
        trans_32x4_5 = trans_32x4_5_7.val[0];
        trans_32x4_7 = trans_32x4_5_7.val[1];

        trans_64x2_0 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_4)));
        trans_64x2_4 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_0)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_4)));

        trans_64x2_1 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_5)));
        trans_64x2_5 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_1)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_5)));

        trans_64x2_2 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_6)));
        trans_64x2_6 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_2)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_6)));

        trans_64x2_3 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_low_s32(trans_32x4_7)));
        trans_64x2_7 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_3)),
                                    vreinterpret_s64_s32(vget_high_s32(trans_32x4_7)));

        trans_16x8_0 = vreinterpretq_s16_s64(trans_64x2_0);
        trans_16x8_1 = vreinterpretq_s16_s64(trans_64x2_1);
        trans_16x8_2 = vreinterpretq_s16_s64(trans_64x2_2);
        trans_16x8_3 = vreinterpretq_s16_s64(trans_64x2_3);
        trans_16x8_4 = vreinterpretq_s16_s64(trans_64x2_4);
        trans_16x8_5 = vreinterpretq_s16_s64(trans_64x2_5);
        trans_16x8_6 = vreinterpretq_s16_s64(trans_64x2_6);
        trans_16x8_7 = vreinterpretq_s16_s64(trans_64x2_7);

        rs_trans_16x8_1 = vshrq_n_s16(trans_16x8_1, 1);
        rs_trans_16x8_2 = vshrq_n_s16(trans_16x8_2, 1);
        rs_trans_16x8_3 = vshrq_n_s16(trans_16x8_3, 1);
        rs_trans_16x8_5 = vshrq_n_s16(trans_16x8_5, 1);
        rs_trans_16x8_6 = vshrq_n_s16(trans_16x8_6, 1);
        rs_trans_16x8_7 = vshrq_n_s16(trans_16x8_7, 1);

        y0_16x8 = vaddq_s16(trans_16x8_0, trans_16x8_4);
        y2_16x8 = vsubq_s16(trans_16x8_0, trans_16x8_4);
        y4_16x8 = vsubq_s16(rs_trans_16x8_2, trans_16x8_6);
        y6_16x8 = vaddq_s16(trans_16x8_2, rs_trans_16x8_6);

        trans_16x4_3_l = vget_low_s16(trans_16x8_3);
        trans_16x4_5_l = vget_low_s16(trans_16x8_5);

        //-w3 + w5
        sub_3_5_l = vsubl_s16(vget_low_s16(trans_16x8_5), vget_low_s16(trans_16x8_3));
        sub_3_5_h = vsubl_s16(vget_high_s16(trans_16x8_5), vget_high_s16(trans_16x8_3));

        // w3 + w5
        add_3_5_l = vaddl_s16(trans_16x4_3_l, trans_16x4_5_l);
        add_3_5_h = vaddl_s16(vget_high_s16(trans_16x8_3), vget_high_s16(trans_16x8_5));

        trans_16x4_1_l = vget_low_s16(trans_16x8_1);
        trans_16x4_7_l = vget_low_s16(trans_16x8_7);

        //-w1 + w7
        sub_1_7_l = vsubl_s16(trans_16x4_7_l, trans_16x4_1_l);
        sub_1_7_h = vsubl_s16(vget_high_s16(trans_16x8_7), vget_high_s16(trans_16x8_1));

        // w1 + w7
        add_1_7_l = vaddl_s16(trans_16x4_1_l, trans_16x4_7_l);
        add_1_7_h = vaddl_s16(vget_high_s16(trans_16x8_1), vget_high_s16(trans_16x8_7));

        //-w3 + w5 - w7
        sub_357_l = vsubw_s16(sub_3_5_l, trans_16x4_7_l);
        sub_357_h = vsubw_s16(sub_3_5_h, vget_high_s16(trans_16x8_7));

        // w3 + w5 + w1
        add_351_l = vaddw_s16(add_3_5_l, trans_16x4_1_l);
        add_351_h = vaddw_s16(add_3_5_h, vget_high_s16(trans_16x8_1));

        //-w1 + w7 + w5
        add_175_l = vaddw_s16(sub_1_7_l, trans_16x4_5_l);
        add_175_h = vaddw_s16(sub_1_7_h, vget_high_s16(trans_16x8_5));

        // w1 + w7 - w3
        sub_173_l = vsubw_s16(add_1_7_l, trans_16x4_3_l);
        sub_173_h = vsubw_s16(add_1_7_h, vget_high_s16(trans_16x8_3));

        //-w3 + w5 - w7 - (w7 >> 1)
        y1_32x4_l = vsubw_s16(sub_357_l, vget_low_s16(rs_trans_16x8_7));
        y1_32x4_h = vsubw_s16(sub_357_h, vget_high_s16(rs_trans_16x8_7));

        // w1 + w7 - w3 - (w3 >> 1)
        y3_32x4_l = vsubw_s16(sub_173_l, vget_low_s16(rs_trans_16x8_3));
        y3_32x4_h = vsubw_s16(sub_173_h, vget_high_s16(rs_trans_16x8_3));

        //-w1 + w7 + w5 + (w5 >> 1)
        y5_32x4_l = vaddw_s16(add_175_l, vget_low_s16(rs_trans_16x8_5));
        y5_32x4_h = vaddw_s16(add_175_h, vget_high_s16(rs_trans_16x8_5));

        // w3 + w5 + w1 + (w1 >> 1)
        y7_32x4_l = vaddw_s16(add_351_l, vget_low_s16(rs_trans_16x8_1));
        y7_32x4_h = vaddw_s16(add_351_h, vget_high_s16(rs_trans_16x8_1));

        y1_16x4_l = vmovn_s32(y1_32x4_l);
        y1_16x4_h = vmovn_s32(y1_32x4_h);
        y1_16x8 = vcombine_s16(y1_16x4_l, y1_16x4_h);
        y3_16x4_l = vmovn_s32(y3_32x4_l);
        y3_16x4_h = vmovn_s32(y3_32x4_h);
        y3_16x8 = vcombine_s16(y3_16x4_l, y3_16x4_h);
        y5_16x4_l = vmovn_s32(y5_32x4_l);
        y5_16x4_h = vmovn_s32(y5_32x4_h);
        y5_16x8 = vcombine_s16(y5_16x4_l, y5_16x4_h);
        y7_16x4_l = vmovn_s32(y7_32x4_l);
        y7_16x4_h = vmovn_s32(y7_32x4_h);
        y7_16x8 = vcombine_s16(y7_16x4_l, y7_16x4_h);

        rs_y1_16x8 = vshrq_n_s16(y1_16x8, 2);
        rs_y3_16x8 = vshrq_n_s16(y3_16x8, 2);
        rs_y5_16x8 = vshrq_n_s16(y5_16x8, 2);
        rs_y7_16x8 = vshrq_n_s16(y7_16x8, 2);

        z0_16x8 = vaddq_s16(y0_16x8, y6_16x8);     // z0 = y0 + y6
        z1_16x8 = vaddq_s16(y1_16x8, rs_y7_16x8);  // z1 = y1 + (y7 >> 2)
        z2_16x8 = vaddq_s16(y2_16x8, y4_16x8);     // z2 = y2 + y4
        z3_16x8 = vaddq_s16(y3_16x8, rs_y5_16x8);  // z3 = y3 + (y5 >> 2)
        z4_16x8 = vsubq_s16(y2_16x8, y4_16x8);     // z4 = y2 - y4
        z5_16x8 = vsubq_s16(rs_y3_16x8, y5_16x8);  // z5 = (y3 >> 2) - y5
        z6_16x8 = vsubq_s16(y0_16x8, y6_16x8);     // z6 = y0 - y6
        z7_16x8 = vsubq_s16(y7_16x8, rs_y1_16x8);  // z7 = y7 - (y1 >> 2)

        quant_res_16x8_0 = vaddq_s16(z0_16x8, z7_16x8);  // x0 = z0 + z7
        quant_res_16x8_1 = vaddq_s16(z2_16x8, z5_16x8);  // x1 = z2 + z5
        quant_res_16x8_2 = vaddq_s16(z4_16x8, z3_16x8);  // x2 = z4 + z3
        quant_res_16x8_3 = vaddq_s16(z6_16x8, z1_16x8);  // x3 = z6 + z1
        quant_res_16x8_4 = vsubq_s16(z6_16x8, z1_16x8);  // x4 = z6 - z1
        quant_res_16x8_5 = vsubq_s16(z4_16x8, z3_16x8);  // x5 = z4 - z3
        quant_res_16x8_6 = vsubq_s16(z2_16x8, z5_16x8);  // x6 = z2 - z5
        quant_res_16x8_7 = vsubq_s16(z0_16x8, z7_16x8);  // x7 = z0 - z7
    }

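    /* Final rounding of the 2-D transform output ((x + 32) >> 6 via the
       rounding shift), followed by clamping to [RSD_MIN, RSD_MAX] before the
       residual rows are stored. */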
    quant_res_16x8_0 = vrshrq_n_s16(quant_res_16x8_0, 6);
    quant_res_16x8_1 = vrshrq_n_s16(quant_res_16x8_1, 6);
    quant_res_16x8_2 = vrshrq_n_s16(quant_res_16x8_2, 6);
    quant_res_16x8_3 = vrshrq_n_s16(quant_res_16x8_3, 6);
    quant_res_16x8_4 = vrshrq_n_s16(quant_res_16x8_4, 6);
    quant_res_16x8_5 = vrshrq_n_s16(quant_res_16x8_5, 6);
    quant_res_16x8_6 = vrshrq_n_s16(quant_res_16x8_6, 6);
    quant_res_16x8_7 = vrshrq_n_s16(quant_res_16x8_7, 6);

    quant_res_16x8_0 = vminq_s16(quant_res_16x8_0, dup_max);
    quant_res_16x8_0 = vmaxq_s16(quant_res_16x8_0, dup_min);
    quant_res_16x8_1 = vminq_s16(quant_res_16x8_1, dup_max);
    quant_res_16x8_1 = vmaxq_s16(quant_res_16x8_1, dup_min);
    quant_res_16x8_2 = vminq_s16(quant_res_16x8_2, dup_max);
    quant_res_16x8_2 = vmaxq_s16(quant_res_16x8_2, dup_min);
    quant_res_16x8_3 = vminq_s16(quant_res_16x8_3, dup_max);
    quant_res_16x8_3 = vmaxq_s16(quant_res_16x8_3, dup_min);
    quant_res_16x8_4 = vminq_s16(quant_res_16x8_4, dup_max);
    quant_res_16x8_4 = vmaxq_s16(quant_res_16x8_4, dup_min);
    quant_res_16x8_5 = vminq_s16(quant_res_16x8_5, dup_max);
    quant_res_16x8_5 = vmaxq_s16(quant_res_16x8_5, dup_min);
    quant_res_16x8_6 = vminq_s16(quant_res_16x8_6, dup_max);
    quant_res_16x8_6 = vmaxq_s16(quant_res_16x8_6, dup_min);
    quant_res_16x8_7 = vminq_s16(quant_res_16x8_7, dup_max);
    quant_res_16x8_7 = vmaxq_s16(quant_res_16x8_7, dup_min);

    vst1q_s16(pi2_out, quant_res_16x8_0);
    vst1q_s16(pi2_out + out_strd, quant_res_16x8_1);
    vst1q_s16(pi2_out + out_strd * 2, quant_res_16x8_2);
    vst1q_s16(pi2_out + out_strd * 3, quant_res_16x8_3);
    vst1q_s16(pi2_out + out_strd * 4, quant_res_16x8_4);
    vst1q_s16(pi2_out + out_strd * 5, quant_res_16x8_5);
    vst1q_s16(pi2_out + out_strd * 6, quant_res_16x8_6);
    vst1q_s16(pi2_out + out_strd * 7, quant_res_16x8_7);
}

/*****************************************************************************/
/* */
/* Function Name : isvcd_iquant_itrans_4x4_neonintr */
/* */
/* Description : this function computes the inverse quantized and */
/* inverse transformed output */
/* */
/* Inputs : */
/* Globals : none */
/* Processing : */
/* */
/* Outputs : none */
/* Returns : none */
/* */
/* Issues : none */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 25 11 2021 Kishore creation */
/* */
/*****************************************************************************/

void isvcd_iquant_itrans_4x4_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                      const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                      UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                      WORD16 *pi2_dc_ld_addr)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    int16x4_t dup_min, dup_max;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2;
    int32x2x2_t x0_32x2_2, x1_32x2_2;

    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);

    dup_min = vdup_n_s16(RSD_MIN);
    dup_max = vdup_n_s16(RSD_MAX);

    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

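    /* For blocks with iq_start_idx == 1 the DC coefficient has already been
       decoded separately (Intra 16x16 DC path); take it from pi2_dc_ld_addr
       instead of the inverse-quantized value computed above. */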
    if(iq_start_idx == 1)
    {
        q0_16x4 = vset_lane_s16(pi2_dc_ld_addr[0], q0_16x4, 0);
    }

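    /* horizontal inverse transform */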
    rq1_16x4 = vshr_n_s16(q1_16x4, 1);  // q1 >> 1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);  // q3 >> 1

    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 = q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 = q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 = q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x3 = q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0-x3

    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);  // x1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);  // x3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // xx0 = x0 + x2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // xx1 = x0 - x2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // xx2 = x1>>1 - x3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // xx3 = x1 + x3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = xx0 + xx3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = xx1 + xx2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = xx1 - xx2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = xx0 - xx3

    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    x0_16x4 = vmin_s16(x0_16x4, dup_max);
    x0_16x4 = vmax_s16(x0_16x4, dup_min);
    x1_16x4 = vmin_s16(x1_16x4, dup_max);
    x1_16x4 = vmax_s16(x1_16x4, dup_min);
    x2_16x4 = vmin_s16(x2_16x4, dup_max);
    x2_16x4 = vmax_s16(x2_16x4, dup_min);
    x3_16x4 = vmin_s16(x3_16x4, dup_max);
    x3_16x4 = vmax_s16(x3_16x4, dup_min);

    vst1_s16(pi2_out, x0_16x4);
    vst1_s16(pi2_out + out_strd, x1_16x4);
    vst1_s16(pi2_out + (out_strd << 1), x2_16x4);
    vst1_s16(pi2_out + ((out_strd << 1) + out_strd), x3_16x4);
}