xref: /aosp_15_r20/external/libavc/decoder/arm/svc/isvcd_iquant_itrans_neon.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21  *******************************************************************************
22  * @file
23  *  isvcd_iquant_itrans_neonintr.c
24  *
25  * @brief
26  *  Contains definition of functions for svc inverse quantization inverse
27  *    transformation and resd comp
28  *
29  * @author
30  *  Kishore
31  *
32  *  @par List of Functions:
33  *  - isvcd_iquant_itrans_4x4_neonintr()
34  *  - isvcd_iquant_itrans_8x8_neonintr()
35  *  - isvcd_iquant_itrans_4x4_dc_neonintr()
36  *  - isvcd_iquant_itrans_8x8_dc_neonintr()
37  *  - isvcd_iquant_itrans_chroma_4x4_neonintr()
38  *  - isvcd_iquant_itrans_chroma_4x4_dc_neonintr()
39  *
40  * @remarks
41  *
42  *******************************************************************************
43  */
44 
45 /*****************************************************************************/
46 /* File Includes                                                             */
47 /*****************************************************************************/
48 #include <string.h>
49 #include <arm_neon.h>
50 
51 /* User include files */
52 #include "ih264_typedefs.h"
53 #include "ih264_defs.h"
54 #include "ih264_trans_macros.h"
55 #include "ih264_macros.h"
56 #include "ih264_platform_macros.h"
57 #include "ih264_trans_data.h"
58 #include "ih264_size_defs.h"
59 #include "ih264_structs.h"
60 #include "isvcd_iquant_itrans.h"
61 
62 /*****************************************************************************/
63 /*                                                                           */
64 /*  Function Name : isvcd_iquant_itrans_4x4_dc_neonintr                       */
65 /*                                                                           */
66 /*  Description   : this function computes the inverse quantized and         */
67 /*                   inverse transformed output                              */
68 /*                                                                           */
69 /*  Inputs        :                                                          */
70 /*  Globals       : none                                                     */
71 /*  Processing    :                                                          */
72 /*                                                                           */
73 /*  Outputs       : none                                                     */
74 /*  Returns       : none                                                     */
75 /*                                                                           */
76 /*  Issues        : none                                                     */
77 /*                                                                           */
78 /*  Revision History:                                                        */
79 /*                                                                           */
80 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
81 /*         25 11 2021   Kishore               creation                       */
82 /*                                                                           */
83 /*****************************************************************************/
84 
void isvcd_iquant_itrans_4x4_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                         const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                         UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                         WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_dc_val;
    int16x8_t dc_res_16x8;
    /* Rounding term used by INV_QUANT for the 4-bit dequant shift */
    WORD16 i2_rnd_fact = (u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0;

    UNUSED(pi2_tmp);

    if(iq_start_idx != 0)
    {
        /* DC coefficient was already inverse-quantized by the caller */
        i4_dc_val = pi2_dc_ld_addr[0];
    }
    else
    {
        i4_dc_val = pi2_src[0];
        INV_QUANT(i4_dc_val, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, i2_rnd_fact, 4);
    }

    /* DC-only inverse transform: every residual sample is (dc + 32) >> 6,
       clamped to [RSD_MIN, RSD_MAX]; all vector lanes carry the same value */
    dc_res_16x8 = vdupq_n_s16((i4_dc_val + 32) >> 6);
    dc_res_16x8 = vmaxq_s16(vminq_s16(dc_res_16x8, vdupq_n_s16(RSD_MAX)), vdupq_n_s16(RSD_MIN));

    /* Write the 4x4 block, 4 samples per row */
    vst1_s16((int16_t *) (pi2_out), vget_low_s16(dc_res_16x8));
    vst1_s16((int16_t *) (pi2_out + out_strd), vget_low_s16(dc_res_16x8));
    vst1_s16((int16_t *) (pi2_out + (out_strd * 2)), vget_low_s16(dc_res_16x8));
    vst1_s16((int16_t *) (pi2_out + (out_strd * 3)), vget_low_s16(dc_res_16x8));
}
117 
118 /*****************************************************************************/
119 /*                                                                           */
120 /*  Function Name : isvcd_iquant_itrans_chroma_4x4_dc_neonintr                */
121 /*                                                                           */
122 /*  Description   : this function computes the inverse quantized and         */
123 /*                   inverse transformed output                              */
124 /*                                                                           */
125 /*  Inputs        :                                                          */
126 /*  Globals       : none                                                     */
127 /*  Processing    :                                                          */
128 /*                                                                           */
129 /*  Outputs       : none                                                     */
130 /*  Returns       : none                                                     */
131 /*                                                                           */
132 /*  Issues        : none                                                     */
133 /*                                                                           */
134 /*  Revision History:                                                        */
135 /*                                                                           */
136 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
137 /*         25 11 2021   Kishore               creation                       */
138 /*                                                                           */
139 /*****************************************************************************/
140 
void isvcd_iquant_itrans_chroma_4x4_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                                const UWORD16 *pu2_iscal_mat,
                                                const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                                WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    WORD32 i4_row;
    int16x8_t dc_res_16x8;
    /* Selects the even 16-bit lanes: the output buffer interleaves the two
       chroma planes, and this call updates only one of them */
    uint16x8_t even_lane_mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

    /* DC arrives pre-dequantized; DC-only inverse transform gives the same
       clamped (dc + 32) >> 6 residual for all 16 samples */
    dc_res_16x8 = vdupq_n_s16((pi2_dc_src[0] + 32) >> 6);
    dc_res_16x8 = vmaxq_s16(vminq_s16(dc_res_16x8, vdupq_n_s16(RSD_MAX)), vdupq_n_s16(RSD_MIN));

    for(i4_row = 0; i4_row < 4; i4_row++)
    {
        WORD16 *pi2_row = pi2_out + (i4_row * out_strd);
        int16x8_t out_row_16x8 = vld1q_s16(pi2_row);

        /* Overwrite this plane's lanes, keep the other plane untouched */
        out_row_16x8 = vbslq_s16(even_lane_mask, dc_res_16x8, out_row_16x8);
        vst1q_s16((int16_t *) pi2_row, out_row_16x8);
    }
}
180 
181 /*****************************************************************************/
182 /*                                                                           */
183 /*  Function Name : isvcd_iquant_itrans_8x8_dc_neonintr                       */
184 /*                                                                           */
185 /*  Description   : this function computes the inverse quantized and         */
186 /*                   inverse transformed output                              */
187 /*                                                                           */
188 /*  Inputs        :                                                          */
189 /*  Globals       : none                                                     */
190 /*  Processing    :                                                          */
191 /*                                                                           */
192 /*  Outputs       : none                                                     */
193 /*  Returns       : none                                                     */
194 /*                                                                           */
195 /*  Issues        : none                                                     */
196 /*                                                                           */
197 /*  Revision History:                                                        */
198 /*                                                                           */
199 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
200 /*         25 11 2021   Kishore               creation                       */
201 /*                                                                           */
202 /*****************************************************************************/
203 
void isvcd_iquant_itrans_8x8_dc_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                         const UWORD16 *pu2_iscale_mat,
                                         const UWORD16 *pu2_weigh_mat, UWORD32 qp_div,
                                         WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                         WORD16 *pi2_dc_ld_addr)
{
    WORD32 i4_dc_val;
    WORD32 i4_row;
    int16x8_t dc_res_16x8;
    /* Rounding term used by INV_QUANT for the 6-bit (8x8) dequant shift */
    WORD32 i4_rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    i4_dc_val = pi2_src[0];
    INV_QUANT(i4_dc_val, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, i4_rnd_fact, 6);

    /* DC-only inverse transform: broadcast the clamped (dc + 32) >> 6
       residual to all 64 samples of the 8x8 block */
    dc_res_16x8 = vdupq_n_s16((i4_dc_val + 32) >> 6);
    dc_res_16x8 = vmaxq_s16(vminq_s16(dc_res_16x8, vdupq_n_s16(RSD_MAX)), vdupq_n_s16(RSD_MIN));

    for(i4_row = 0; i4_row < 8; i4_row++)
    {
        vst1q_s16((int16_t *) (pi2_out + (i4_row * out_strd)), dc_res_16x8);
    }
}
237 
238 /*****************************************************************************/
239 /*                                                                           */
240 /*  Function Name : isvcd_iquant_itrans_chroma_4x4_neonintr                   */
241 /*                                                                           */
242 /*  Description   : this function computes the inverse quantized and         */
243 /*                   inverse transformed output                              */
244 /*                                                                           */
245 /*  Inputs        :                                                          */
246 /*  Globals       : none                                                     */
247 /*  Processing    :                                                          */
248 /*                                                                           */
249 /*  Outputs       : none                                                     */
250 /*  Returns       : none                                                     */
251 /*                                                                           */
252 /*  Issues        : none                                                     */
253 /*                                                                           */
254 /*  Revision History:                                                        */
255 /*                                                                           */
256 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
257 /*         25 11 2021   Kishore               creation                       */
258 /*                                                                           */
259 /*****************************************************************************/
260 
/* Inverse-quantizes a 4x4 chroma residual block, applies the H.264 4x4
 * inverse transform, clamps to [RSD_MIN, RSD_MAX] and merges the result
 * into the interleaved CbCr output buffer (only one plane's lanes are
 * written). The DC coefficient arrives pre-dequantized via pi2_dc_src. */
void isvcd_iquant_itrans_chroma_4x4_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                             const UWORD16 *pu2_iscal_mat,
                                             const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                             WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    int16x4x4_t src_16x4x2;
    int16x4x4_t iscal_16x4x2;
    int16x4x4_t weigh_16x4x2;

    int16x4_t q0_16x4, q1_16x4, q2_16x4, q3_16x4;
    int32x4_t q0_32x4, q1_32x4, q2_32x4, q3_32x4;
    int16x4_t rq1_16x4, rq3_16x4;
    int16x4_t x0_16x4, x1_16x4, x2_16x4, x3_16x4;
    int16x4_t xx0_16x4, xx1_16x4, xx2_16x4, xx3_16x4;
    int16x4_t xx0_0_16x4, xx0_1_16x4, xx2_0_16x4, xx2_1_16x4;
    int32x2_t x0_32x2, x1_32x2, x2_32x2, x3_32x2;
    int16x4_t weigh0_16x4, weigh1_16x4, weigh2_16x4, weigh3_16x4;
    int16x4_t zero_16x4 = vdup_n_s16(0);
    int16x4_t x_16x4_low, x_16x4_high;
    int16x8_t x0_16x8, x1_16x8, x2_16x8, x3_16x8;
    int16x4_t dup_max, dup_min;
    int16x4x2_t xx0_16x4_2, xx2_16x4_2, x_16x4x2_t;
    int32x2x2_t x0_32x2_2, x1_32x2_2;
    int16x8_t i4_out_horz_16x8_r0, i4_out_horz_16x8_r1, i4_out_horz_16x8_r2, i4_out_horz_16x8_r3;
    /* Mask for the even 16-bit lanes — the output interleaves the two chroma
       planes and only one plane is updated here */
    uint16x8_t chroma_mask_16x8 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000ffff));

    /* Rounding term for the dequant shift (only meaningful when qp/6 < 4) */
    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t rnd_fact = vdupq_n_s32(rnd_factor);
    int32x4_t qp_div_6_32x4 = vdupq_n_s32(u4_qp_div_6);
    UNUSED(pi2_tmp);

    dup_min = vdup_n_s16(RSD_MIN);
    dup_max = vdup_n_s16(RSD_MAX);

    /* vld4 de-interleaves: val[k] holds column k of the 4x4 block */
    src_16x4x2 = vld4_s16(pi2_src);
    iscal_16x4x2 = vld4_s16((const int16_t *) pu2_iscal_mat);
    weigh_16x4x2 = vld4_s16((const int16_t *) pu2_weigh_mat);

    /* Combined dequant scale = weighting matrix * inverse scaling matrix */
    weigh0_16x4 = vmul_s16(weigh_16x4x2.val[0], iscal_16x4x2.val[0]);
    weigh1_16x4 = vmul_s16(weigh_16x4x2.val[1], iscal_16x4x2.val[1]);
    weigh2_16x4 = vmul_s16(weigh_16x4x2.val[2], iscal_16x4x2.val[2]);
    weigh3_16x4 = vmul_s16(weigh_16x4x2.val[3], iscal_16x4x2.val[3]);

    q0_32x4 = vmull_s16(weigh0_16x4, src_16x4x2.val[0]);
    q1_32x4 = vmull_s16(weigh1_16x4, src_16x4x2.val[1]);
    q2_32x4 = vmull_s16(weigh2_16x4, src_16x4x2.val[2]);
    q3_32x4 = vmull_s16(weigh3_16x4, src_16x4x2.val[3]);

    q0_32x4 = vaddq_s32(q0_32x4, rnd_fact);
    q1_32x4 = vaddq_s32(q1_32x4, rnd_fact);
    q2_32x4 = vaddq_s32(q2_32x4, rnd_fact);
    q3_32x4 = vaddq_s32(q3_32x4, rnd_fact);

    /* Dequant: ((coeff * scale + rnd) << (qp/6)) >> 4, narrowed with
       saturation back to 16 bits */
    q0_32x4 = vshlq_s32(q0_32x4, qp_div_6_32x4);
    q1_32x4 = vshlq_s32(q1_32x4, qp_div_6_32x4);
    q2_32x4 = vshlq_s32(q2_32x4, qp_div_6_32x4);
    q3_32x4 = vshlq_s32(q3_32x4, qp_div_6_32x4);

    q0_16x4 = vqshrn_n_s32(q0_32x4, 4);
    q1_16x4 = vqshrn_n_s32(q1_32x4, 4);
    q2_16x4 = vqshrn_n_s32(q2_32x4, 4);
    q3_16x4 = vqshrn_n_s32(q3_32x4, 4);

    rq1_16x4 = vshr_n_s16(q1_16x4, 1);  // q1 >>1
    rq3_16x4 = vshr_n_s16(q3_16x4, 1);  // q3 >>1

    /* Replace lane 0 with the pre-dequantized DC supplied by the caller */
    q0_16x4 = vset_lane_s16(pi2_dc_src[0], q0_16x4, 0);

    /* Horizontal (row) inverse-transform butterfly */
    x0_16x4 = vadd_s16(q0_16x4, q2_16x4);   // x0 =  q0 + q2
    x1_16x4 = vsub_s16(q0_16x4, q2_16x4);   // x1 =  q0 - q2
    x2_16x4 = vsub_s16(rq1_16x4, q3_16x4);  // x2 =  q1>>1 - q3
    x3_16x4 = vadd_s16(q1_16x4, rq3_16x4);  // x2 =  q1 + q3>>1

    xx0_16x4 = vadd_s16(x0_16x4, x3_16x4);  // x0+x3
    xx1_16x4 = vadd_s16(x1_16x4, x2_16x4);  // x1+x2
    xx2_16x4 = vsub_s16(x1_16x4, x2_16x4);  // x1-x2
    xx3_16x4 = vsub_s16(x0_16x4, x3_16x4);  // x0-x3

    /* 4x4 transpose via 16-bit then 32-bit vtrn pairs */
    xx0_16x4_2 = vtrn_s16(xx0_16x4, xx1_16x4);
    xx0_0_16x4 = xx0_16x4_2.val[0];
    xx0_1_16x4 = xx0_16x4_2.val[1];
    xx2_16x4_2 = vtrn_s16(xx2_16x4, xx3_16x4);
    xx2_0_16x4 = xx2_16x4_2.val[0];
    xx2_1_16x4 = xx2_16x4_2.val[1];
    x0_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_0_16x4), vreinterpret_s32_s16(xx2_0_16x4));
    x1_32x2_2 = vtrn_s32(vreinterpret_s32_s16(xx0_1_16x4), vreinterpret_s32_s16(xx2_1_16x4));
    x0_32x2 = x0_32x2_2.val[0];
    x1_32x2 = x1_32x2_2.val[0];
    x2_32x2 = x0_32x2_2.val[1];
    x3_32x2 = x1_32x2_2.val[1];

    x0_16x4 = vreinterpret_s16_s32(x0_32x2);
    x1_16x4 = vreinterpret_s16_s32(x1_32x2);
    x2_16x4 = vreinterpret_s16_s32(x2_32x2);
    x3_16x4 = vreinterpret_s16_s32(x3_32x2);

    /* vertical inverse transform */
    rq1_16x4 = vshr_n_s16(x1_16x4, 1);       // q1 >> 1
    rq3_16x4 = vshr_n_s16(x3_16x4, 1);       // q3 >> 1

    xx0_16x4 = vadd_s16(x0_16x4, x2_16x4);   // x0 =  q0 + q2
    xx1_16x4 = vsub_s16(x0_16x4, x2_16x4);   // x1 =  q0 - q2
    xx2_16x4 = vsub_s16(rq1_16x4, x3_16x4);  // x2 =  q1>>1 - q3
    xx3_16x4 = vadd_s16(x1_16x4, rq3_16x4);  // x3 =  q1 + q3>>1

    x0_16x4 = vadd_s16(xx0_16x4, xx3_16x4);  // imacro = x0 + x3
    x1_16x4 = vadd_s16(xx1_16x4, xx2_16x4);  // imacro = x1 + x2
    x2_16x4 = vsub_s16(xx1_16x4, xx2_16x4);  // imacro = x1 - x2
    x3_16x4 = vsub_s16(xx0_16x4, xx3_16x4);  // imacro = x0 - x3

    /* Final rounding shift of the inverse transform: (x + 32) >> 6 */
    x0_16x4 = vrshr_n_s16(x0_16x4, 6);
    x1_16x4 = vrshr_n_s16(x1_16x4, 6);
    x2_16x4 = vrshr_n_s16(x2_16x4, 6);
    x3_16x4 = vrshr_n_s16(x3_16x4, 6);

    /* Clamp residuals to [RSD_MIN, RSD_MAX] */
    x0_16x4 = vmin_s16(x0_16x4, dup_max);
    x0_16x4 = vmax_s16(x0_16x4, dup_min);
    x1_16x4 = vmin_s16(x1_16x4, dup_max);
    x1_16x4 = vmax_s16(x1_16x4, dup_min);
    x2_16x4 = vmin_s16(x2_16x4, dup_max);
    x2_16x4 = vmax_s16(x2_16x4, dup_min);
    x3_16x4 = vmin_s16(x3_16x4, dup_max);
    x3_16x4 = vmax_s16(x3_16x4, dup_min);

    /* Zip with zero to spread the 4 samples into the even lanes of an
       8-lane vector, matching the interleaved chroma layout */
    x_16x4x2_t = vzip_s16(x0_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x0_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x1_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x1_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x2_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x2_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    x_16x4x2_t = vzip_s16(x3_16x4, zero_16x4);
    x_16x4_low = x_16x4x2_t.val[0];
    x_16x4_high = x_16x4x2_t.val[1];
    x3_16x8 = vcombine_s16(x_16x4_low, x_16x4_high);

    /* Read-modify-write: keep the other chroma plane's lanes intact */
    i4_out_horz_16x8_r0 = vld1q_s16(pi2_out);
    i4_out_horz_16x8_r1 = vld1q_s16(pi2_out + out_strd);
    i4_out_horz_16x8_r2 = vld1q_s16(pi2_out + out_strd * 2);
    i4_out_horz_16x8_r3 = vld1q_s16(pi2_out + out_strd * 3);

    i4_out_horz_16x8_r0 = vbslq_s16(chroma_mask_16x8, x0_16x8, i4_out_horz_16x8_r0);
    i4_out_horz_16x8_r1 = vbslq_s16(chroma_mask_16x8, x1_16x8, i4_out_horz_16x8_r1);
    i4_out_horz_16x8_r2 = vbslq_s16(chroma_mask_16x8, x2_16x8, i4_out_horz_16x8_r2);
    i4_out_horz_16x8_r3 = vbslq_s16(chroma_mask_16x8, x3_16x8, i4_out_horz_16x8_r3);

    vst1q_s16((int16_t *) (pi2_out), i4_out_horz_16x8_r0);
    vst1q_s16((int16_t *) (pi2_out + out_strd), i4_out_horz_16x8_r1);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 2), i4_out_horz_16x8_r2);
    vst1q_s16((int16_t *) (pi2_out + out_strd * 3), i4_out_horz_16x8_r3);
}
420 
421 /*****************************************************************************/
422 /*                                                                           */
423 /*  Function Name : isvcd_iquant_itrans_8x8_neonintr                          */
424 /*                                                                           */
425 /*  Description   : this function computes the inverse quantized and         */
426 /*                   inverse transformed output                              */
427 /*                                                                           */
428 /*  Inputs        :                                                          */
429 /*  Globals       : none                                                     */
430 /*  Processing    :                                                          */
431 /*                                                                           */
432 /*  Outputs       : none                                                     */
433 /*  Returns       : none                                                     */
434 /*                                                                           */
435 /*  Issues        : none                                                     */
436 /*                                                                           */
437 /*  Revision History:                                                        */
438 /*                                                                           */
439 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
440 /*         25 11 2021   Kishore               creation                       */
441 /*                                                                           */
442 /*****************************************************************************/
443 
isvcd_iquant_itrans_8x8_neonintr(WORD16 * pi2_src,WORD16 * pi2_out,WORD32 out_strd,const UWORD16 * pu2_iscale_mat,const UWORD16 * pu2_weigh_mat,UWORD32 qp_div,WORD16 * pi2_tmp,WORD32 iq_start_idx,WORD16 * pi2_dc_ld_addr)444 void isvcd_iquant_itrans_8x8_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
445                                       const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
446                                       UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx,
447                                       WORD16 *pi2_dc_ld_addr)
448 {
449     int16x8_t iscal_16x8_0, iscal_16x8_1, iscal_16x8_2, iscal_16x8_3, iscal_16x8_4, iscal_16x8_5,
450         iscal_16x8_6, iscal_16x8_7;
451 
452     int16x8_t weigh_16x8_0, weigh_16x8_1, weigh_16x8_2, weigh_16x8_3, weigh_16x8_4, weigh_16x8_5,
453         weigh_16x8_6, weigh_16x8_7;
454 
455     int16x8_t src_16x8_0, src_16x8_1, src_16x8_2, src_16x8_3, src_16x8_4, src_16x8_5, src_16x8_6,
456         src_16x8_7;
457     int16x8_t coeff_mul_16x8_0, coeff_mul_16x8_1, coeff_mul_16x8_2, coeff_mul_16x8_3,
458         coeff_mul_16x8_4, coeff_mul_16x8_5, coeff_mul_16x8_6, coeff_mul_16x8_7;
459 
460     int32x4_t quant_res_32x4_l_0, quant_res_32x4_l_1, quant_res_32x4_l_2, quant_res_32x4_l_3,
461         quant_res_32x4_l_4, quant_res_32x4_l_5, quant_res_32x4_l_6, quant_res_32x4_l_7;
462     int32x4_t quant_res_32x4_h_0, quant_res_32x4_h_1, quant_res_32x4_h_2, quant_res_32x4_h_3,
463         quant_res_32x4_h_4, quant_res_32x4_h_5, quant_res_32x4_h_6, quant_res_32x4_h_7;
464     int16x4_t quant_res_16x4_l_0, quant_res_16x4_l_1, quant_res_16x4_l_2, quant_res_16x4_l_3,
465         quant_res_16x4_l_4, quant_res_16x4_l_5, quant_res_16x4_l_6, quant_res_16x4_l_7;
466     int16x4_t quant_res_16x4_h_0, quant_res_16x4_h_1, quant_res_16x4_h_2, quant_res_16x4_h_3,
467         quant_res_16x4_h_4, quant_res_16x4_h_5, quant_res_16x4_h_6, quant_res_16x4_h_7;
468 
469     int16x8_t quant_res_16x8_0, quant_res_16x8_1, quant_res_16x8_2, quant_res_16x8_3,
470         quant_res_16x8_4, quant_res_16x8_5, quant_res_16x8_6, quant_res_16x8_7;
471 
472     int16x8_t trans_16x8_0, trans_16x8_1, trans_16x8_2, trans_16x8_3, trans_16x8_4, trans_16x8_5,
473         trans_16x8_6, trans_16x8_7;
474     int32x4_t trans_32x4_0, trans_32x4_1, trans_32x4_2, trans_32x4_3, trans_32x4_4, trans_32x4_5,
475         trans_32x4_6, trans_32x4_7;
476     int64x2_t trans_64x2_0, trans_64x2_1, trans_64x2_2, trans_64x2_3, trans_64x2_4, trans_64x2_5,
477         trans_64x2_6, trans_64x2_7;
478     int16x4_t trans_16x4_1_l, trans_16x4_3_l, trans_16x4_5_l, trans_16x4_7_l;
479     int16x8_t rs_trans_16x8_1, rs_trans_16x8_2, rs_trans_16x8_3, rs_trans_16x8_5, rs_trans_16x8_6,
480         rs_trans_16x8_7;
481     int32x4_t sub_3_5_l, sub_3_5_h;
482     int32x4_t add_3_5_l, add_3_5_h;
483     int32x4_t sub_1_7_l, sub_1_7_h;
484     int32x4_t add_1_7_l, add_1_7_h;
485     int32x4_t sub_357_l, sub_357_h;
486     int32x4_t add_351_l, add_351_h;
487     int32x4_t add_175_l, add_175_h;
488     int32x4_t sub_173_l, sub_173_h;
489     int32x4_t y1_32x4_l, y1_32x4_h;
490     int32x4_t y3_32x4_l, y3_32x4_h;
491     int32x4_t y5_32x4_l, y5_32x4_h;
492     int32x4_t y7_32x4_l, y7_32x4_h;
493     int16x4_t y1_16x4_l, y3_16x4_l, y5_16x4_l, y7_16x4_l;
494     int16x4_t y1_16x4_h, y3_16x4_h, y5_16x4_h, y7_16x4_h;
495 
496     int16x8_t y0_16x8, y1_16x8, y2_16x8, y3_16x8, y4_16x8, y5_16x8, y6_16x8, y7_16x8;
497     int16x8_t rs_y1_16x8, rs_y3_16x8, rs_y5_16x8, rs_y7_16x8;
498     int16x8_t z0_16x8, z1_16x8, z2_16x8, z3_16x8, z4_16x8, z5_16x8, z6_16x8, z7_16x8;
499     int16x8_t dup_max, dup_min;
500     int32x4_t qp_div_32x4 = vdupq_n_s32(qp_div);
501     int16x8x2_t trans_16x8_0_1, trans_16x8_2_3, trans_16x8_4_5, trans_16x8_6_7;
502     int32x4x2_t trans_32x4_0_2, trans_32x4_1_3, trans_32x4_4_6, trans_32x4_5_7;
503     WORD32 i;
504 
505     UNUSED(pi2_tmp);
506     UNUSED(iq_start_idx);
507     UNUSED(pi2_dc_ld_addr);
508 
509     dup_min = vdupq_n_s16(RSD_MIN);
510     dup_max = vdupq_n_s16(RSD_MAX);
511 
512     iscal_16x8_0 = vld1q_s16((const int16_t *) pu2_iscale_mat);
513     iscal_16x8_1 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 8));
514     iscal_16x8_2 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 16));
515     iscal_16x8_3 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 24));
516     iscal_16x8_4 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 32));
517     iscal_16x8_5 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 40));
518     iscal_16x8_6 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 48));
519     iscal_16x8_7 = vld1q_s16((const int16_t *) (pu2_iscale_mat + 56));
520 
521     weigh_16x8_0 = vld1q_s16((const int16_t *) pu2_weigh_mat);
522     weigh_16x8_1 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 8));
523     weigh_16x8_2 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 16));
524     weigh_16x8_3 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 24));
525     weigh_16x8_4 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 32));
526     weigh_16x8_5 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 40));
527     weigh_16x8_6 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 48));
528     weigh_16x8_7 = vld1q_s16((const int16_t *) (pu2_weigh_mat + 56));
529 
530     src_16x8_0 = vld1q_s16((const int16_t *) pi2_src);        // a0 a1 a2 a3 a4 a5 a6 a7
531     src_16x8_1 = vld1q_s16((const int16_t *) (pi2_src + 8));  // b0 b1 b2 b3 b4 b5 b6 b7
532     src_16x8_2 = vld1q_s16((const int16_t *) (pi2_src + 16));
533     src_16x8_3 = vld1q_s16((const int16_t *) (pi2_src + 24));
534     src_16x8_4 = vld1q_s16((const int16_t *) (pi2_src + 32));
535     src_16x8_5 = vld1q_s16((const int16_t *) (pi2_src + 40));
536     src_16x8_6 = vld1q_s16((const int16_t *) (pi2_src + 48));
537     src_16x8_7 = vld1q_s16((const int16_t *) (pi2_src + 56));
538 
539     coeff_mul_16x8_0 = vmulq_s16(iscal_16x8_0, weigh_16x8_0);
540     coeff_mul_16x8_1 = vmulq_s16(iscal_16x8_1, weigh_16x8_1);
541     coeff_mul_16x8_2 = vmulq_s16(iscal_16x8_2, weigh_16x8_2);
542     coeff_mul_16x8_3 = vmulq_s16(iscal_16x8_3, weigh_16x8_3);
543     coeff_mul_16x8_4 = vmulq_s16(iscal_16x8_4, weigh_16x8_4);
544     coeff_mul_16x8_5 = vmulq_s16(iscal_16x8_5, weigh_16x8_5);
545     coeff_mul_16x8_6 = vmulq_s16(iscal_16x8_6, weigh_16x8_6);
546     coeff_mul_16x8_7 = vmulq_s16(iscal_16x8_7, weigh_16x8_7);
547 
548     quant_res_32x4_l_0 = vmull_s16(vget_low_s16(coeff_mul_16x8_0), vget_low_s16(src_16x8_0));
549     quant_res_32x4_l_1 = vmull_s16(vget_low_s16(coeff_mul_16x8_1), vget_low_s16(src_16x8_1));
550     quant_res_32x4_l_2 = vmull_s16(vget_low_s16(coeff_mul_16x8_2), vget_low_s16(src_16x8_2));
551     quant_res_32x4_l_3 = vmull_s16(vget_low_s16(coeff_mul_16x8_3), vget_low_s16(src_16x8_3));
552     quant_res_32x4_l_4 = vmull_s16(vget_low_s16(coeff_mul_16x8_4), vget_low_s16(src_16x8_4));
553     quant_res_32x4_l_5 = vmull_s16(vget_low_s16(coeff_mul_16x8_5), vget_low_s16(src_16x8_5));
554     quant_res_32x4_l_6 = vmull_s16(vget_low_s16(coeff_mul_16x8_6), vget_low_s16(src_16x8_6));
555     quant_res_32x4_l_7 = vmull_s16(vget_low_s16(coeff_mul_16x8_7), vget_low_s16(src_16x8_7));
556 
557     quant_res_32x4_h_0 = vmull_s16(vget_high_s16(coeff_mul_16x8_0), vget_high_s16(src_16x8_0));
558     quant_res_32x4_h_1 = vmull_s16(vget_high_s16(coeff_mul_16x8_1), vget_high_s16(src_16x8_1));
559     quant_res_32x4_h_2 = vmull_s16(vget_high_s16(coeff_mul_16x8_2), vget_high_s16(src_16x8_2));
560     quant_res_32x4_h_3 = vmull_s16(vget_high_s16(coeff_mul_16x8_3), vget_high_s16(src_16x8_3));
561     quant_res_32x4_h_4 = vmull_s16(vget_high_s16(coeff_mul_16x8_4), vget_high_s16(src_16x8_4));
562     quant_res_32x4_h_5 = vmull_s16(vget_high_s16(coeff_mul_16x8_5), vget_high_s16(src_16x8_5));
563     quant_res_32x4_h_6 = vmull_s16(vget_high_s16(coeff_mul_16x8_6), vget_high_s16(src_16x8_6));
564     quant_res_32x4_h_7 = vmull_s16(vget_high_s16(coeff_mul_16x8_7), vget_high_s16(src_16x8_7));
565 
566     quant_res_32x4_l_0 = vshlq_s32(quant_res_32x4_l_0, qp_div_32x4);
567     quant_res_32x4_l_1 = vshlq_s32(quant_res_32x4_l_1, qp_div_32x4);
568     quant_res_32x4_l_2 = vshlq_s32(quant_res_32x4_l_2, qp_div_32x4);
569     quant_res_32x4_l_3 = vshlq_s32(quant_res_32x4_l_3, qp_div_32x4);
570     quant_res_32x4_l_4 = vshlq_s32(quant_res_32x4_l_4, qp_div_32x4);
571     quant_res_32x4_l_5 = vshlq_s32(quant_res_32x4_l_5, qp_div_32x4);
572     quant_res_32x4_l_6 = vshlq_s32(quant_res_32x4_l_6, qp_div_32x4);
573     quant_res_32x4_l_7 = vshlq_s32(quant_res_32x4_l_7, qp_div_32x4);
574 
575     quant_res_32x4_h_0 = vshlq_s32(quant_res_32x4_h_0, qp_div_32x4);
576     quant_res_32x4_h_1 = vshlq_s32(quant_res_32x4_h_1, qp_div_32x4);
577     quant_res_32x4_h_2 = vshlq_s32(quant_res_32x4_h_2, qp_div_32x4);
578     quant_res_32x4_h_3 = vshlq_s32(quant_res_32x4_h_3, qp_div_32x4);
579     quant_res_32x4_h_4 = vshlq_s32(quant_res_32x4_h_4, qp_div_32x4);
580     quant_res_32x4_h_5 = vshlq_s32(quant_res_32x4_h_5, qp_div_32x4);
581     quant_res_32x4_h_6 = vshlq_s32(quant_res_32x4_h_6, qp_div_32x4);
582     quant_res_32x4_h_7 = vshlq_s32(quant_res_32x4_h_7, qp_div_32x4);
583 
584     quant_res_16x4_l_0 = vqrshrn_n_s32(quant_res_32x4_l_0, 6);
585     quant_res_16x4_l_1 = vqrshrn_n_s32(quant_res_32x4_l_1, 6);
586     quant_res_16x4_l_2 = vqrshrn_n_s32(quant_res_32x4_l_2, 6);
587     quant_res_16x4_l_3 = vqrshrn_n_s32(quant_res_32x4_l_3, 6);
588     quant_res_16x4_l_4 = vqrshrn_n_s32(quant_res_32x4_l_4, 6);
589     quant_res_16x4_l_5 = vqrshrn_n_s32(quant_res_32x4_l_5, 6);
590     quant_res_16x4_l_6 = vqrshrn_n_s32(quant_res_32x4_l_6, 6);
591     quant_res_16x4_l_7 = vqrshrn_n_s32(quant_res_32x4_l_7, 6);
592 
593     quant_res_16x4_h_0 = vqrshrn_n_s32(quant_res_32x4_h_0, 6);
594     quant_res_16x4_h_1 = vqrshrn_n_s32(quant_res_32x4_h_1, 6);
595     quant_res_16x4_h_2 = vqrshrn_n_s32(quant_res_32x4_h_2, 6);
596     quant_res_16x4_h_3 = vqrshrn_n_s32(quant_res_32x4_h_3, 6);
597     quant_res_16x4_h_4 = vqrshrn_n_s32(quant_res_32x4_h_4, 6);
598     quant_res_16x4_h_5 = vqrshrn_n_s32(quant_res_32x4_h_5, 6);
599     quant_res_16x4_h_6 = vqrshrn_n_s32(quant_res_32x4_h_6, 6);
600     quant_res_16x4_h_7 = vqrshrn_n_s32(quant_res_32x4_h_7, 6);
601 
602     quant_res_16x8_0 = vcombine_s16(quant_res_16x4_l_0, quant_res_16x4_h_0);
603     quant_res_16x8_1 = vcombine_s16(quant_res_16x4_l_1, quant_res_16x4_h_1);
604     quant_res_16x8_2 = vcombine_s16(quant_res_16x4_l_2, quant_res_16x4_h_2);
605     quant_res_16x8_3 = vcombine_s16(quant_res_16x4_l_3, quant_res_16x4_h_3);
606     quant_res_16x8_4 = vcombine_s16(quant_res_16x4_l_4, quant_res_16x4_h_4);
607     quant_res_16x8_5 = vcombine_s16(quant_res_16x4_l_5, quant_res_16x4_h_5);
608     quant_res_16x8_6 = vcombine_s16(quant_res_16x4_l_6, quant_res_16x4_h_6);
609     quant_res_16x8_7 = vcombine_s16(quant_res_16x4_l_7, quant_res_16x4_h_7);
610 
611     for(i = 0; i < 2; i++)
612     {
613         trans_16x8_0_1 = vtrnq_s16(quant_res_16x8_0, quant_res_16x8_1);
614         trans_16x8_0 = trans_16x8_0_1.val[0];
615         trans_16x8_1 = trans_16x8_0_1.val[1];
616 
617         trans_16x8_2_3 = vtrnq_s16(quant_res_16x8_2, quant_res_16x8_3);
618         trans_16x8_2 = trans_16x8_2_3.val[0];
619         trans_16x8_3 = trans_16x8_2_3.val[1];
620 
621         trans_16x8_4_5 = vtrnq_s16(quant_res_16x8_4, quant_res_16x8_5);
622         trans_16x8_4 = trans_16x8_4_5.val[0];
623         trans_16x8_5 = trans_16x8_4_5.val[1];
624 
625         trans_16x8_6_7 = vtrnq_s16(quant_res_16x8_6, quant_res_16x8_7);
626         trans_16x8_6 = trans_16x8_6_7.val[0];
627         trans_16x8_7 = trans_16x8_6_7.val[1];
628 
629         trans_32x4_0_2 =
630             vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_0), vreinterpretq_s32_s16(trans_16x8_2));
631         trans_32x4_0 = trans_32x4_0_2.val[0];
632         trans_32x4_2 = trans_32x4_0_2.val[1];
633 
634         trans_32x4_1_3 =
635             vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_1), vreinterpretq_s32_s16(trans_16x8_3));
636         trans_32x4_1 = trans_32x4_1_3.val[0];
637         trans_32x4_3 = trans_32x4_1_3.val[1];
638 
639         trans_32x4_4_6 =
640             vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_4), vreinterpretq_s32_s16(trans_16x8_6));
641         trans_32x4_4 = trans_32x4_4_6.val[0];
642         trans_32x4_6 = trans_32x4_4_6.val[1];
643 
644         trans_32x4_5_7 =
645             vtrnq_s32(vreinterpretq_s32_s16(trans_16x8_5), vreinterpretq_s32_s16(trans_16x8_7));
646         trans_32x4_5 = trans_32x4_5_7.val[0];
647         trans_32x4_7 = trans_32x4_5_7.val[1];
648 
649         trans_64x2_0 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_0)),
650                                     vreinterpret_s64_s32(vget_low_s32(trans_32x4_4)));
651         trans_64x2_4 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_0)),
652                                     vreinterpret_s64_s32(vget_high_s32(trans_32x4_4)));
653 
654         trans_64x2_1 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_1)),
655                                     vreinterpret_s64_s32(vget_low_s32(trans_32x4_5)));
656         trans_64x2_5 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_1)),
657                                     vreinterpret_s64_s32(vget_high_s32(trans_32x4_5)));
658 
659         trans_64x2_2 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_2)),
660                                     vreinterpret_s64_s32(vget_low_s32(trans_32x4_6)));
661         trans_64x2_6 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_2)),
662                                     vreinterpret_s64_s32(vget_high_s32(trans_32x4_6)));
663 
664         trans_64x2_3 = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(trans_32x4_3)),
665                                     vreinterpret_s64_s32(vget_low_s32(trans_32x4_7)));
666         trans_64x2_7 = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(trans_32x4_3)),
667                                     vreinterpret_s64_s32(vget_high_s32(trans_32x4_7)));
668 
669         trans_16x8_0 = vreinterpretq_s16_s64(trans_64x2_0);
670         trans_16x8_1 = vreinterpretq_s16_s64(trans_64x2_1);
671         trans_16x8_2 = vreinterpretq_s16_s64(trans_64x2_2);
672         trans_16x8_3 = vreinterpretq_s16_s64(trans_64x2_3);
673         trans_16x8_4 = vreinterpretq_s16_s64(trans_64x2_4);
674         trans_16x8_5 = vreinterpretq_s16_s64(trans_64x2_5);
675         trans_16x8_6 = vreinterpretq_s16_s64(trans_64x2_6);
676         trans_16x8_7 = vreinterpretq_s16_s64(trans_64x2_7);
677 
678         rs_trans_16x8_1 = vshrq_n_s16(trans_16x8_1, 1);
679         rs_trans_16x8_2 = vshrq_n_s16(trans_16x8_2, 1);
680         rs_trans_16x8_3 = vshrq_n_s16(trans_16x8_3, 1);
681         rs_trans_16x8_5 = vshrq_n_s16(trans_16x8_5, 1);
682         rs_trans_16x8_6 = vshrq_n_s16(trans_16x8_6, 1);
683         rs_trans_16x8_7 = vshrq_n_s16(trans_16x8_7, 1);
684 
685         y0_16x8 = vaddq_s16(trans_16x8_0, trans_16x8_4);
686         y2_16x8 = vsubq_s16(trans_16x8_0, trans_16x8_4);
687         y4_16x8 = vsubq_s16(rs_trans_16x8_2, trans_16x8_6);
688         y6_16x8 = vaddq_s16(trans_16x8_2, rs_trans_16x8_6);
689 
690         trans_16x4_3_l = vget_low_s16(trans_16x8_3);
691         trans_16x4_5_l = vget_low_s16(trans_16x8_5);
692 
693         //-w3 + w5
694         sub_3_5_l = vsubl_s16(vget_low_s16(trans_16x8_5), vget_low_s16(trans_16x8_3));
695         sub_3_5_h = vsubl_s16(vget_high_s16(trans_16x8_5), vget_high_s16(trans_16x8_3));
696 
697         // w3 + w5
698         add_3_5_l = vaddl_s16(trans_16x4_3_l, trans_16x4_5_l);
699         add_3_5_h = vaddl_s16(vget_high_s16(trans_16x8_3), vget_high_s16(trans_16x8_5));
700 
701         trans_16x4_1_l = vget_low_s16(trans_16x8_1);
702         trans_16x4_7_l = vget_low_s16(trans_16x8_7);
703 
704         //-w1 + w7
705         sub_1_7_l = vsubl_s16(trans_16x4_7_l, trans_16x4_1_l);
706         sub_1_7_h = vsubl_s16(vget_high_s16(trans_16x8_7), vget_high_s16(trans_16x8_1));
707 
708         // w1 + w7
709         add_1_7_l = vaddl_s16(trans_16x4_1_l, trans_16x4_7_l);
710         add_1_7_h = vaddl_s16(vget_high_s16(trans_16x8_1), vget_high_s16(trans_16x8_7));
711 
712         //-w3 + w5 - w7
713         sub_357_l = vsubw_s16(sub_3_5_l, trans_16x4_7_l);
714         sub_357_h = vsubw_s16(sub_3_5_h, vget_high_s16(trans_16x8_7));
715 
716         // w3 + w5 + w1
717         add_351_l = vaddw_s16(add_3_5_l, trans_16x4_1_l);
718         add_351_h = vaddw_s16(add_3_5_h, vget_high_s16(trans_16x8_1));
719 
720         //-w1 + w7 + w5
721         add_175_l = vaddw_s16(sub_1_7_l, trans_16x4_5_l);
722         add_175_h = vaddw_s16(sub_1_7_h, vget_high_s16(trans_16x8_5));
723 
724         // w1 + w7 - w3
725         sub_173_l = vsubw_s16(add_1_7_l, trans_16x4_3_l);
726         sub_173_h = vsubw_s16(add_1_7_h, vget_high_s16(trans_16x8_3));
727 
728         //-w3 + w5 - w7 - (w7 >> 1)
729         y1_32x4_l = vsubw_s16(sub_357_l, vget_low_s16(rs_trans_16x8_7));
730         y1_32x4_h = vsubw_s16(sub_357_h, vget_high_s16(rs_trans_16x8_7));
731 
732         // w1 + w7 - w3 - (w3 >> 1)
733         y3_32x4_l = vsubw_s16(sub_173_l, vget_low_s16(rs_trans_16x8_3));
734         y3_32x4_h = vsubw_s16(sub_173_h, vget_high_s16(rs_trans_16x8_3));
735 
736         //-w1 + w7 + w5 + (w5 >> 1)
737         y5_32x4_l = vaddw_s16(add_175_l, vget_low_s16(rs_trans_16x8_5));
738         y5_32x4_h = vaddw_s16(add_175_h, vget_high_s16(rs_trans_16x8_5));
739 
740         // w3 + w5 + w1 + (w1 >> 1)
741         y7_32x4_l = vaddw_s16(add_351_l, vget_low_s16(rs_trans_16x8_1));
742         y7_32x4_h = vaddw_s16(add_351_h, vget_high_s16(rs_trans_16x8_1));
743 
744         y1_16x4_l = vmovn_s32(y1_32x4_l);
745         y1_16x4_h = vmovn_s32(y1_32x4_h);
746         y1_16x8 = vcombine_s16(y1_16x4_l, y1_16x4_h);
747         y3_16x4_l = vmovn_s32(y3_32x4_l);
748         y3_16x4_h = vmovn_s32(y3_32x4_h);
749         y3_16x8 = vcombine_s16(y3_16x4_l, y3_16x4_h);
750         y5_16x4_l = vmovn_s32(y5_32x4_l);
751         y5_16x4_h = vmovn_s32(y5_32x4_h);
752         y5_16x8 = vcombine_s16(y5_16x4_l, y5_16x4_h);
753         y7_16x4_l = vmovn_s32(y7_32x4_l);
754         y7_16x4_h = vmovn_s32(y7_32x4_h);
755         y7_16x8 = vcombine_s16(y7_16x4_l, y7_16x4_h);
756 
757         rs_y1_16x8 = vshrq_n_s16(y1_16x8, 2);
758         rs_y3_16x8 = vshrq_n_s16(y3_16x8, 2);
759         rs_y5_16x8 = vshrq_n_s16(y5_16x8, 2);
760         rs_y7_16x8 = vshrq_n_s16(y7_16x8, 2);
761 
762         z0_16x8 = vaddq_s16(y0_16x8, y6_16x8);           // z0 = y0 + y6
763         z1_16x8 = vaddq_s16(y1_16x8, rs_y7_16x8);        // z1 = y1 + (y7 >> 2)
764         z2_16x8 = vaddq_s16(y2_16x8, y4_16x8);           // z2 = y2 + y4
765         z3_16x8 = vaddq_s16(y3_16x8, rs_y5_16x8);        // z3 = y3 + (y5 >> 2)
766         z4_16x8 = vsubq_s16(y2_16x8, y4_16x8);           // z4 = y2 - y4
767         z5_16x8 = vsubq_s16(rs_y3_16x8, y5_16x8);        // z5 = (y3 >> 2) - y5
768         z6_16x8 = vsubq_s16(y0_16x8, y6_16x8);           // z6 = y0 - y6
769         z7_16x8 = vsubq_s16(y7_16x8, rs_y1_16x8);        // z7 = y7 - (y1 >> 2)
770 
771         quant_res_16x8_0 = vaddq_s16(z0_16x8, z7_16x8);  // x0 = z0 + z7
772         quant_res_16x8_1 = vaddq_s16(z2_16x8, z5_16x8);  // x1 = z2 + z5
773         quant_res_16x8_2 = vaddq_s16(z4_16x8, z3_16x8);  // x2 = z4 + z3
774         quant_res_16x8_3 = vaddq_s16(z6_16x8, z1_16x8);  // x3 = z6 + z1
775         quant_res_16x8_4 = vsubq_s16(z6_16x8, z1_16x8);  // x4 = z6 - z1
776         quant_res_16x8_5 = vsubq_s16(z4_16x8, z3_16x8);  // x5 = z4 - z3
777         quant_res_16x8_6 = vsubq_s16(z2_16x8, z5_16x8);  // x6 = z2 - z5
778         quant_res_16x8_7 = vsubq_s16(z0_16x8, z7_16x8);  // x7 = z0 - z7
779     }
780 
781     quant_res_16x8_0 = vrshrq_n_s16(quant_res_16x8_0, 6);
782     quant_res_16x8_1 = vrshrq_n_s16(quant_res_16x8_1, 6);
783     quant_res_16x8_2 = vrshrq_n_s16(quant_res_16x8_2, 6);
784     quant_res_16x8_3 = vrshrq_n_s16(quant_res_16x8_3, 6);
785     quant_res_16x8_4 = vrshrq_n_s16(quant_res_16x8_4, 6);
786     quant_res_16x8_5 = vrshrq_n_s16(quant_res_16x8_5, 6);
787     quant_res_16x8_6 = vrshrq_n_s16(quant_res_16x8_6, 6);
788     quant_res_16x8_7 = vrshrq_n_s16(quant_res_16x8_7, 6);
789 
790     quant_res_16x8_0 = vminq_s16(quant_res_16x8_0, dup_max);
791     quant_res_16x8_0 = vmaxq_s16(quant_res_16x8_0, dup_min);
792     quant_res_16x8_1 = vminq_s16(quant_res_16x8_1, dup_max);
793     quant_res_16x8_1 = vmaxq_s16(quant_res_16x8_1, dup_min);
794     quant_res_16x8_2 = vminq_s16(quant_res_16x8_2, dup_max);
795     quant_res_16x8_2 = vmaxq_s16(quant_res_16x8_2, dup_min);
796     quant_res_16x8_3 = vminq_s16(quant_res_16x8_3, dup_max);
797     quant_res_16x8_3 = vmaxq_s16(quant_res_16x8_3, dup_min);
798     quant_res_16x8_4 = vminq_s16(quant_res_16x8_4, dup_max);
799     quant_res_16x8_4 = vmaxq_s16(quant_res_16x8_4, dup_min);
800 
801     vst1q_s16(pi2_out, quant_res_16x8_0);
802     vst1q_s16(pi2_out + out_strd, quant_res_16x8_1);
803     vst1q_s16(pi2_out + out_strd * 2, quant_res_16x8_2);
804     vst1q_s16(pi2_out + out_strd * 3, quant_res_16x8_3);
805     vst1q_s16(pi2_out + out_strd * 4, quant_res_16x8_4);
806     vst1q_s16(pi2_out + out_strd * 5, quant_res_16x8_5);
807     vst1q_s16(pi2_out + out_strd * 6, quant_res_16x8_6);
808     vst1q_s16(pi2_out + out_strd * 7, quant_res_16x8_7);
809 }
810 
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_4x4_neonintr                         */
/*                                                                           */
/*  Description   : this function computes the inverse quantized and         */
/*                  inverse transformed output (clamped residual samples)    */
/*                                                                           */
/*  Inputs        : pi2_src        - quantized 4x4 coefficient block         */
/*                  pi2_out        - output residual buffer                  */
/*                  out_strd       - stride (in WORD16s) of the output       */
/*                  pu2_iscal_mat  - inverse scaling matrix                  */
/*                  pu2_weigh_mat  - weighting (scaling list) matrix         */
/*                  u4_qp_div_6    - quantization parameter divided by 6     */
/*                  pi2_tmp        - scratch buffer (unused)                 */
/*                  iq_start_idx   - when 1, the DC coefficient is taken     */
/*                                   from pi2_dc_ld_addr instead of being    */
/*                                   dequantized here                        */
/*                  pi2_dc_ld_addr - pre-dequantized DC coefficient          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/
833 
void isvcd_iquant_itrans_4x4_neonintr(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                      const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                      UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                      WORD16 *pi2_dc_ld_addr)
{
    /* vld4 de-interleaves the row-major 4x4 block: val[k] gathers every 4th
     * element starting at k, i.e. one column of the block per vector */
    int16x4x4_t coef_cols, iscal_cols, wmat_cols;

    int16x4_t scale_c0, scale_c1, scale_c2, scale_c3;
    int32x4_t acc_c0, acc_c1, acc_c2, acc_c3;
    int16x4_t deq_c0, deq_c1, deq_c2, deq_c3;
    int16x4_t half_1, half_3;
    int16x4_t t0, t1, t2, t3;
    int16x4_t s0, s1, s2, s3;
    int16x4_t row0, row1, row2, row3;
    int16x4x2_t trn_01, trn_23;
    int32x2x2_t trn_even, trn_odd;

    int32x4_t lshift_qp = vdupq_n_s32(u4_qp_div_6);
    int16x4_t clip_min, clip_max;

    /* rounding term of the dequant (compensates the >> 4) — only needed
     * while the left shift by qp/6 is smaller than 4 */
    WORD16 rnd_factor = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    int32x4_t round_vec = vdupq_n_s32(rnd_factor);
    UNUSED(pi2_tmp);

    clip_min = vdup_n_s16(RSD_MIN);
    clip_max = vdup_n_s16(RSD_MAX);

    coef_cols = vld4_s16(pi2_src);
    iscal_cols = vld4_s16((const int16_t *) pu2_iscal_mat);
    wmat_cols = vld4_s16((const int16_t *) pu2_weigh_mat);

    /* fold the inverse-scale and weighting matrices into one multiplier */
    scale_c0 = vmul_s16(wmat_cols.val[0], iscal_cols.val[0]);
    scale_c1 = vmul_s16(wmat_cols.val[1], iscal_cols.val[1]);
    scale_c2 = vmul_s16(wmat_cols.val[2], iscal_cols.val[2]);
    scale_c3 = vmul_s16(wmat_cols.val[3], iscal_cols.val[3]);

    /* dequant: ((coef * scale + rnd) << (qp / 6)) >> 4, saturating narrow */
    acc_c0 = vmull_s16(scale_c0, coef_cols.val[0]);
    acc_c1 = vmull_s16(scale_c1, coef_cols.val[1]);
    acc_c2 = vmull_s16(scale_c2, coef_cols.val[2]);
    acc_c3 = vmull_s16(scale_c3, coef_cols.val[3]);

    acc_c0 = vaddq_s32(acc_c0, round_vec);
    acc_c1 = vaddq_s32(acc_c1, round_vec);
    acc_c2 = vaddq_s32(acc_c2, round_vec);
    acc_c3 = vaddq_s32(acc_c3, round_vec);

    acc_c0 = vshlq_s32(acc_c0, lshift_qp);
    acc_c1 = vshlq_s32(acc_c1, lshift_qp);
    acc_c2 = vshlq_s32(acc_c2, lshift_qp);
    acc_c3 = vshlq_s32(acc_c3, lshift_qp);

    deq_c0 = vqshrn_n_s32(acc_c0, 4);
    deq_c1 = vqshrn_n_s32(acc_c1, 4);
    deq_c2 = vqshrn_n_s32(acc_c2, 4);
    deq_c3 = vqshrn_n_s32(acc_c3, 4);

    if(iq_start_idx == 1)
    {
        /* DC coefficient already dequantized elsewhere; drop it in at lane 0
         * (lane 0 of column 0 is element (0,0) of the block) */
        deq_c0 = vset_lane_s16(pi2_dc_ld_addr[0], deq_c0, 0);
    }

    /* horizontal 1-D inverse transform; every vector holds one column, so a
     * single butterfly pass covers all four rows at once */
    half_1 = vshr_n_s16(deq_c1, 1);             // q1 >> 1
    half_3 = vshr_n_s16(deq_c3, 1);             // q3 >> 1

    t0 = vadd_s16(deq_c0, deq_c2);              // x0 = q0 + q2
    t1 = vsub_s16(deq_c0, deq_c2);              // x1 = q0 - q2
    t2 = vsub_s16(half_1, deq_c3);              // x2 = (q1 >> 1) - q3
    t3 = vadd_s16(deq_c1, half_3);              // x3 = q1 + (q3 >> 1)

    s0 = vadd_s16(t0, t3);                      // x0 + x3
    s1 = vadd_s16(t1, t2);                      // x1 + x2
    s2 = vsub_s16(t1, t2);                      // x1 - x2
    s3 = vsub_s16(t0, t3);                      // x0 - x3

    /* 4x4 transpose via the usual vtrn 16-bit / 32-bit ladder */
    trn_01 = vtrn_s16(s0, s1);
    trn_23 = vtrn_s16(s2, s3);
    trn_even =
        vtrn_s32(vreinterpret_s32_s16(trn_01.val[0]), vreinterpret_s32_s16(trn_23.val[0]));
    trn_odd =
        vtrn_s32(vreinterpret_s32_s16(trn_01.val[1]), vreinterpret_s32_s16(trn_23.val[1]));

    row0 = vreinterpret_s16_s32(trn_even.val[0]);
    row1 = vreinterpret_s16_s32(trn_odd.val[0]);
    row2 = vreinterpret_s16_s32(trn_even.val[1]);
    row3 = vreinterpret_s16_s32(trn_odd.val[1]);

    /* vertical 1-D inverse transform (same butterfly on the transposed data) */
    half_1 = vshr_n_s16(row1, 1);               // q1 >> 1
    half_3 = vshr_n_s16(row3, 1);               // q3 >> 1

    t0 = vadd_s16(row0, row2);                  // x0 = q0 + q2
    t1 = vsub_s16(row0, row2);                  // x1 = q0 - q2
    t2 = vsub_s16(half_1, row3);                // x2 = (q1 >> 1) - q3
    t3 = vadd_s16(row1, half_3);                // x3 = q1 + (q3 >> 1)

    s0 = vadd_s16(t0, t3);                      // x0 + x3
    s1 = vadd_s16(t1, t2);                      // x1 + x2
    s2 = vsub_s16(t1, t2);                      // x1 - x2
    s3 = vsub_s16(t0, t3);                      // x0 - x3

    /* final rounding shift of the 2-D inverse transform */
    s0 = vrshr_n_s16(s0, 6);
    s1 = vrshr_n_s16(s1, 6);
    s2 = vrshr_n_s16(s2, 6);
    s3 = vrshr_n_s16(s3, 6);

    /* clamp the residuals to [RSD_MIN, RSD_MAX] */
    s0 = vmax_s16(vmin_s16(s0, clip_max), clip_min);
    s1 = vmax_s16(vmin_s16(s1, clip_max), clip_min);
    s2 = vmax_s16(vmin_s16(s2, clip_max), clip_min);
    s3 = vmax_s16(vmin_s16(s3, clip_max), clip_min);

    vst1_s16(pi2_out, s0);
    vst1_s16(pi2_out + out_strd, s1);
    vst1_s16(pi2_out + out_strd * 2, s2);
    vst1_s16(pi2_out + out_strd * 3, s3);
}
961