/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
*******************************************************************************
* @file
*  isvcd_iquant_itrans_sse42.c
*
* @brief
*  Contains function definitions for inverse quantization and inverse
*  transform
*
* @author
*  Kishore
*
* @par List of Functions:
*  - isvcd_iquant_itrans_4x4_sse42()
*  - isvcd_iquant_itrans_chroma_4x4_sse42()
*  - isvcd_iquant_itrans_8x8_dc_sse42()
*  - isvcd_iquant_itrans_4x4_dc_sse42()
*  - isvcd_iquant_itrans_chroma_4x4_dc_sse42()
*  - isvcd_iquant_itrans_8x8_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/
/* User include files */
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_iquant_itrans.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_4x4_dc_sse42                          */
/*                                                                           */
/*  Description   : this function computes the inverse quantized and         */
/*                   inverse transformed output                              */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_4x4_dc_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                      const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                      UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                      WORD16 *pi2_dc_ld_addr)
{
    WORD32 q0;
    WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
    __m128i dupmax_8x16b = _mm_set1_epi16(RSD_MAX);
    __m128i dupmin_8x16b = _mm_set1_epi16(RSD_MIN);
    __m128i i_macro;
    UNUSED(pi2_tmp);

    if(iq_start_idx == 0)
    {
        q0 = pi2_src[0];
        INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
    }
    else
    {
        q0 = pi2_dc_ld_addr[0];  // Restoring dc value for intra case
    }

    i_macro = _mm_set1_epi16((q0 + 32) >> 6);
    i_macro = _mm_min_epi16(dupmax_8x16b, i_macro);
    i_macro = _mm_max_epi16(dupmin_8x16b, i_macro);

    _mm_storel_epi64((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storel_epi64((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storel_epi64((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storel_epi64((__m128i *) pi2_out, i_macro);
}

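/* A minimal scalar sketch of the DC-only path above, kept out of the build
 * with #if 0 since it is illustrative only (the *_ref helper name is
 * hypothetical): every sample of the 4x4 residual block receives the same
 * clipped value. CLIP3 is assumed to be the clamping macro from
 * ih264_macros.h. */
#if 0
static void isvcd_iquant_itrans_4x4_dc_ref(WORD16 *pi2_out, WORD32 out_strd, WORD32 q0)
{
    WORD32 i, j;
    /* (q0 + 32) >> 6 is the final IDCT rounding; the clamp mirrors the
       _mm_min_epi16/_mm_max_epi16 pair above */
    WORD16 i2_val = (WORD16) CLIP3(RSD_MIN, RSD_MAX, (q0 + 32) >> 6);

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            pi2_out[i * out_strd + j] = i2_val;
        }
    }
}
#endif
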
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_8x8_dc_sse42                          */
/*                                                                           */
/*  Description   : this function computes the inverse quantized and         */
/*                   inverse transformed output                              */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_8x8_dc_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                      const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
                                      UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                      WORD16 *pi2_dc_ld_addr)
{
    WORD32 q;
    WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;
    __m128i dupmin_8x16b, dupmax_8x16b, i_macro;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    q = pi2_src[0];
    INV_QUANT(q, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);

    i_macro = _mm_set1_epi16((q + 32) >> 6);
    dupmax_8x16b = _mm_set1_epi16(RSD_MAX);
    dupmin_8x16b = _mm_set1_epi16(RSD_MIN);

    i_macro = _mm_min_epi16(dupmax_8x16b, i_macro);
    i_macro = _mm_max_epi16(dupmin_8x16b, i_macro);

    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
    pi2_out += out_strd;
    _mm_storeu_si128((__m128i *) pi2_out, i_macro);
}
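
/* A minimal scalar sketch (illustrative only, hence #if 0; the *_ref helper
 * name is hypothetical) of the 8x8 dequantization done above through
 * INV_QUANT with qbits = 6, mirroring the two per-row shift branches used in
 * isvcd_iquant_itrans_8x8_sse42() below: level * iscale * weight, then a
 * left shift by qp_div - 6, or a rounded right shift by 6 - qp_div. */
#if 0
static WORD32 isvcd_dequant_8x8_coeff_ref(WORD16 i2_level, UWORD16 u2_iscale,
                                          UWORD16 u2_weight, UWORD32 u4_qp_div)
{
    WORD32 i4_val = i2_level * u2_iscale * u2_weight;

    if(u4_qp_div >= 6)
    {
        i4_val <<= (u4_qp_div - 6);  /* matches the _mm_slli_epi32 branch */
    }
    else
    {
        /* rnd_fact = 1 << (5 - qp_div), matching the _mm_srai_epi32 branch */
        i4_val = (i4_val + (1 << (5 - u4_qp_div))) >> (6 - u4_qp_div);
    }
    return i4_val;
}
#endif
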
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_chroma_4x4_dc_sse42                   */
/*                                                                           */
/*  Description   : this function computes the inverse quantized and         */
/*                   inverse transformed output                              */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_chroma_4x4_dc_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                             const UWORD16 *pu2_iscal_mat,
                                             const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                             WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i i_macro = _mm_set1_epi16((pi2_dc_src[0] + 32) >> 6);
    __m128i chroma_mask_even, chroma_mask_odd;
    __m128i dupmax_8x16b = _mm_set1_epi16(RSD_MAX);
    __m128i dupmin_8x16b = _mm_set1_epi16(RSD_MIN);

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(pi2_tmp);
    UNUSED(u4_qp_div_6);

    i_macro = _mm_min_epi16(dupmax_8x16b, i_macro);
    i_macro = _mm_max_epi16(dupmin_8x16b, i_macro);

    // rows 0..3 of the interleaved chroma output; alternate 16-bit lanes
    // belong to the other chroma plane and must be preserved
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_out));
    src_r1 = _mm_loadu_si128((__m128i *) (pi2_out + (1 * out_strd)));
    src_r2 = _mm_loadu_si128((__m128i *) (pi2_out + (2 * out_strd)));
    src_r3 = _mm_loadu_si128((__m128i *) (pi2_out + (3 * out_strd)));

    chroma_mask_even =
        _mm_set_epi16(0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff);
    chroma_mask_odd = _mm_set_epi16(0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000);

    src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
    src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
    src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
    src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);

    i_macro = _mm_and_si128(i_macro, chroma_mask_even);  // macro 0 macro 0 ..

    src_r0 = _mm_add_epi16(src_r0, i_macro);             // macro src1 macro src2 macro ...
    src_r1 = _mm_add_epi16(src_r1, i_macro);
    src_r2 = _mm_add_epi16(src_r2, i_macro);
    src_r3 = _mm_add_epi16(src_r3, i_macro);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0);
    _mm_storeu_si128((__m128i *) (&pi2_out[out_strd]), src_r1);
    _mm_storeu_si128((__m128i *) (&pi2_out[2 * out_strd]), src_r2);
    _mm_storeu_si128((__m128i *) (&pi2_out[3 * out_strd]), src_r3);
}
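
/* A minimal scalar sketch (illustrative only, hence #if 0; the *_ref helper
 * name is hypothetical) of the interleaved chroma update above: the output
 * rows hold the two chroma planes in alternate 16-bit lanes, so the clipped
 * DC value is written only to every second sample, leaving the other plane
 * untouched. CLIP3 is assumed to be the clamping macro from ih264_macros.h. */
#if 0
static void isvcd_iquant_itrans_chroma_4x4_dc_ref(WORD16 *pi2_out, WORD32 out_strd,
                                                  WORD16 i2_dc_src)
{
    WORD32 i, j;
    WORD16 i2_val = (WORD16) CLIP3(RSD_MIN, RSD_MAX, (i2_dc_src + 32) >> 6);

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            /* lane 2*j: this plane; lane 2*j + 1: the other chroma plane */
            pi2_out[i * out_strd + 2 * j] = i2_val;
        }
    }
}
#endif
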
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_4x4_sse42                             */
/*                                                                           */
/*  Description   : this function computes the inverse quantized and         */
/*                   inverse transformed output                              */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                   const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                   UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                   WORD16 *pi2_dc_ld_addr)
{
    __m128i src_r0_r1, src_r2_r3;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i dequant_r0_r1, dequant_r2_r3;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    __m128i resq_r0, resq_r1, resq_r2, resq_r3;
    __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
    __m128i value_32 = _mm_set1_epi32(32);
    __m128i dupmax_4x32b = _mm_set1_epi32(RSD_MAX);
    __m128i dupmin_4x32b = _mm_set1_epi32(RSD_MIN);
    UNUSED(pi2_tmp);

    /*************************************************************/
    /* Dequantization of coefficients (SIMD equivalent of the    */
    /* scalar INV_QUANT path; see the reference sketch after     */
    /* this function)                                            */
    /*************************************************************/
    // a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
    // a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
    // b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
    // b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
    // q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
    dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
    // q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits
    dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));

    // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
    temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
    // b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31 b32*q32 b33*q33 -- 16 bit result
    temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);

    // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
    temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
    // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
    temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
    // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long
    temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
    // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long
    temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);

    src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);  // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);  // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);  // a30 0 a31 0 a32 0 a33 0 -- 16 bit long

    // a00*b00*q00 a01*b01*q01 a02*b02*q02 a03*b03*q03 -- 32 bits long (row 0; rows 1-3 likewise)
    temp4 = _mm_madd_epi16(src_r0, temp4);
    temp5 = _mm_madd_epi16(src_r1, temp5);
    temp6 = _mm_madd_epi16(src_r2, temp6);
    temp7 = _mm_madd_epi16(src_r3, temp7);

    if(u4_qp_div_6 >= 4)
    {
        resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
        resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
        resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
        resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
    }
    else
    {
        temp4 = _mm_add_epi32(temp4, add_rshift);
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp6 = _mm_add_epi32(temp6, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
        resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
        resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
        resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
    }

    if(iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_ld_addr[0], 0);
    /* Perform Inverse transform */
    /*-------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 b0 a1 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // c0 d0 c1 d1
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // a2 b2 a3 b3
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 d2 c3 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 b0 c0 d0
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // a1 b1 c1 d1
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // a2 b2 c2 d2
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // a3 b3 c3 d3
    // Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* z0 = w0 + w2                                             */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1 = w0 - w2                                             */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2 = (w1 >> 1) - w3                                      */
    temp2 = _mm_srai_epi32(resq_r1, 1);     //(w1>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);  //(w1>>1) - w3
    /* z3 = w1 + (w3 >> 1)                                      */
    temp3 = _mm_srai_epi32(resq_r3, 1);     //(w3>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);  // w1 + (w3>>1)
    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    resq_r0 = _mm_add_epi32(temp0, temp3);
    /* x1 = z1 + z2                                             */
    resq_r1 = _mm_add_epi32(temp1, temp2);
    /* x2 = z1 - z2                                             */
    resq_r2 = _mm_sub_epi32(temp1, temp2);
    /* x3 = z0 - z3                                             */
    resq_r3 = _mm_sub_epi32(temp0, temp3);
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 a1 b0 b1
    temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // a2 a3 b2 b3
    temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // c0 c1 d0 d1
    temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 c3 d2 d3
    resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 a1 a2 a3
    resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // b0 b1 b2 b3
    resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // c0 c1 c2 c3
    resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // d0 d1 d2 d3
    // Transform ends -- horizontal transform

    /*--------------------------------------------------------------*/
    /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
    /*                                                              */
    /* Clip the result and store it back to the output buffer       */
    /*--------------------------------------------------------------*/
    /* z0j = y0j + y2j                                                        */
    temp0 = _mm_add_epi32(resq_r0, resq_r2);
    /* z1j = y0j - y2j                                                        */
    temp1 = _mm_sub_epi32(resq_r0, resq_r2);
    /* z2j = (y1j>>1) - y3j */
    temp2 = _mm_srai_epi32(resq_r1, 1);  //(y1j>>1)
    temp2 = _mm_sub_epi32(temp2, resq_r3);
    /* z3j = y1j + (y3j>>1) */
    temp3 = _mm_srai_epi32(resq_r3, 1);  //(y3j>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1);

    /* x0j = z0j + z3j                                                        */
    temp4 = _mm_add_epi32(temp0, temp3);
    temp4 = _mm_add_epi32(temp4, value_32);
    temp4 = _mm_srai_epi32(temp4, 6);
    temp4 = _mm_min_epi32(dupmax_4x32b, temp4);
    temp4 = _mm_max_epi32(dupmin_4x32b, temp4);

    /* x1j = z1j + z2j                                                        */
    temp5 = _mm_add_epi32(temp1, temp2);
    temp5 = _mm_add_epi32(temp5, value_32);
    temp5 = _mm_srai_epi32(temp5, 6);
    temp5 = _mm_min_epi32(dupmax_4x32b, temp5);
    temp5 = _mm_max_epi32(dupmin_4x32b, temp5);

    /* x2j = z1j - z2j                                                        */
    temp6 = _mm_sub_epi32(temp1, temp2);
    temp6 = _mm_add_epi32(temp6, value_32);
    temp6 = _mm_srai_epi32(temp6, 6);
    temp6 = _mm_min_epi32(dupmax_4x32b, temp6);
    temp6 = _mm_max_epi32(dupmin_4x32b, temp6);

    /* x3j = z0j - z3j                                                        */
    temp7 = _mm_sub_epi32(temp0, temp3);
    temp7 = _mm_add_epi32(temp7, value_32);
    temp7 = _mm_srai_epi32(temp7, 6);
    temp7 = _mm_min_epi32(dupmax_4x32b, temp7);
    temp7 = _mm_max_epi32(dupmin_4x32b, temp7);

    // 32-bit to 16-bit conversion
    temp0 = _mm_packs_epi32(temp4, temp5);
    temp1 = _mm_packs_epi32(temp6, temp7);

    resq_r0 = temp0;
    resq_r1 = _mm_srli_si128(temp0, 8);
    resq_r2 = temp1;
    resq_r3 = _mm_srli_si128(temp1, 8);

    _mm_storel_epi64((__m128i *) pi2_out, resq_r0);
    pi2_out += out_strd;
    _mm_storel_epi64((__m128i *) pi2_out, resq_r1);
    pi2_out += out_strd;
    _mm_storel_epi64((__m128i *) pi2_out, resq_r2);
    pi2_out += out_strd;
    _mm_storel_epi64((__m128i *) pi2_out, resq_r3);
}

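/* A minimal scalar sketch (illustrative only, hence #if 0; the *_ref helper
 * name is hypothetical) of the 4x4 path above: per-coefficient
 * dequantization with the same two shift branches, the H.264 4x4 inverse
 * transform butterflies (the z/x steps in the comments above), and the
 * (x + 32) >> 6 rounding with clipping. The iq_start_idx DC-restore special
 * case is omitted, and CLIP3 is assumed to come from ih264_macros.h. */
#if 0
static void isvcd_iquant_itrans_4x4_ref(const WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                        const UWORD16 *pu2_iscal_mat,
                                        const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6)
{
    WORD32 i, j;
    WORD32 ai4_res[16];

    /* dequantize: level * iscale * weight, shifted according to qp/6 */
    for(i = 0; i < 16; i++)
    {
        WORD32 i4_val = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
        if(u4_qp_div_6 >= 4)
            i4_val <<= (u4_qp_div_6 - 4);
        else
            i4_val = (i4_val + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
        ai4_res[i] = i4_val;
    }
    /* horizontal pass: one butterfly per row */
    for(i = 0; i < 4; i++)
    {
        WORD32 *pi4_w = &ai4_res[i * 4];
        WORD32 z0 = pi4_w[0] + pi4_w[2];
        WORD32 z1 = pi4_w[0] - pi4_w[2];
        WORD32 z2 = (pi4_w[1] >> 1) - pi4_w[3];
        WORD32 z3 = pi4_w[1] + (pi4_w[3] >> 1);
        pi4_w[0] = z0 + z3;
        pi4_w[1] = z1 + z2;
        pi4_w[2] = z1 - z2;
        pi4_w[3] = z0 - z3;
    }
    /* vertical pass, then round, clip and store */
    for(j = 0; j < 4; j++)
    {
        WORD32 z0 = ai4_res[0 * 4 + j] + ai4_res[2 * 4 + j];
        WORD32 z1 = ai4_res[0 * 4 + j] - ai4_res[2 * 4 + j];
        WORD32 z2 = (ai4_res[1 * 4 + j] >> 1) - ai4_res[3 * 4 + j];
        WORD32 z3 = ai4_res[1 * 4 + j] + (ai4_res[3 * 4 + j] >> 1);
        pi2_out[0 * out_strd + j] = (WORD16) CLIP3(RSD_MIN, RSD_MAX, (z0 + z3 + 32) >> 6);
        pi2_out[1 * out_strd + j] = (WORD16) CLIP3(RSD_MIN, RSD_MAX, (z1 + z2 + 32) >> 6);
        pi2_out[2 * out_strd + j] = (WORD16) CLIP3(RSD_MIN, RSD_MAX, (z1 - z2 + 32) >> 6);
        pi2_out[3 * out_strd + j] = (WORD16) CLIP3(RSD_MIN, RSD_MAX, (z0 - z3 + 32) >> 6);
    }
}
#endif
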
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_iquant_itrans_8x8_sse42                             */
/*                                                                           */
/*  Description   : this function computes the inverse quantized and         */
/*                   inverse transformed output                              */
/*                                                                           */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

void isvcd_iquant_itrans_8x8_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                   const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
                                   UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                   WORD16 *pi2_dc_ld_addr)
{
    __m128i src_r0;
    __m128i scalemat_r0;
    __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
    __m128i value_32 = _mm_set1_epi32(32);
    __m128i add_rshift = _mm_set1_epi32((qp_div < 6) ? (1 << (5 - qp_div)) : 0);
    __m128i dequant_r0;
    __m128i sign_reg;
    __m128i src_r0_1, src_r0_2;
    __m128i scalemat_r0_1, scalemat_r0_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    __m128i temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18, temp19, temp20;
    // To store dequantization results
    __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2, resq_r3_1, resq_r3_2,
        resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2, resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2;
    __m128i dupmax_4x32b = _mm_set1_epi32(RSD_MAX);
    __m128i dupmin_4x32b = _mm_set1_epi32(RSD_MIN);

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    /*************************************************************/
    /* Dequantization of coefficients (SIMD equivalent of the    */
    /* scalar INV_QUANT path; see the sketch following           */
    /* isvcd_iquant_itrans_8x8_dc_sse42() above).                */
    /* Note : DC coeff is not scaled                             */
    /*************************************************************/

    // Row 0 processing
    // a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 0th row
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src));
    // b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 0th row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat));
    // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[0]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);

    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);

    if(qp_div >= 6)
    {
        resq_r0_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r0_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r0_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r0_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16 bit long
    resq_r0_1 = _mm_packs_epi32(resq_r0_1, resq_r0_2);
    // Row 1 processing
    // a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 1st row
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
    // b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 1st row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 8));
    // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[8]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
    if(qp_div >= 6)
    {
        resq_r1_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r1_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r1_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r1_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
    resq_r1_1 = _mm_packs_epi32(resq_r1_1, resq_r1_2);
    // Row 2 processing
    // a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 2nd row
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 16));
    // b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 2nd row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 16));
    // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[16]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
    if(qp_div >= 6)
    {
        resq_r2_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r2_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r2_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r2_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
    resq_r2_1 = _mm_packs_epi32(resq_r2_1, resq_r2_2);
    // Row 3 processing
    // a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 3rd row
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 24));
    // b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 3rd row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 24));
    // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[24]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
    if(qp_div >= 6)
    {
        resq_r3_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r3_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r3_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r3_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
    resq_r3_1 = _mm_packs_epi32(resq_r3_1, resq_r3_2);
    // Row 4 processing
    // a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 4th row
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 32));
    // b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 4th row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 32));
    // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[32]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
    if(qp_div >= 6)
    {
        resq_r4_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r4_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r4_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r4_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
    resq_r4_1 = _mm_packs_epi32(resq_r4_1, resq_r4_2);
    // Row 5 processing
    // a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 5th row
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 40));
    // b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 5th row
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 40));
    // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[40]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
    // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
    if(qp_div >= 6)
    {
        resq_r5_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r5_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r5_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r5_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 */
    resq_r5_1 = _mm_packs_epi32(resq_r5_1, resq_r5_2);
    // Row 6 processing
    /* a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 6th row */
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 48));
    /* b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 6th row */
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 48));
    /* q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits */
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[48]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
    /* b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result */
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
    /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long */
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    /* a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long */
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
    if(qp_div >= 6)
    {
        resq_r6_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r6_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r6_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r6_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 */
    resq_r6_1 = _mm_packs_epi32(resq_r6_1, resq_r6_2);
    // Row 7 processing
    /* a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 7th row */
    src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 56));
    /* b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 7th row */
    scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 56));
    /* q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits */
    dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[56]));
    src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
    src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);  // a04 0 a05 0 a06 0 a07 0 -- 16 bit long

    /* b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result */
    temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
    /* b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long */
    scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
    /* b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long */
    scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
    /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long */
    temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
    /* a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long */
    temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
    if(qp_div >= 6)
    {
        resq_r7_1 = _mm_slli_epi32(temp5, qp_div - 6);
        resq_r7_2 = _mm_slli_epi32(temp7, qp_div - 6);
    }
    else
    {
        temp5 = _mm_add_epi32(temp5, add_rshift);
        temp7 = _mm_add_epi32(temp7, add_rshift);
        resq_r7_1 = _mm_srai_epi32(temp5, 6 - qp_div);
        resq_r7_2 = _mm_srai_epi32(temp7, 6 - qp_div);
    }
    /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 */
    resq_r7_1 = _mm_packs_epi32(resq_r7_1, resq_r7_2);
    /* Perform Inverse transform */
    /*--------------------------------------------------------------------*/
    /* IDCT [ Horizontal transformation ]                                 */
    /*--------------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3 a4 a5 a6 a7
     *  b0 b1 b2 b3 b4 b5 b6 b7
     *  c0 c1 c2 c3 c4 c5 c6 c7
     *  d0 d1 d2 d3 d4 d5 d6 d7
     */
    temp1 = _mm_unpacklo_epi16(resq_r0_1, resq_r1_1);  // a0 b0 a1 b1 a2 b2 a3 b3
    temp3 = _mm_unpacklo_epi16(resq_r2_1, resq_r3_1);  // c0 d0 c1 d1 c2 d2 c3 d3
    temp2 = _mm_unpackhi_epi16(resq_r0_1, resq_r1_1);  // a4 b4 a5 b5 a6 b6 a7 b7
    temp4 = _mm_unpackhi_epi16(resq_r2_1, resq_r3_1);  // c4 d4 c5 d5 c6 d6 c7 d7
    resq_r0_1 = _mm_unpacklo_epi32(temp1, temp3);      // a0 b0 c0 d0 a1 b1 c1 d1
    resq_r1_1 = _mm_unpackhi_epi32(temp1, temp3);      // a2 b2 c2 d2 a3 b3 c3 d3
    resq_r2_1 = _mm_unpacklo_epi32(temp2, temp4);      // a4 b4 c4 d4 a5 b5 c5 d5
    resq_r3_1 = _mm_unpackhi_epi32(temp2, temp4);      // a6 b6 c6 d6 a7 b7 c7 d7
    /*
     * e0 e1 e2 e3 e4 e5 e6 e7
     * f0 f1 f2 f3 f4 f5 f6 f7
     * g0 g1 g2 g3 g4 g5 g6 g7
     * h0 h1 h2 h3 h4 h5 h6 h7
     */
    temp1 = _mm_unpacklo_epi16(resq_r4_1, resq_r5_1);  // e0 f0 e1 f1 e2 f2 e3 f3
    temp3 = _mm_unpacklo_epi16(resq_r6_1, resq_r7_1);  // g0 h0 g1 h1 g2 h2 g3 h3
    temp2 = _mm_unpackhi_epi16(resq_r4_1, resq_r5_1);  // e4 f4 e5 f5 e6 f6 e7 f7
    temp4 = _mm_unpackhi_epi16(resq_r6_1, resq_r7_1);  // g4 h4 g5 h5 g6 h6 g7 h7
    resq_r4_1 = _mm_unpacklo_epi32(temp1, temp3);      // e0 f0 g0 h0 e1 f1 g1 h1
    resq_r5_1 = _mm_unpackhi_epi32(temp1, temp3);      // e2 f2 g2 h2 e3 f3 g3 h3
    resq_r6_1 = _mm_unpacklo_epi32(temp2, temp4);      // e4 f4 g4 h4 e5 f5 g5 h5
    resq_r7_1 = _mm_unpackhi_epi32(temp2, temp4);      // e6 f6 g6 h6 e7 f7 g7 h7
    /*
     * a0 b0 c0 d0 a1 b1 c1 d1
     * a2 b2 c2 d2 a3 b3 c3 d3
     * a4 b4 c4 d4 a5 b5 c5 d5
     * a6 b6 c6 d6 a7 b7 c7 d7
     * e0 f0 g0 h0 e1 f1 g1 h1
     * e2 f2 g2 h2 e3 f3 g3 h3
     * e4 f4 g4 h4 e5 f5 g5 h5
     * e6 f6 g6 h6 e7 f7 g7 h7
     */
    resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1);  // a0 b0 c0 d0 e0 f0 g0 h0
    resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1);  // a1 b1 c1 d1 e1 f1 g1 h1
    resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1);  // a2 b2 c2 d2 e2 f2 g2 h2
    resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1);  // a3 b3 c3 d3 e3 f3 g3 h3
    resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1);  // a4 b4 c4 d4 e4 f4 g4 h4
    resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1);  // a5 b5 c5 d5 e5 f5 g5 h5
    resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1);  // a6 b6 c6 d6 e6 f6 g6 h6
    resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1);  // a7 b7 c7 d7 e7 f7 g7 h7

    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
    resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg);  // a1 b1 c1 d1 -- 32 bit
    resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg);  // e1 f1 g1 h1 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
    resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg);  // a3 b3 c3 d3 -- 32 bit
    resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg);  // e3 f3 g3 h3 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
    resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg);  // a5 b5 c5 d5 -- 32 bit
    resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg);  // e5 f5 g5 h5 -- 32 bit
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
    resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg);  // a7 b7 c7 d7 -- 32 bit
    resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg);  // e7 f7 g7 h7 -- 32 bit
    // Transform starts -- horizontal transform
    /*------------------------------------------------------------------*/
    /* y0 = w0 + w4                                                     */
    temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
    /* y2 = w0 - w4                                                     */
    temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
    /* y1 = -w3 + w5 - w7 - (w7 >> 1)                                   */
    temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1);  //-w3+w5
    temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
    temp4 = _mm_sub_epi32(temp2, resq_r7_1);      //-w3+w5-w7
    temp12 = _mm_sub_epi32(temp10, resq_r7_2);
    temp5 = _mm_srai_epi32(resq_r7_1, 1);         // w7>>1
    temp13 = _mm_srai_epi32(resq_r7_2, 1);
    temp2 = _mm_sub_epi32(temp4, temp5);          //-w3+w5-w7-(w7>>1)
    temp10 = _mm_sub_epi32(temp12, temp13);
    temp2 = _mm_packs_epi32(temp2, temp10);
    /* y3 = w1 + w7 - w3 - (w3 >> 1)                                    */
    temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1);  // w1+w7
    temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
    temp4 = _mm_sub_epi32(temp4, resq_r3_1);      // w1+w7-w3
    temp12 = _mm_sub_epi32(temp12, resq_r3_2);
    temp5 = _mm_srai_epi32(resq_r3_1, 1);         // w3>>1
    temp13 = _mm_srai_epi32(resq_r3_2, 1);
    temp4 = _mm_sub_epi32(temp4, temp5);          // w1+w7-w3-(w3>>1)
    temp12 = _mm_sub_epi32(temp12, temp13);
    temp4 = _mm_packs_epi32(temp4, temp12);
    /* y4 = (w2 >> 1) - w6                                              */
    temp5 = _mm_srai_epi16(resq_r2_2, 1);     // w2>>1
    temp5 = _mm_sub_epi16(temp5, resq_r6_2);  //(w2>>1)-w6
    /* y5 = -w1 + w7 + w5 + (w5 >> 1)                                   */
    temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1);  // w7-w1
    temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
    temp6 = _mm_add_epi32(temp6, resq_r5_1);      // w7-w1+w5
    temp14 = _mm_add_epi32(temp14, resq_r5_2);
    temp7 = _mm_srai_epi32(resq_r5_1, 1);         // w5>>1
    temp15 = _mm_srai_epi32(resq_r5_2, 1);
    temp6 = _mm_add_epi32(temp6, temp7);          // w7-w1+w5+(w5>>1)
    temp14 = _mm_add_epi32(temp14, temp15);
    temp6 = _mm_packs_epi32(temp6, temp14);
    /* y6 = w2 + (w6 >> 1)                                              */
    temp7 = _mm_srai_epi16(resq_r6_2, 1);     // w6>>1
    temp7 = _mm_add_epi16(temp7, resq_r2_2);  //(w6>>1)+w2
    /* y7 = w3 + w5 + w1 + (w1 >> 1)                                    */
    temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1);  // w3+w5
    temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
    temp8 = _mm_add_epi32(temp8, resq_r1_1);      // w3+w5+w1
    temp16 = _mm_add_epi32(temp16, resq_r1_2);
    temp17 = _mm_srai_epi32(resq_r1_1, 1);        // w1>>1
    temp18 = _mm_srai_epi32(resq_r1_2, 1);
    temp8 = _mm_add_epi32(temp8, temp17);         // w3+w5+w1+(w1>>1)
    temp16 = _mm_add_epi32(temp16, temp18);
    temp8 = _mm_packs_epi32(temp8, temp16);
    /*------------------------------------------------------------------*/
    /*------------------------------------------------------------------*/
    /* z0 = y0 + y6                                                        */
    resq_r0_1 = _mm_add_epi16(temp1, temp7);
    /* z1 = y1 + (y7 >> 2)                                                */
    resq_r1_1 = _mm_srai_epi16(temp8, 2);
    resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
    /* z2 = y2 + y4                                                        */
    resq_r2_1 = _mm_add_epi16(temp3, temp5);
    /* z3 = y3 + (y5 >> 2)                                                */
    resq_r3_1 = _mm_srai_epi16(temp6, 2);
    resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
    /* z4 = y2 - y4                                                        */
    resq_r4_1 = _mm_sub_epi16(temp3, temp5);
    /* z5 = (y3 >> 2) - y5                                                 */
    resq_r5_1 = _mm_srai_epi16(temp4, 2);
    resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
    /* z6 = y0 - y6                                                     */
    resq_r6_1 = _mm_sub_epi16(temp1, temp7);
    /* z7 = y7 - (y1 >> 2)                                                 */
    resq_r7_1 = _mm_srai_epi16(temp2, 2);
    resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
    /*------------------------------------------------------------------*/
    /*------------------------------------------------------------------*/
    /* x0 = z0 + z7                                                        */
    temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
    /* x1 = z2 + z5                                                        */
    temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
    /* x2 = z4 + z3                                                        */
    temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
    /* x3 = z6 + z1                                                        */
    temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
    /* x4 = z6 - z1                                                        */
    temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
    /* x5 = z4 - z3                                                        */
    temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
    /* x6 = z2 - z5                                                        */
    temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
    /* x7 = z0 - z7                                                        */
    temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
    /*------------------------------------------------------------------*/
950     // Matrix transpose
951     /*
952      *  a0 b0 c0 d0 e0 f0 g0 h0
953      *  a1 b1 c1 d1 e1 f1 g1 h1
954      *  a2 b2 c2 d2 e2 f2 g2 h2
955      *  a3 b3 c3 d3 e3 f3 g3 h3
956      */
957     temp17 = _mm_unpacklo_epi16(temp1, temp2);       // a0 a1 b0 b1 c0 c1 d0 d1
958     temp19 = _mm_unpacklo_epi16(temp3, temp4);       // a2 a3 b2 b3 c2 c3 d2 d3
959     temp18 = _mm_unpackhi_epi16(temp1, temp2);       // e0 e1 f0 f1 g0 g1 h0 h1
960     temp20 = _mm_unpackhi_epi16(temp3, temp4);       // e2 e3 f2 f3 g2 g3 h2 h3
961 
962     resq_r0_1 = _mm_unpacklo_epi32(temp17, temp19);  // a0 a1 a2 a3 b0 b1 b2 b3
963     resq_r1_1 = _mm_unpackhi_epi32(temp17, temp19);  // c0 c1 c2 c3 d0 d1 d2 d3
964     resq_r2_1 = _mm_unpacklo_epi32(temp18, temp20);  // e0 e1 e2 e3 f0 f1 f2 f3
965     resq_r3_1 = _mm_unpackhi_epi32(temp18, temp20);  // g0 g2 g2 g3 h0 h1 h2 h3
966     /*
967      *  a4 b4 c4 d4 e4 f4 g4 h4
968      *  a5 b5 c5 d5 e5 f5 g5 h5
969      *  a6 b6 c6 d6 e6 f6 g6 h6
970      *  a7 b7 c7 d7 e7 f7 g7 h7
971      */
972     temp17 = _mm_unpacklo_epi16(temp5, temp6);       // a4 a5 b4 b5 c4 c5 d4 d5
973     temp19 = _mm_unpacklo_epi16(temp7, temp8);       // a6 a7 b6 b7 c6 c7 d6 d7
974     temp18 = _mm_unpackhi_epi16(temp5, temp6);       // e4 e5 f4 f5 g4 g5 h4 h5
975     temp20 = _mm_unpackhi_epi16(temp7, temp8);       // e6 e7 f6 f7 g6 g7 h6 h7
976 
977     resq_r4_1 = _mm_unpacklo_epi32(temp17, temp19);  // a4 a5 a6 a7 b4 b5 b6 b7
978     resq_r5_1 = _mm_unpackhi_epi32(temp17, temp19);  // c4 c5 c6 c7 d4 d5 d6 d7
979     resq_r6_1 = _mm_unpacklo_epi32(temp18, temp20);  // e4 e5 e6 e7 f4 f5 f6 f7
980     resq_r7_1 = _mm_unpackhi_epi32(temp18, temp20);  // g4 g5 g6 g7 h4 h5 h6 h7
981     /*  a0 a1 a2 a3 b0 b1 b2 b3
982      *  c0 c1 c2 c3 d0 d1 d2 d3
983      *  e0 e1 e2 e3 f0 f1 f2 f3
984      *  g0 g2 g2 g3 h0 h1 h2 h3
985      *  a4 a5 a6 a7 b4 b5 b6 b7
986      *  c4 c5 c6 c7 d4 d5 d6 d7
987      *  e4 e5 e6 e7 f4 f5 f6 f7
988      *  g4 g5 g6 g7 h4 h5 h6 h7
989      */
990     resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1);  // a0 a1 a2 a3 a4 a5 a6 a7
991     resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1);  // b0 b1 b2 b3 b4 b5 b6 b7
992     resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1);  // c0 c1 c2 c3 c4 c5 c6 c7
993     resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1);  // d0 d1 d2 d3 d4 d5 d6 d7
994     resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1);  // e0 e1 e2 e3 e4 e5 e6 e7
995     resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1);  // f0 f1 f2 f3 f4 f5 f6 f7
996     resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1);  // g0 g1 g2 g3 g4 g5 g6 g7
997     resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1);  // h0 h1 h2 h3 h4 h5 h6 h7
998 
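    /* Three interleave stages -- 16-bit, 32-bit, then 64-bit unpacks --
     * complete the 8x8 transpose above, so the rows produced by the
     * horizontal pass become the columns consumed by the vertical pass.
     */
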
999     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
1000     resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg);  // a1 b1 c1 d1 -- 32 bit
1001     resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg);  // e1 f1 g1 h1 -- 32 bit
1002     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
1003     resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg);  // a3 b3 c3 d3 -- 32 bit
1004     resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg);  // e3 f3 g3 h3 -- 32 bit
1005     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
1006     resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg);  // a5 b5 c5 d5 -- 32 bit
1007     resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg);  // e5 f5 g5 h5 -- 32 bit
1008     sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
1009     resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg);  // a7 b7 c7 d7 -- 32 bit
1010     resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg);  // e7 f7 g7 h7 -- 32 bit
1011 
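    /* The widening above uses the classic sign-extension idiom: compare
     * against zero to build a sign mask, then interleave it with the 16-bit
     * lanes (on SSE4.1+, _mm_cvtepi16_epi32 widens the low four lanes
     * directly). A minimal sketch of the idiom, kept out of the build:
     */
#if 0
    __m128i v;                                               /* any 8 x WORD16 */
    __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), v);  /* 0xFFFF where v < 0 */
    __m128i lo = _mm_unpacklo_epi16(v, sign);                /* lanes 0..3 as 32 bit */
    __m128i hi = _mm_unpackhi_epi16(v, sign);                /* lanes 4..7 as 32 bit */
#endif
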
1012     /*--------------------------------------------------------------------*/
1013     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6            */
1014     /*--------------------------------------------------------------------*/
1015 
1016     /* y0j = w0j + w4j                                                     */
1017     temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
1018     /* y2j = w0j - w4j                                                      */
1019     temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
1020     /* y1j = -w3j + w5j - w7j - (w7j >> 1)                                   */
1021     temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1);  //-w3+w5
1022     temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
1023     temp4 = _mm_sub_epi32(temp2, resq_r7_1);      //-w3+w5-w7
1024     temp12 = _mm_sub_epi32(temp10, resq_r7_2);
1025     temp5 = _mm_srai_epi32(resq_r7_1, 1);         // w7>>1
1026     temp13 = _mm_srai_epi32(resq_r7_2, 1);
1027     temp2 = _mm_sub_epi32(temp4, temp5);          //-w3+w5-w7 -(w7>>1)
1028     temp10 = _mm_sub_epi32(temp12, temp13);
1029     temp2 = _mm_packs_epi32(temp2, temp10);
1030     /* y3j = w1j + w7j - w3j - (w3j >> 1)                                    */
1031     temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1);  // w1+w7
1032     temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
1033     temp4 = _mm_sub_epi32(temp4, resq_r3_1);      // w1+w7-w3
1034     temp12 = _mm_sub_epi32(temp12, resq_r3_2);
1035     temp5 = _mm_srai_epi32(resq_r3_1, 1);         // w3>>1
1036     temp13 = _mm_srai_epi32(resq_r3_2, 1);
1037     temp4 = _mm_sub_epi32(temp4, temp5);          // w1+w7-w3-(w3>>1)
1038     temp12 = _mm_sub_epi32(temp12, temp13);
1039     temp4 = _mm_packs_epi32(temp4, temp12);
1040     /* y4j = (w2j >> 1) - w6j                                              */
1041     temp5 = _mm_srai_epi16(resq_r2_2, 1);     // w2>>1
1042     temp5 = _mm_sub_epi16(temp5, resq_r6_2);  //(w2>>1)-w6
1043     /* y5j = -w1j + w7j + w5j + (w5j >> 1)                                   */
1044     temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1);  // w7-w1
1045     temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
1046     temp6 = _mm_add_epi32(temp6, resq_r5_1);      // w7-w1+w5
1047     temp14 = _mm_add_epi32(temp14, resq_r5_2);
1048     temp7 = _mm_srai_epi32(resq_r5_1, 1);         // w5>>1
1049     temp15 = _mm_srai_epi32(resq_r5_2, 1);
1050     temp6 = _mm_add_epi32(temp6, temp7);          // w7-w1+w5+(w5>>1)
1051     temp14 = _mm_add_epi32(temp14, temp15);
1052     temp6 = _mm_packs_epi32(temp6, temp14);
1053     /* y6j = w2j + (w6j >> 1)                                              */
1054     temp7 = _mm_srai_epi16(resq_r6_2, 1);     // w6>>1
1055     temp7 = _mm_add_epi16(temp7, resq_r2_2);  //(w6>>1)+w2
1056     /* y7j = w3j + w5j + w1j + (w1j >> 1)                                    */
1057     temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1);  // w3+w5
1058     temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
1059     temp8 = _mm_add_epi32(temp8, resq_r1_1);      // w3+w5+w1
1060     temp16 = _mm_add_epi32(temp16, resq_r1_2);
1061     temp17 = _mm_srai_epi32(resq_r1_1, 1);        // w1>>1
1062     temp18 = _mm_srai_epi32(resq_r1_2, 1);
1063     temp8 = _mm_add_epi32(temp8, temp17);         // w3+w5+w1+(w1>>1)
1064     temp16 = _mm_add_epi32(temp16, temp18);
1065     temp8 = _mm_packs_epi32(temp8, temp16);
1066     /*------------------------------------------------------------------*/
1067     /*------------------------------------------------------------------*/
1068     /* z0j = y0j + y6j                                                        */
1069     resq_r0_1 = _mm_add_epi16(temp1, temp7);
1070     /* z1j = y1j + (y7j >> 2)                                                */
1071     resq_r1_1 = _mm_srai_epi16(temp8, 2);
1072     resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
1073     /* z2j = y2j + y4j                                                        */
1074     resq_r2_1 = _mm_add_epi16(temp3, temp5);
1075     /* z3j = y3j + (y5j >> 2)                                                */
1076     resq_r3_1 = _mm_srai_epi16(temp6, 2);
1077     resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
1078     /* z4j = y2j - y4j                                                        */
1079     resq_r4_1 = _mm_sub_epi16(temp3, temp5);
1080     /* z5j = (y3j >> 2) - y5j                                                 */
1081     resq_r5_1 = _mm_srai_epi16(temp4, 2);
1082     resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
1083     /* z6j = y0j - y6j                                                     */
1084     resq_r6_1 = _mm_sub_epi16(temp1, temp7);
1085     /* z7j = y7j - (y1j >> 2)                                                 */
1086     resq_r7_1 = _mm_srai_epi16(temp2, 2);
1087     resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
1088     /*------------------------------------------------------------------*/
1089 
1090     /*------------------------------------------------------------------*/
1091     /* x0j = z0j + z7j                                                        */
1092     temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
1093     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp1);
1094     temp10 = _mm_unpacklo_epi16(temp1, sign_reg);
1095     temp11 = _mm_unpackhi_epi16(temp1, sign_reg);
1096     temp10 = _mm_add_epi32(temp10, value_32);
1097     temp11 = _mm_add_epi32(temp11, value_32);
1098     temp10 = _mm_srai_epi32(temp10, 6);
1099     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1100     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1101     temp11 = _mm_srai_epi32(temp11, 6);
1102     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1103     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1104     temp10 = _mm_packs_epi32(temp10, temp11);
1105     temp1 = temp10;  // residual only; a recon path would add pred_r0_1 here
1106     /* x1j = z2j + z5j                                                        */
1107     temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
1108     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp2);
1109     temp10 = _mm_unpacklo_epi16(temp2, sign_reg);
1110     temp11 = _mm_unpackhi_epi16(temp2, sign_reg);
1111     temp10 = _mm_add_epi32(temp10, value_32);
1112     temp11 = _mm_add_epi32(temp11, value_32);
1113     temp10 = _mm_srai_epi32(temp10, 6);
1114     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1115     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1116     temp11 = _mm_srai_epi32(temp11, 6);
1117     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1118     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1119     temp10 = _mm_packs_epi32(temp10, temp11);
1120     temp2 = temp10;  // residual only; a recon path would add pred_r1_1 here
1121     /* x2j = z4j + z3j                                                        */
1122     temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
1123     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp3);
1124     temp10 = _mm_unpacklo_epi16(temp3, sign_reg);
1125     temp11 = _mm_unpackhi_epi16(temp3, sign_reg);
1126     temp10 = _mm_add_epi32(temp10, value_32);
1127     temp11 = _mm_add_epi32(temp11, value_32);
1128     temp10 = _mm_srai_epi32(temp10, 6);
1129     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1130     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1131     temp11 = _mm_srai_epi32(temp11, 6);
1132     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1133     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1134     temp10 = _mm_packs_epi32(temp10, temp11);
1135     temp3 = temp10;  // residual only; a recon path would add pred_r2_1 here
1136     /* x3j = z6j + z1j                                                        */
1137     temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
1138     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp4);
1139     temp10 = _mm_unpacklo_epi16(temp4, sign_reg);
1140     temp11 = _mm_unpackhi_epi16(temp4, sign_reg);
1141     temp10 = _mm_add_epi32(temp10, value_32);
1142     temp11 = _mm_add_epi32(temp11, value_32);
1143     temp10 = _mm_srai_epi32(temp10, 6);
1144     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1145     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1146     temp11 = _mm_srai_epi32(temp11, 6);
1147     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1148     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1149     temp10 = _mm_packs_epi32(temp10, temp11);
1150     temp4 = temp10;  // residual only; a recon path would add pred_r3_1 here
1151     /* x4j = z6j - z1j                                                        */
1152     temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
1153     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp5);
1154     temp10 = _mm_unpacklo_epi16(temp5, sign_reg);
1155     temp11 = _mm_unpackhi_epi16(temp5, sign_reg);
1156     temp10 = _mm_add_epi32(temp10, value_32);
1157     temp11 = _mm_add_epi32(temp11, value_32);
1158     temp10 = _mm_srai_epi32(temp10, 6);
1159     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1160     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1161     temp11 = _mm_srai_epi32(temp11, 6);
1162     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1163     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1164     temp10 = _mm_packs_epi32(temp10, temp11);
1165     temp5 = temp10;  // residual only; a recon path would add pred_r4_1 here
1166     /* x5j = z4j - z3j                                                        */
1167     temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
1168     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp6);
1169     temp10 = _mm_unpacklo_epi16(temp6, sign_reg);
1170     temp11 = _mm_unpackhi_epi16(temp6, sign_reg);
1171     temp10 = _mm_add_epi32(temp10, value_32);
1172     temp11 = _mm_add_epi32(temp11, value_32);
1173     temp10 = _mm_srai_epi32(temp10, 6);
1174     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1175     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1176     temp11 = _mm_srai_epi32(temp11, 6);
1177     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1178     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1179     temp10 = _mm_packs_epi32(temp10, temp11);
1180     temp6 = temp10;  // residual only; a recon path would add pred_r5_1 here
1181     /* x6j = z2j - z5j                                                        */
1182     temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
1183     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp7);
1184     temp10 = _mm_unpacklo_epi16(temp7, sign_reg);
1185     temp11 = _mm_unpackhi_epi16(temp7, sign_reg);
1186     temp10 = _mm_add_epi32(temp10, value_32);
1187     temp11 = _mm_add_epi32(temp11, value_32);
1188     temp10 = _mm_srai_epi32(temp10, 6);
1189     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1190     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1191     temp11 = _mm_srai_epi32(temp11, 6);
1192     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1193     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1194     temp10 = _mm_packs_epi32(temp10, temp11);
1195     temp7 = temp10;  // residual only; a recon path would add pred_r6_1 here
1196     /* x7j = z0j - z7j                                                        */
1197     temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
1198     sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp8);
1199     temp10 = _mm_unpacklo_epi16(temp8, sign_reg);
1200     temp11 = _mm_unpackhi_epi16(temp8, sign_reg);
1201     temp10 = _mm_add_epi32(temp10, value_32);
1202     temp11 = _mm_add_epi32(temp11, value_32);
1203     temp10 = _mm_srai_epi32(temp10, 6);
1204     temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1205     temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1206     temp11 = _mm_srai_epi32(temp11, 6);
1207     temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1208     temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1209     temp10 = _mm_packs_epi32(temp10, temp11);
1210     temp8 = temp10;  // residual only; a recon path would add pred_r7_1 here
1211 
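    /* Each of the eight blocks above is the SIMD form of the per-coefficient
     * rounding and residual clamp, evaluated in 32-bit precision and then
     * packed back to 16 bits. A scalar sketch of one coefficient (helper
     * name illustrative, kept out of the build):
     */
#if 0
    static WORD16 isvcd_round_clamp_sketch(WORD32 x)
    {
        WORD32 v = (x + 32) >> 6; /* Xij = (xij + 32) >> 6 */
        if(v > RSD_MAX) v = RSD_MAX;
        if(v < RSD_MIN) v = RSD_MIN;
        return (WORD16) v;
    }
#endif
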
1212     _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp1);
1213     _mm_storeu_si128((__m128i *) (&pi2_out[out_strd]), temp2);
1214     _mm_storeu_si128((__m128i *) (&pi2_out[2 * out_strd]), temp3);
1215     _mm_storeu_si128((__m128i *) (&pi2_out[3 * out_strd]), temp4);
1216     _mm_storeu_si128((__m128i *) (&pi2_out[4 * out_strd]), temp5);
1217     _mm_storeu_si128((__m128i *) (&pi2_out[5 * out_strd]), temp6);
1218     _mm_storeu_si128((__m128i *) (&pi2_out[6 * out_strd]), temp7);
1219     _mm_storeu_si128((__m128i *) (&pi2_out[7 * out_strd]), temp8);
1220 }
1221 
1222 /*****************************************************************************/
1223 /*                                                                           */
1224 /*  Function Name : isvcd_iquant_itrans_chroma_4x4_sse42                      */
1225 /*                                                                           */
1226 /*  Description   : this function computes the inverse quantized and         */
1227 /*                   inverse transformed output                              */
1228 /*                                                                           */
1229 /*  Inputs        :                                                          */
1230 /*  Globals       : none                                                     */
1231 /*  Processing    :                                                          */
1232 /*                                                                           */
1233 /*  Outputs       : none                                                     */
1234 /*  Returns       : none                                                     */
1235 /*                                                                           */
1236 /*  Issues        : none                                                     */
1237 /*                                                                           */
1238 /*  Revision History:                                                        */
1239 /*                                                                           */
1240 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1241 /*         25 11 2021   Kishore               creation                       */
1242 /*                                                                           */
1243 /*****************************************************************************/
1244 
1245 void isvcd_iquant_itrans_chroma_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
1246                                           const UWORD16 *pu2_iscal_mat,
1247                                           const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
1248                                           WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
1249 {
1250     __m128i src_r0_r1, src_r2_r3;
1251     __m128i src_r0, src_r1, src_r2, src_r3;
1252     __m128i scalemat_r0_r1, scalemat_r2_r3;
1253     __m128i dequant_r0_r1, dequant_r2_r3;
1254     __m128i zero_8x16b = _mm_setzero_si128();  // all bits reset to zero
1255     __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1256     __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1257     __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1258     __m128i value_32 = _mm_set1_epi32(32);
1259     __m128i dupmax_4x32b = _mm_set1_epi32(RSD_MAX);
1260     __m128i dupmin_4x32b = _mm_set1_epi32(RSD_MIN);
1261 
1262     __m128i chroma_mask_even =
1263         _mm_set_epi16(0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff);
1264     __m128i chroma_mask_odd =
1265         _mm_set_epi16(0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000);
1266 
1267     UNUSED(pi2_tmp);
1268 
1269     /*************************************************************/
1270     /* Dequantization of coefficients, done here with SSE4.2    */
1271     /* integer SIMD operations                                   */
1272     /*************************************************************/
1273     // a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
1274     src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1275     // a20 a21 a22 a23 a30 a31 a32 a33 --the source matrix 2nd,3rd row
1276     src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1277     // b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
1278     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1279     // b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
1280     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1281     // q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
1282     dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1283     // q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits
1284     dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1285     // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
1286     temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
1287     // b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31 b32*q32 b33*q33 -- 16 bit result
1288     temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1289     // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
1290     temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1291     // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
1292     temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1293     // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long
1294     temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1295     // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long
1296     temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1297 
1298     src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b);  // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
1299     src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b);  // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
1300     src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b);  // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
1301     src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b);  // a30 0 a31 0 a32 0 a33 0 -- 16 bit long
1302 
1303     // a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30*q30 -- 32 bits long
1304     temp4 = _mm_madd_epi16(src_r0, temp4);
1305     temp5 = _mm_madd_epi16(src_r1, temp5);
1306     temp6 = _mm_madd_epi16(src_r2, temp6);
1307     temp7 = _mm_madd_epi16(src_r3, temp7);
1308 
1309     if(u4_qp_div_6 >= 4)
1310     {
1311         resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1312         resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1313         resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1314         resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1315     }
1316     else
1317     {
1318         temp4 = _mm_add_epi32(temp4, add_rshift);
1319         temp5 = _mm_add_epi32(temp5, add_rshift);
1320         temp6 = _mm_add_epi32(temp6, add_rshift);
1321         temp7 = _mm_add_epi32(temp7, add_rshift);
1322         resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1323         resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1324         resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1325         resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1326     }
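
    /* In scalar form the scaling above is: shift the 32-bit product left when
     * u4_qp_div_6 >= 4, otherwise round with the add_rshift constant prepared
     * earlier and shift right. Illustrative sketch (helper name illustrative,
     * kept out of the build):
     */
#if 0
    static WORD32 isvcd_iquant_scale_sketch(WORD32 prod, UWORD32 u4_qp_div_6)
    {
        if(u4_qp_div_6 >= 4)
            return prod << (u4_qp_div_6 - 4);
        return (prod + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6);
    }
#endif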
1327 
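    /* The chroma DC coefficient is decoded and dequantized separately       */
    /* (chroma DC goes through its own 2x2 transform), so slot 0 of the      */
    /* dequantized block is overwritten with the value from pi2_dc_src.      */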
1328     resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1329     /* Perform Inverse transform */
1330     /*-------------------------------------------------------------*/
1331     /* IDCT [ Horizontal transformation ]                          */
1332     /*-------------------------------------------------------------*/
1333     // Matrix transpose
1334     /*
1335      *  a0 a1 a2 a3
1336      *  b0 b1 b2 b3
1337      *  c0 c1 c2 c3
1338      *  d0 d1 d2 d3
1339      */
1340     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 b0 a1 b1
1341     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // c0 d0 c1 d1
1342     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // a2 b2 a3 b3
1343     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 d2 c3 d3
1344     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 b0 c0 d0
1345     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // a1 b1 c1 d1
1346     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // a2 b2 c2 d2
1347     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // a3 b3 c3 d3
1348     // Transform starts -- horizontal transform
1349     /*------------------------------------------------------------------*/
1350     /* z0 = w0 + w2                                             */
1351     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1352     /* z1 = w0 - w2                                             */
1353     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1354     /* z2 = (w1 >> 1) - w3                                      */
1355     temp2 = _mm_srai_epi32(resq_r1, 1);     //(w1>>1)
1356     temp2 = _mm_sub_epi32(temp2, resq_r3);  //(w1>>1) - w3
1357     /* z3 = w1 + (w3 >> 1)                                      */
1358     temp3 = _mm_srai_epi32(resq_r3, 1);     //(w3>>1)
1359     temp3 = _mm_add_epi32(temp3, resq_r1);  //(w3>>1) + w1
1360     /*----------------------------------------------------------*/
1361     /* x0 = z0 + z3                                             */
1362     resq_r0 = _mm_add_epi32(temp0, temp3);
1363     /* x1 = z1 + z2                                             */
1364     resq_r1 = _mm_add_epi32(temp1, temp2);
1365     /* x2 = z1 - z2                                             */
1366     resq_r2 = _mm_sub_epi32(temp1, temp2);
1367     /* x3 = z0 - z3                                             */
1368     resq_r3 = _mm_sub_epi32(temp0, temp3);
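
    /* The four-point butterfly above (and its vertical twin below) is the
     * H.264 4x4 inverse core transform; in scalar form, per row or column
     * (variable names illustrative, kept out of the build):
     */
#if 0
    {
        WORD32 z0 = w0 + w2; /* w0..w3: the four dequantized inputs */
        WORD32 z1 = w0 - w2;
        WORD32 z2 = (w1 >> 1) - w3;
        WORD32 z3 = w1 + (w3 >> 1);
        WORD32 x0 = z0 + z3;
        WORD32 x1 = z1 + z2;
        WORD32 x2 = z1 - z2;
        WORD32 x3 = z0 - z3;
    }
#endif
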
1369     // Matrix transpose
1370     /*
1371      *  a0 b0 c0 d0
1372      *  a1 b1 c1 d1
1373      *  a2 b2 c2 d2
1374      *  a3 b3 c3 d3
1375      */
1376     temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1);  // a0 a1 b0 b1
1377     temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3);  // a2 a3 b2 b3
1378     temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1);  // c0 c1 d0 d1
1379     temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3);  // c2 c3 d2 d3
1380     resq_r0 = _mm_unpacklo_epi64(temp1, temp3);    // a0 a1 a2 a3
1381     resq_r1 = _mm_unpackhi_epi64(temp1, temp3);    // b0 b1 b2 b3
1382     resq_r2 = _mm_unpacklo_epi64(temp2, temp4);    // c0 c1 c2 c3
1383     resq_r3 = _mm_unpackhi_epi64(temp2, temp4);    // d0 d1 d2 d3
1384     // Transform ends -- horizontal transform
1385 
1386     /*--------------------------------------------------------------*/
1387     /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
1388     /* Add the prediction and store it back to same buffer          */
1389     /*--------------------------------------------------------------*/
1390     /* z0j = y0j + y2j                                                        */
1391     temp0 = _mm_add_epi32(resq_r0, resq_r2);
1392     /* z1j = y0j - y2j                                                        */
1393     temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1394     /* z2j = (y1j>>1) - y3j */
1395     temp2 = _mm_srai_epi32(resq_r1, 1);  //(y1j>>1)
1396     temp2 = _mm_sub_epi32(temp2, resq_r3);
1397     /* z3j = y1j + (y3j>>1) */
1398     temp3 = _mm_srai_epi32(resq_r3, 1);  //(y3j>>1)
1399     temp3 = _mm_add_epi32(temp3, resq_r1);
1400 
1401     /* x0j = z0j + z3j                                                        */
1402     temp4 = _mm_add_epi32(temp0, temp3);
1403     temp4 = _mm_add_epi32(temp4, value_32);
1404     temp4 = _mm_srai_epi32(temp4, 6);
1405     temp4 = _mm_min_epi32(dupmax_4x32b, temp4);
1406     temp4 = _mm_max_epi32(dupmin_4x32b, temp4);
1407 
1408     /* x1j = z1j + z2j                                                        */
1409     temp5 = _mm_add_epi32(temp1, temp2);
1410     temp5 = _mm_add_epi32(temp5, value_32);
1411     temp5 = _mm_srai_epi32(temp5, 6);
1412     temp5 = _mm_min_epi32(dupmax_4x32b, temp5);
1413     temp5 = _mm_max_epi32(dupmin_4x32b, temp5);
1414 
1415     /* x2j = z1j - z2j                                                        */
1416     temp6 = _mm_sub_epi32(temp1, temp2);
1417     temp6 = _mm_add_epi32(temp6, value_32);
1418     temp6 = _mm_srai_epi32(temp6, 6);
1419     temp6 = _mm_min_epi32(dupmax_4x32b, temp6);
1420     temp6 = _mm_max_epi32(dupmin_4x32b, temp6);
1421 
1422     /* x3j = z0j - z3j                                                        */
1423     temp7 = _mm_sub_epi32(temp0, temp3);
1424     temp7 = _mm_add_epi32(temp7, value_32);
1425     temp7 = _mm_srai_epi32(temp7, 6);
1426     temp7 = _mm_min_epi32(dupmax_4x32b, temp7);
1427     temp7 = _mm_max_epi32(dupmin_4x32b, temp7);
1428 
1429     /* No explicit 32-bit to 16-bit pack is needed here: the clamped      */
1430     /* results in temp4..temp7 are consumed directly below, where their   */
1431     /* low 16 bits are masked into the even chroma positions.             */
1437 
1438     // row 0 of the output buffer: 8 interleaved chroma samples
1439     src_r0 = _mm_loadu_si128((__m128i *) (pi2_out));
1440     // row 1 of the output buffer
1441     src_r1 = _mm_loadu_si128((__m128i *) (pi2_out + (1 * out_strd)));
1442 
1443     src_r2 = _mm_loadu_si128((__m128i *) (pi2_out + (2 * out_strd)));
1444     src_r3 = _mm_loadu_si128((__m128i *) (pi2_out + (3 * out_strd)));
1445 
1446     resq_r0 = _mm_and_si128(temp4, chroma_mask_even);  // res0 0 res1 0 res2 0 res3 0
1447     resq_r1 = _mm_and_si128(temp5, chroma_mask_even);
1448     resq_r2 = _mm_and_si128(temp6, chroma_mask_even);
1449     resq_r3 = _mm_and_si128(temp7, chroma_mask_even);
1450 
1451     src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
1452     src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
1453     src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
1454     src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);
1455 
1456     src_r0 = _mm_add_epi16(src_r0, resq_r0);  // res0 src1 res1 src2 ... -- interleaved
1457     src_r1 = _mm_add_epi16(src_r1, resq_r1);
1458     src_r2 = _mm_add_epi16(src_r2, resq_r2);
1459     src_r3 = _mm_add_epi16(src_r3, resq_r3);
1460 
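    /* Chroma samples are stored interleaved (the two planes alternate), so
     * the four residuals land in the even 16-bit positions of each output
     * row while the odd positions keep their previous contents. A scalar
     * sketch of one row (out_row/res are illustrative names, kept out of
     * the build):
     */
#if 0
    {
        WORD32 i;
        for(i = 0; i < 4; i++)
        {
            out_row[2 * i] = (WORD16) res[i]; /* odd positions untouched */
        }
    }
#endif
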
1461     _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0);
1462     _mm_storeu_si128((__m128i *) (&pi2_out[out_strd]), src_r1);
1463     _mm_storeu_si128((__m128i *) (&pi2_out[2 * out_strd]), src_r2);
1464     _mm_storeu_si128((__m128i *) (&pi2_out[3 * out_strd]), src_r3);
1465 }
1466