1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * isvcd_iquant_itrans_sse42.c
24 *
25 * @brief
26 * Contains function definitions for inverse quantization, inverse
27 * transform
28 *
29 * @author
30 * Kishore
31 *
32 * @par List of Functions:
33 * - isvcd_iquant_itrans_4x4_sse42()
34 * - isvcd_iquant_itrans_chroma_4x4_sse42()
35 * - isvcd_iquant_itrans_8x8_dc_sse42()
36 * - isvcd_iquant_itrans_4x4_dc_sse42()
37 * - isvcd_iquant_itrans_chroma_4x4_dc_sse42()
38 * - isvcd_iquant_itrans_8x8_sse42()
39 *
40 * @remarks
41 * None
42 *
43 *******************************************************************************
44 */
45 /* User include files */
46 #include <immintrin.h>
47 #include "ih264_typedefs.h"
48 #include "ih264_defs.h"
49 #include "ih264_trans_macros.h"
50 #include "ih264_macros.h"
51 #include "ih264_platform_macros.h"
52 #include "ih264_trans_data.h"
53 #include "ih264_size_defs.h"
54 #include "ih264_structs.h"
55 #include "isvcd_iquant_itrans.h"
56
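/* Overview (reader aid; it summarises the kernels below rather than adding
 * behaviour): each function inverse-quantizes 4x4 or 8x8 WORD16 coefficients
 * from pi2_src using the scaling matrix (pu2_iscal_mat / pu2_iscale_mat), the
 * weighting matrix (pu2_weigh_mat) and the QP-derived shift, applies the
 * corresponding inverse transform, rounds with (x + 32) >> 6, clamps to
 * [RSD_MIN, RSD_MAX] and writes 16-bit residuals to pi2_out with stride
 * out_strd. No prediction is added in these SVC residual kernels. A rough
 * scalar sketch of the per-coefficient dequantization (not the exact
 * INV_QUANT macro expansion):
 *
 *     q = pi2_src[i];
 *     INV_QUANT(q, pu2_iscal_mat[i], pu2_weigh_mat[i], u4_qp_div_6, rnd_fact, 4);
 *     ...inverse transform..., residual = clamp((x + 32) >> 6, RSD_MIN, RSD_MAX)
 */
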
57 /*****************************************************************************/
58 /* */
59 /* Function Name : isvcd_iquant_itrans_4x4_dc_sse42 */
60 /* */
61 /* Description : this function computes the inverse quantized and */
62 /* inverse transformed output */
63 /* */
64 /* Inputs : */
65 /* Globals : none */
66 /* Processing : */
67 /* */
68 /* Outputs : none */
69 /* Returns : none */
70 /* */
71 /* Issues : none */
72 /* */
73 /* Revision History: */
74 /* */
75 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
76 /* 25 11 2021 Kishore creation */
77 /* */
78 /*****************************************************************************/
79
void isvcd_iquant_itrans_4x4_dc_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                      const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                      UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                      WORD16 *pi2_dc_ld_addr)
84 {
85 WORD32 q0;
86 WORD16 rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
87 __m128i dupmax_8x16b = _mm_set1_epi16(RSD_MAX);
88 __m128i dupmin_8x16b = _mm_set1_epi16(RSD_MIN);
89 __m128i i_macro;
90 UNUSED(pi2_tmp);
91
92 if(iq_start_idx == 0)
93 {
94 q0 = pi2_src[0];
95 INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
96 }
97 else
98 {
        q0 = pi2_dc_ld_addr[0]; // Restoring DC value for the intra case
100 }
101
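    /* DC-only block: the inverse transform of a block whose single non-zero
       coefficient is the DC term is a constant, so (q0 + 32) >> 6 is broadcast,
       clamped to [RSD_MIN, RSD_MAX] and written to all four 4-sample rows. */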
102 i_macro = _mm_set1_epi16((q0 + 32) >> 6);
103 i_macro = _mm_min_epi16(dupmax_8x16b, i_macro);
104 i_macro = _mm_max_epi16(dupmin_8x16b, i_macro);
105
106 _mm_storel_epi64((__m128i *) pi2_out, i_macro);
107 pi2_out += out_strd;
108 _mm_storel_epi64((__m128i *) pi2_out, i_macro);
109 pi2_out += out_strd;
110 _mm_storel_epi64((__m128i *) pi2_out, i_macro);
111 pi2_out += out_strd;
112 _mm_storel_epi64((__m128i *) pi2_out, i_macro);
113 }
114
115 /*****************************************************************************/
116 /* */
117 /* Function Name : isvcd_iquant_itrans_8x8_dc_sse42 */
118 /* */
119 /* Description : this function computes the inverse quantized and */
120 /* inverse transformed output */
121 /* */
122 /* Inputs : */
123 /* Globals : none */
124 /* Processing : */
125 /* */
126 /* Outputs : none */
127 /* Returns : none */
128 /* */
129 /* Issues : none */
130 /* */
131 /* Revision History: */
132 /* */
133 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
134 /* 25 11 2021 Kishore creation */
135 /* */
136 /*****************************************************************************/
137
void isvcd_iquant_itrans_8x8_dc_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                      const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
                                      UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                      WORD16 *pi2_dc_ld_addr)
142 {
143 WORD32 q;
144 WORD32 rnd_fact = (qp_div < 6) ? (1 << (5 - qp_div)) : 0;
145 __m128i dupmin_8x16b, dupmax_8x16b, i_macro;
146
147 UNUSED(pi2_tmp);
148 UNUSED(iq_start_idx);
149 UNUSED(pi2_dc_ld_addr);
150
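    /* DC-only 8x8 block: dequantize the lone DC coefficient, round with
       (q + 32) >> 6, clamp to [RSD_MIN, RSD_MAX] and replicate the result
       across all eight 8-sample rows of the residual block. */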
151 q = pi2_src[0];
152 INV_QUANT(q, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
153
154 i_macro = _mm_set1_epi16((q + 32) >> 6);
155 dupmax_8x16b = _mm_set1_epi16(RSD_MAX);
156 dupmin_8x16b = _mm_set1_epi16(RSD_MIN);
157
158 i_macro = _mm_min_epi16(dupmax_8x16b, i_macro);
159 i_macro = _mm_max_epi16(dupmin_8x16b, i_macro);
160
161 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
162 pi2_out += out_strd;
163 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
164 pi2_out += out_strd;
165 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
166 pi2_out += out_strd;
167 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
168 pi2_out += out_strd;
169 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
170 pi2_out += out_strd;
171 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
172 pi2_out += out_strd;
173 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
174 pi2_out += out_strd;
175 _mm_storeu_si128((__m128i *) pi2_out, i_macro);
176 }
177 /*****************************************************************************/
178 /* */
179 /* Function Name : isvcd_iquant_itrans_chroma_4x4_dc_sse42 */
180 /* */
181 /* Description : this function computes the inverse quantized and */
182 /* inverse transformed output */
183 /* */
184 /* Inputs : */
185 /* Globals : none */
186 /* Processing : */
187 /* */
188 /* Outputs : none */
189 /* Returns : none */
190 /* */
191 /* Issues : none */
192 /* */
193 /* Revision History: */
194 /* */
195 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
196 /* 25 11 2021 Kishore creation */
197 /* */
198 /*****************************************************************************/
199
void isvcd_iquant_itrans_chroma_4x4_dc_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                             const UWORD16 *pu2_iscal_mat,
                                             const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
                                             WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
204 {
205 __m128i src_r0, src_r1, src_r2, src_r3;
206 __m128i i_macro = _mm_set1_epi16((pi2_dc_src[0] + 32) >> 6);
207 __m128i chroma_mask_even, chroma_mask_odd;
208 __m128i dupmax_8x16b = _mm_set1_epi16(RSD_MAX);
209 __m128i dupmin_8x16b = _mm_set1_epi16(RSD_MIN);
210
211 UNUSED(pi2_src);
212 UNUSED(pu2_iscal_mat);
213 UNUSED(pu2_weigh_mat);
214 UNUSED(pi2_tmp);
215 UNUSED(u4_qp_div_6);
216
217 i_macro = _mm_min_epi16(dupmax_8x16b, i_macro);
218 i_macro = _mm_max_epi16(dupmin_8x16b, i_macro);
219
220 // a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
221 src_r0 = _mm_loadu_si128((__m128i *) (pi2_out));
222 // a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
223 src_r1 = _mm_loadu_si128((__m128i *) (pi2_out + (1 * out_strd)));
224 src_r2 = _mm_loadu_si128((__m128i *) (pi2_out + (2 * out_strd)));
225 src_r3 = _mm_loadu_si128((__m128i *) (pi2_out + (3 * out_strd)));
226
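    /* src_r0..src_r3 hold the current contents of the (apparently interleaved)
       chroma residual buffer; the even/odd 16-bit masks below keep the
       co-located samples of the other chroma plane untouched while the DC
       residual is merged into the lanes of the plane being processed. */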
227 chroma_mask_even =
228 _mm_set_epi16(0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff);
229 chroma_mask_odd = _mm_set_epi16(0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000);
230
231 src_r0 = _mm_and_si128(src_r0, chroma_mask_odd); // 0 src1 0 src2 0 ...
232 src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
233 src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
234 src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);
235
236 i_macro = _mm_and_si128(i_macro, chroma_mask_even); // macro 0 macro 0 ..
237
238 src_r0 = _mm_add_epi16(src_r0, i_macro); // macro src1 macro src2 macro ...
239 src_r1 = _mm_add_epi16(src_r1, i_macro);
240 src_r2 = _mm_add_epi16(src_r2, i_macro);
241 src_r3 = _mm_add_epi16(src_r3, i_macro);
242
243 _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0);
244 _mm_storeu_si128((__m128i *) (&pi2_out[out_strd]), src_r1);
245 _mm_storeu_si128((__m128i *) (&pi2_out[2 * out_strd]), src_r2);
246 _mm_storeu_si128((__m128i *) (&pi2_out[3 * out_strd]), src_r3);
247 }
248 /*****************************************************************************/
249 /* */
250 /* Function Name : isvcd_iquant_itrans_4x4_sse42 */
251 /* */
252 /* Description : this function computes the inverse quantized and */
253 /* inverse transformed output */
254 /* */
255 /* Inputs : */
256 /* Globals : none */
257 /* Processing : */
258 /* */
259 /* Outputs : none */
260 /* Returns : none */
261 /* */
262 /* Issues : none */
263 /* */
264 /* Revision History: */
265 /* */
266 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
267 /* 25 11 2021 Kishore creation */
268 /* */
269 /*****************************************************************************/
270
void isvcd_iquant_itrans_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                   const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
                                   UWORD32 u4_qp_div_6, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                   WORD16 *pi2_dc_ld_addr)
275 {
276 __m128i src_r0_r1, src_r2_r3;
277 __m128i src_r0, src_r1, src_r2, src_r3;
278 __m128i scalemat_r0_r1, scalemat_r2_r3;
279 __m128i dequant_r0_r1, dequant_r2_r3;
280 __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
281 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
282 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
283 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
284 __m128i value_32 = _mm_set1_epi32(32);
285 __m128i dupmax_4x32b = _mm_set1_epi32(RSD_MAX);
286 __m128i dupmin_4x32b = _mm_set1_epi32(RSD_MIN);
287 UNUSED(pi2_tmp);
288
    /*************************************************************/
    /* Dequantization of coefficients, done here with SSE4.2     */
    /* SIMD operations                                           */
    /*************************************************************/
293 // a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
294 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
295 // a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
296 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
297 // b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
298 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
299 // b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
300 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
301 // q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
302 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
303 // q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits
304 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
305
306 // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
307 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
    // b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31 b32*q32 b33*q33 -- 16 bit result
309 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
310
311 // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
312 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
313 // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
314 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
    // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long
316 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
    // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long
318 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
319
320 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
321 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
322 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
323 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long
324
    // a00*b00*q00 a01*b01*q01 a02*b02*q02 a03*b03*q03 -- 32 bits long
326 temp4 = _mm_madd_epi16(src_r0, temp4);
327 temp5 = _mm_madd_epi16(src_r1, temp5);
328 temp6 = _mm_madd_epi16(src_r2, temp6);
329 temp7 = _mm_madd_epi16(src_r3, temp7);
330
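    /* Dequantization shift: for u4_qp_div_6 >= 4 the 32-bit products are scaled
       up by a left shift of (qp/6 - 4); otherwise they are rounded with
       add_rshift and arithmetically shifted right by (4 - qp/6). */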
331 if(u4_qp_div_6 >= 4)
332 {
333 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
334 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
335 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
336 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
337 }
338 else
339 {
340 temp4 = _mm_add_epi32(temp4, add_rshift);
341 temp5 = _mm_add_epi32(temp5, add_rshift);
342 temp6 = _mm_add_epi32(temp6, add_rshift);
343 temp7 = _mm_add_epi32(temp7, add_rshift);
344 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
345 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
346 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
347 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
348 }
349
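    /* For intra blocks (iq_start_idx == 1) the DC coefficient has already been
       handled separately (presumably by the DC/Hadamard pass), so the value
       preloaded at pi2_dc_ld_addr overrides lane 0 of the dequantized block. */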
350 if(iq_start_idx == 1) resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_ld_addr[0], 0);
351 /* Perform Inverse transform */
352 /*-------------------------------------------------------------*/
353 /* IDCT [ Horizontal transformation ] */
354 /*-------------------------------------------------------------*/
355 // Matrix transpose
356 /*
357 * a0 a1 a2 a3
358 * b0 b1 b2 b3
359 * c0 c1 c2 c3
360 * d0 d1 d2 d3
361 */
362 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); // a0 b0 a1 b1
363 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); // c0 d0 c1 d1
364 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); // a2 b2 a3 b3
365 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); // c2 d2 c3 d3
366 resq_r0 = _mm_unpacklo_epi64(temp1, temp3); // a0 b0 c0 d0
367 resq_r1 = _mm_unpackhi_epi64(temp1, temp3); // a1 b1 c1 d1
368 resq_r2 = _mm_unpacklo_epi64(temp2, temp4); // a2 b2 c2 d2
369 resq_r3 = _mm_unpackhi_epi64(temp2, temp4); // a3 b3 c3 d3
370 // Transform starts -- horizontal transform
371 /*------------------------------------------------------------------*/
372 /* z0 = w0 + w2 */
373 temp0 = _mm_add_epi32(resq_r0, resq_r2);
374 /* z1 = w0 - w2 */
375 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
376 /* z2 = (w1 >> 1) - w3 */
377 temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1)
378 temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3
379 /* z3 = w1 + (w3 >> 1) */
    temp3 = _mm_srai_epi32(resq_r3, 1);    //(w3>>1)
    temp3 = _mm_add_epi32(temp3, resq_r1); //(w3>>1) + w1
382 /*----------------------------------------------------------*/
383 /* x0 = z0 + z3 */
384 resq_r0 = _mm_add_epi32(temp0, temp3);
385 /* x1 = z1 + z2 */
386 resq_r1 = _mm_add_epi32(temp1, temp2);
387 /* x2 = z1 - z2 */
388 resq_r2 = _mm_sub_epi32(temp1, temp2);
389 /* x3 = z0 - z3 */
390 resq_r3 = _mm_sub_epi32(temp0, temp3);
391 // Matrix transpose
392 /*
393 * a0 b0 c0 d0
394 * a1 b1 c1 d1
395 * a2 b2 c2 d2
396 * a3 b3 c3 d3
397 */
398 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); // a0 a1 b0 b1
399 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); // a2 a3 b2 b3
400 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); // c0 c1 d0 d1
401 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); // c2 c3 d2 d3
402 resq_r0 = _mm_unpacklo_epi64(temp1, temp3); // a0 a1 a2 a3
403 resq_r1 = _mm_unpackhi_epi64(temp1, temp3); // b0 b1 b2 b3
404 resq_r2 = _mm_unpacklo_epi64(temp2, temp4); // c0 c1 c2 c3
405 resq_r3 = _mm_unpackhi_epi64(temp2, temp4); // d0 d1 d2 d3
406 // Transform ends -- horizontal transform
407
    /*--------------------------------------------------------------*/
    /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6      */
    /*                                                              */
    /* Clamp the residual and store it to the output buffer         */
    /*--------------------------------------------------------------*/
413 /* z0j = y0j + y2j */
414 temp0 = _mm_add_epi32(resq_r0, resq_r2);
415 /* z1j = y0j - y2j */
416 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
417 /* z2j = (y1j>>1) - y3j */
418 temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1)
419 temp2 = _mm_sub_epi32(temp2, resq_r3);
420 /* z3j = y1j + (y3j>>1) */
421 temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1)
422 temp3 = _mm_add_epi32(temp3, resq_r1);
423
424 /* x0j = z0j + z3j */
425 temp4 = _mm_add_epi32(temp0, temp3);
426 temp4 = _mm_add_epi32(temp4, value_32);
427 temp4 = _mm_srai_epi32(temp4, 6);
428 temp4 = _mm_min_epi32(dupmax_4x32b, temp4);
429 temp4 = _mm_max_epi32(dupmin_4x32b, temp4);
430
431 /* x1j = z1j + z2j */
432 temp5 = _mm_add_epi32(temp1, temp2);
433 temp5 = _mm_add_epi32(temp5, value_32);
434 temp5 = _mm_srai_epi32(temp5, 6);
435 temp5 = _mm_min_epi32(dupmax_4x32b, temp5);
436 temp5 = _mm_max_epi32(dupmin_4x32b, temp5);
437
438 /* x2j = z1j - z2j */
439 temp6 = _mm_sub_epi32(temp1, temp2);
440 temp6 = _mm_add_epi32(temp6, value_32);
441 temp6 = _mm_srai_epi32(temp6, 6);
442 temp6 = _mm_min_epi32(dupmax_4x32b, temp6);
443 temp6 = _mm_max_epi32(dupmin_4x32b, temp6);
444
445 /* x3j = z0j - z3j */
446 temp7 = _mm_sub_epi32(temp0, temp3);
447 temp7 = _mm_add_epi32(temp7, value_32);
448 temp7 = _mm_srai_epi32(temp7, 6);
449 temp7 = _mm_min_epi32(dupmax_4x32b, temp7);
450 temp7 = _mm_max_epi32(dupmin_4x32b, temp7);
451
452 // 32-bit to 16-bit conversion
453 temp0 = _mm_packs_epi32(temp4, temp5);
454 temp1 = _mm_packs_epi32(temp6, temp7);
455
456 resq_r0 = temp0;
457 resq_r1 = _mm_srli_si128(temp0, 8);
458 resq_r2 = temp1;
459 resq_r3 = _mm_srli_si128(temp1, 8);
460
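    /* Write the four clamped 4-sample residual rows (low 64 bits of each
       register) to the strided 16-bit output. */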
461 _mm_storel_epi64((__m128i *) pi2_out, resq_r0);
462 pi2_out += out_strd;
463 _mm_storel_epi64((__m128i *) pi2_out, resq_r1);
464 pi2_out += out_strd;
465 _mm_storel_epi64((__m128i *) pi2_out, resq_r2);
466 pi2_out += out_strd;
467 _mm_storel_epi64((__m128i *) pi2_out, resq_r3);
468 }
469
470 /*****************************************************************************/
471 /* */
472 /* Function Name : isvcd_iquant_itrans_8x8_sse42 */
473 /* */
474 /* Description : this function computes the inverse quantized and */
475 /* inverse transformed output */
476 /* */
477 /* Inputs : */
478 /* Globals : none */
479 /* Processing : */
480 /* */
481 /* Outputs : none */
482 /* Returns : none */
483 /* */
484 /* Issues : none */
485 /* */
486 /* Revision History: */
487 /* */
488 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
489 /* 25 11 2021 Kishore creation */
490 /* */
491 /*****************************************************************************/
492
void isvcd_iquant_itrans_8x8_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
                                   const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat,
                                   UWORD32 qp_div, WORD16 *pi2_tmp, WORD32 iq_start_idx,
                                   WORD16 *pi2_dc_ld_addr)
497 {
498 __m128i src_r0;
499 __m128i scalemat_r0;
500 __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
501 __m128i value_32 = _mm_set1_epi32(32);
502 __m128i add_rshift = _mm_set1_epi32((qp_div < 6) ? (1 << (5 - qp_div)) : 0);
503 __m128i dequant_r0;
504 __m128i sign_reg;
505 __m128i src_r0_1, src_r0_2;
506 __m128i scalemat_r0_1, scalemat_r0_2;
507 __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
508 __m128i temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18, temp19, temp20;
509 // To store dequantization results
510 __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2, resq_r3_1, resq_r3_2,
511 resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2, resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2;
512 __m128i dupmax_4x32b = _mm_set1_epi32(RSD_MAX);
513 __m128i dupmin_4x32b = _mm_set1_epi32(RSD_MIN);
514
515 UNUSED(pi2_tmp);
516 UNUSED(iq_start_idx);
517 UNUSED(pi2_dc_ld_addr);
518
    /*************************************************************/
    /* Dequantization of coefficients, done here with SSE4.2     */
    /* SIMD operations. Note : DC coeff is not scaled            */
    /*************************************************************/
523
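    /* Each of the eight rows is dequantized with the same pattern: interleave
       the source samples and the scale*weight products with zeros, form 32-bit
       products with _mm_madd_epi16, apply the QP-dependent shift, then pack
       back to 16 bits with saturation. */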
524 // Row 0 processing
525 // a00 a01 a02 a03 a04 a05 a06 a07 -- the source matrix 0th row
526 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src));
527 // b00 b01 b02 b03 b04 b05 b06 b07 -- the scaling matrix 0th row
528 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat));
529 dequant_r0 =
530 _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[0])); // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
531 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
532 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
533 // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
534 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
535 // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
536 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
537 // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
538 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
539
540 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
541 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
542 // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
543 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
544
545 if(qp_div >= 6)
546 {
547 resq_r0_1 = _mm_slli_epi32(temp5, qp_div - 6);
548 resq_r0_2 = _mm_slli_epi32(temp7, qp_div - 6);
549 }
550 else
551 {
552 temp5 = _mm_add_epi32(temp5, add_rshift);
553 temp7 = _mm_add_epi32(temp7, add_rshift);
554 resq_r0_1 = _mm_srai_epi32(temp5, 6 - qp_div);
555 resq_r0_2 = _mm_srai_epi32(temp7, 6 - qp_div);
556 }
557 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 16
558 // bit long
559 resq_r0_1 = _mm_packs_epi32(resq_r0_1, resq_r0_2);
560 // Row 1 processing
561 // a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 1st row
562 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
563 // b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 1st row
564 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 8));
565 // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
566 dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[8]));
567 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
568 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
569 // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
570 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
571 // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
572 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
573 // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
574 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
575 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
576 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
577 // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
578 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
579 if(qp_div >= 6)
580 {
581 resq_r1_1 = _mm_slli_epi32(temp5, qp_div - 6);
582 resq_r1_2 = _mm_slli_epi32(temp7, qp_div - 6);
583 }
584 else
585 {
586 temp5 = _mm_add_epi32(temp5, add_rshift);
587 temp7 = _mm_add_epi32(temp7, add_rshift);
588 resq_r1_1 = _mm_srai_epi32(temp5, 6 - qp_div);
589 resq_r1_2 = _mm_srai_epi32(temp7, 6 - qp_div);
590 }
591 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
592 resq_r1_1 = _mm_packs_epi32(resq_r1_1, resq_r1_2);
593 // Row 2 processing
594 // a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 2nd row
595 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 16));
596 // b00 b01 b02 b03 b04 b05 b06 b07 b08-- the scaling matrix 2nd row
597 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 16));
598 // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
599 dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[16]));
600 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
601 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
602 // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
603 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
604 // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
605 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
606 // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
607 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
608 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
609 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
610 // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
611 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
612 if(qp_div >= 6)
613 {
614 resq_r2_1 = _mm_slli_epi32(temp5, qp_div - 6);
615 resq_r2_2 = _mm_slli_epi32(temp7, qp_div - 6);
616 }
617 else
618 {
619 temp5 = _mm_add_epi32(temp5, add_rshift);
620 temp7 = _mm_add_epi32(temp7, add_rshift);
621 resq_r2_1 = _mm_srai_epi32(temp5, 6 - qp_div);
622 resq_r2_2 = _mm_srai_epi32(temp7, 6 - qp_div);
623 }
624 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
625 resq_r2_1 = _mm_packs_epi32(resq_r2_1, resq_r2_2);
626 // Row 3 processing
627 // a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 3rd row
628 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 24));
629 // b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 3rd row
630 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 24));
631 // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
632 dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[24]));
633 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
634 // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
635 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b);
636 // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
637 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
638 // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
639 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
640 // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
641 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
642 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 - 32 bits long
643 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
644 // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
645 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
646 if(qp_div >= 6)
647 {
648 resq_r3_1 = _mm_slli_epi32(temp5, qp_div - 6);
649 resq_r3_2 = _mm_slli_epi32(temp7, qp_div - 6);
650 }
651 else
652 {
653 temp5 = _mm_add_epi32(temp5, add_rshift);
654 temp7 = _mm_add_epi32(temp7, add_rshift);
655 resq_r3_1 = _mm_srai_epi32(temp5, 6 - qp_div);
656 resq_r3_2 = _mm_srai_epi32(temp7, 6 - qp_div);
657 }
658 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
659 resq_r3_1 = _mm_packs_epi32(resq_r3_1, resq_r3_2);
660 // Row 4 processing
661 // a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 4th row
662 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 32));
663 // b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 4th row
664 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 32));
665 dequant_r0 = _mm_loadu_si128(
666 (__m128i *) (&pu2_weigh_mat[32])); // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
667 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
668 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
669 // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
670 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
671 // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
672 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
673 // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
674 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
675 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
676 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
677 // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
678 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
679 if(qp_div >= 6)
680 {
681 resq_r4_1 = _mm_slli_epi32(temp5, qp_div - 6);
682 resq_r4_2 = _mm_slli_epi32(temp7, qp_div - 6);
683 }
684 else
685 {
686 temp5 = _mm_add_epi32(temp5, add_rshift);
687 temp7 = _mm_add_epi32(temp7, add_rshift);
688 resq_r4_1 = _mm_srai_epi32(temp5, 6 - qp_div);
689 resq_r4_2 = _mm_srai_epi32(temp7, 6 - qp_div);
690 }
691 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7
692 resq_r4_1 = _mm_packs_epi32(resq_r4_1, resq_r4_2);
693 // Row 5 processing
694 // a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 5th row
695 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 40));
696 // b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 5th row
697 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 40));
698 // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
699 dequant_r0 = _mm_loadu_si128((__m128i *) (&pu2_weigh_mat[40]));
700 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
701 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
702 // b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result
703 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
704 // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
705 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
706 // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
707 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
708 // a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long
709 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
710 // a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long
711 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
712 if(qp_div >= 6)
713 {
714 resq_r5_1 = _mm_slli_epi32(temp5, qp_div - 6);
715 resq_r5_2 = _mm_slli_epi32(temp7, qp_div - 6);
716 }
717 else
718 {
719 temp5 = _mm_add_epi32(temp5, add_rshift);
720 temp7 = _mm_add_epi32(temp7, add_rshift);
721 resq_r5_1 = _mm_srai_epi32(temp5, 6 - qp_div);
722 resq_r5_2 = _mm_srai_epi32(temp7, 6 - qp_div);
723 }
724 /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 */
725 resq_r5_1 = _mm_packs_epi32(resq_r5_1, resq_r5_2);
726 // Row 6 processing
727 /* a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 6th row */
728 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 48));
729 /* b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 6th row */
730 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 48));
731 dequant_r0 = _mm_loadu_si128(
732 (__m128i *) (&pu2_weigh_mat[48])); // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
733 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
734 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
735 /* b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result */
736 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
737 // b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long
738 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
739 // b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long
740 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
741 /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long */
742 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
743 /* a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long */
744 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
745 if(qp_div >= 6)
746 {
747 resq_r6_1 = _mm_slli_epi32(temp5, qp_div - 6);
748 resq_r6_2 = _mm_slli_epi32(temp7, qp_div - 6);
749 }
750 else
751 {
752 temp5 = _mm_add_epi32(temp5, add_rshift);
753 temp7 = _mm_add_epi32(temp7, add_rshift);
754 resq_r6_1 = _mm_srai_epi32(temp5, 6 - qp_div);
755 resq_r6_2 = _mm_srai_epi32(temp7, 6 - qp_div);
756 }
757 /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 */
758 resq_r6_1 = _mm_packs_epi32(resq_r6_1, resq_r6_2);
759 // Row 7 processing
760 /* a00 a01 a02 a03 a04 a05 a06 a07 a08 -- the source matrix 7th row */
761 src_r0 = _mm_loadu_si128((__m128i *) (pi2_src + 56));
762 /* b00 b01 b02 b03 b04 b05 b06 b07 b08 -- the scaling matrix 7th row */
763 scalemat_r0 = _mm_loadu_si128((__m128i *) (pu2_iscale_mat + 56));
764 dequant_r0 = _mm_loadu_si128(
765 (__m128i *) (&pu2_weigh_mat[56])); // q0 q1 q2 q3 q4 q5 q6 q7 -- all 16 bits
766 src_r0_1 = _mm_unpacklo_epi16(src_r0, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
767 src_r0_2 = _mm_unpackhi_epi16(src_r0, zero_8x16b); // a04 0 a05 0 a06 0 a07 0 -- 16 bit long
768
769 /* b00*q0 b01*q1 b02*q2 b03*q3 b04*q4 b05*q5 b06*q6 b07*q7 -- 16 bit result */
770 temp10 = _mm_mullo_epi16(scalemat_r0, dequant_r0);
771 /* b00*q0 0 b01*q1 0 b02*q2 0 b03*q3 0 -- 16 bit long */
772 scalemat_r0_1 = _mm_unpacklo_epi16(temp10, zero_8x16b);
773 /* b04*q4 0 b05*q5 0 b06*q6 0 b07*q7 0 -- 16 bit long */
774 scalemat_r0_2 = _mm_unpackhi_epi16(temp10, zero_8x16b);
775 /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 -- 32 bits long */
776 temp5 = _mm_madd_epi16(src_r0_1, scalemat_r0_1);
777 /* a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 -- 32 bits long */
778 temp7 = _mm_madd_epi16(src_r0_2, scalemat_r0_2);
779 if(qp_div >= 6)
780 {
781 resq_r7_1 = _mm_slli_epi32(temp5, qp_div - 6);
782 resq_r7_2 = _mm_slli_epi32(temp7, qp_div - 6);
783 }
784 else
785 {
786 temp5 = _mm_add_epi32(temp5, add_rshift);
787 temp7 = _mm_add_epi32(temp7, add_rshift);
788 resq_r7_1 = _mm_srai_epi32(temp5, 6 - qp_div);
789 resq_r7_2 = _mm_srai_epi32(temp7, 6 - qp_div);
790 }
791 /* a00*b00*q0 a01*b01*q1 a02*b02*q2 a03*b03*q3 a04*b04*q4 a05*b05*q5 a06*b06*q6 a07*b07*q7 */
792 resq_r7_1 = _mm_packs_epi32(resq_r7_1, resq_r7_2);
793 /* Perform Inverse transform */
794 /*--------------------------------------------------------------------*/
795 /* IDCT [ Horizontal transformation ] */
796 /*--------------------------------------------------------------------*/
797 // Matrix transpose
798 /*
799 * a0 a1 a2 a3 a4 a5 a6 a7
800 * b0 b1 b2 b3 b4 b5 b6 b7
801 * c0 c1 c2 c3 c4 c5 c6 c7
802 * d0 d1 d2 d3 d4 d5 d6 d7
803 */
804 temp1 = _mm_unpacklo_epi16(resq_r0_1, resq_r1_1); // a0 b0 a1 b1 a2 b2 a3 b3
805 temp3 = _mm_unpacklo_epi16(resq_r2_1, resq_r3_1); // c0 d0 c1 d1 c2 d2 c3 d3
806 temp2 = _mm_unpackhi_epi16(resq_r0_1, resq_r1_1); // a4 b4 a5 b5 a6 b6 a7 b7
807 temp4 = _mm_unpackhi_epi16(resq_r2_1, resq_r3_1); // c4 d4 c5 d5 c6 d6 c7 d7
808 resq_r0_1 = _mm_unpacklo_epi32(temp1, temp3); // a0 b0 c0 d0 a1 b1 c1 d1
809 resq_r1_1 = _mm_unpackhi_epi32(temp1, temp3); // a2 b2 c2 d2 a3 b3 c3 d3
810 resq_r2_1 = _mm_unpacklo_epi32(temp2, temp4); // a4 b4 c4 d4 a5 b5 c5 d5
811 resq_r3_1 = _mm_unpackhi_epi32(temp2, temp4); // a6 b6 c6 d6 a7 b7 c7 d7
812 /*
813 * e0 e1 e2 e3 e4 e5 e6 e7
814 * f0 f1 f2 f3 f4 f5 f6 f7
815 * g0 g1 g2 g3 g4 g5 g6 g7
816 * h0 h1 h2 h3 h4 h5 h6 h7
817 */
    temp1 = _mm_unpacklo_epi16(resq_r4_1, resq_r5_1);  // e0 f0 e1 f1 e2 f2 e3 f3
819 temp3 = _mm_unpacklo_epi16(resq_r6_1, resq_r7_1); // g0 h0 g1 h1 g2 h2 g3 h3
820 temp2 = _mm_unpackhi_epi16(resq_r4_1, resq_r5_1); // e4 f4 e5 f5 e6 f6 e7 f7
821 temp4 = _mm_unpackhi_epi16(resq_r6_1, resq_r7_1); // g4 h4 g5 h5 g6 h6 g7 h7
822 resq_r4_1 = _mm_unpacklo_epi32(temp1, temp3); // e0 f0 g0 h0 e1 f1 g1 h1
823 resq_r5_1 = _mm_unpackhi_epi32(temp1, temp3); // e2 f2 g2 h2 e3 f3 g3 h3
824 resq_r6_1 = _mm_unpacklo_epi32(temp2, temp4); // e4 f4 g4 h4 e5 f5 g5 h5
825 resq_r7_1 = _mm_unpackhi_epi32(temp2, temp4); // e6 f6 g6 h6 e7 f7 g7 h7
826 /*
827 * a0 b0 c0 d0 a1 b1 c1 d1
828 * a2 b2 c2 d2 a3 b3 c3 d3
829 * a4 b4 c4 d4 a5 b5 c5 d5
830 * a6 b6 c6 d6 a7 b7 c7 d7
831 * e0 f0 g0 h0 e1 f1 g1 h1
832 * e2 f2 g2 h2 e3 f3 g3 h3
833 * e4 f4 g4 h4 e5 f5 g5 h5
834 * e6 f6 g6 h6 e7 f7 g7 h7
835 */
836 resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1); // a0 b0 c0 d0 e0 f0 g0 h0
837 resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1); // a1 b1 c1 d1 e1 f1 g1 h1
838 resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1); // a2 b2 c2 d2 e2 f2 g2 h2
839 resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1); // a3 b3 c3 d3 e3 f3 g3 h3
840 resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1); // a4 b4 c4 d4 e4 f4 g4 h4
841 resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1); // a5 b5 c5 d5 e5 f5 g5 h5
842 resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1); // a6 b6 c6 d6 e6 f6 g6 h6
843 resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1); // a7 b7 c7 d7 e7 f7 g7 h7
844
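    /* Rows 1, 3, 5 and 7 feed the multi-term odd part of the 8x8 butterfly, so
       they are sign-extended to 32 bits below (compare-against-zero + unpack),
       presumably to guard against 16-bit overflow; the even-row terms remain
       in 16 bits. */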
845 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
846 resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg); // a1 b1 c1 d1 -- 32 bit
847 resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg); // e1 f1 g1 h1 -- 32 bit
848 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
849 resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg); // a3 b3 c3 d3 -- 32 bit
850 resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg); // e3 f3 g3 h3 -- 32 bit
851 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
852 resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg); // a5 b5 c5 d5 -- 32 bit
853 resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg); // e5 f5 g5 h5 -- 32 bit
854 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
855 resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg); // a7 b7 c7 d7 -- 32 bit
856 resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg); // e7 f7 g7 h7 -- 32 bit
857 // Transform starts -- horizontal transform
858 /*------------------------------------------------------------------*/
859 /* y0 = w0 + w4 */
860 temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
861 /* y2 = w0 - w4 */
862 temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
863 /* y1 = -w3 + w5 - w7 - (w7 >> 1) */
864 temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1); //-w3+w5
865 temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
866 temp4 = _mm_sub_epi32(temp2, resq_r7_1); //-w3+w5-w7
867 temp12 = _mm_sub_epi32(temp10, resq_r7_2);
868 temp5 = _mm_srai_epi32(resq_r7_1, 1); // w7>>1
869 temp13 = _mm_srai_epi32(resq_r7_2, 1);
870 temp2 = _mm_sub_epi32(temp4, temp5); //-w3+w5-w7 -(w7>>1)
871 temp10 = _mm_sub_epi32(temp12, temp13);
872 temp2 = _mm_packs_epi32(temp2, temp10);
873 /* y3 = w1 + w7 - w3 - (w3 >> 1) */
874 temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1); // w1+w7
875 temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
876 temp4 = _mm_sub_epi32(temp4, resq_r3_1); // w1+w7-w3
877 temp12 = _mm_sub_epi32(temp12, resq_r3_2);
878 temp5 = _mm_srai_epi32(resq_r3_1, 1); // w3>>1
879 temp13 = _mm_srai_epi32(resq_r3_2, 1);
880 temp4 = _mm_sub_epi32(temp4, temp5); // w1+w7-w3-(w3>>1)
881 temp12 = _mm_sub_epi32(temp12, temp13);
882 temp4 = _mm_packs_epi32(temp4, temp12);
883 /* y4 = (w2 >> 1) - w6 */
884 temp5 = _mm_srai_epi16(resq_r2_2, 1); // w2>>1
885 temp5 = _mm_sub_epi16(temp5, resq_r6_2); //(w2>>1)-w6
886 /* y5 = -w1 + w7 + w5 + (w5 >> 1) */
887 temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1); // w7-w1
888 temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
889 temp6 = _mm_add_epi32(temp6, resq_r5_1); // w7-w1+w5
890 temp14 = _mm_add_epi32(temp14, resq_r5_2);
891 temp7 = _mm_srai_epi32(resq_r5_1, 1); // w5>>1
892 temp15 = _mm_srai_epi32(resq_r5_2, 1);
    temp6 = _mm_add_epi32(temp6, temp7);  // w7-w1+w5+(w5>>1)
894 temp14 = _mm_add_epi32(temp14, temp15);
895 temp6 = _mm_packs_epi32(temp6, temp14);
896 /* y6 = w2 + (w6 >> 1) */
897 temp7 = _mm_srai_epi16(resq_r6_2, 1); // w6>>1
898 temp7 = _mm_add_epi16(temp7, resq_r2_2); //(w6>>1)+w2
899 /* y7 = w3 + w5 + w1 + (w1 >> 1) */
900 temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1); // w3+w5
901 temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
902 temp8 = _mm_add_epi32(temp8, resq_r1_1); // w3+w5+w1
903 temp16 = _mm_add_epi32(temp16, resq_r1_2);
904 temp17 = _mm_srai_epi32(resq_r1_1, 1); // w1>>1
905 temp18 = _mm_srai_epi32(resq_r1_2, 1);
906 temp8 = _mm_add_epi32(temp8, temp17); // w3+w5+w1+(w1>>1)
907 temp16 = _mm_add_epi32(temp16, temp18);
908 temp8 = _mm_packs_epi32(temp8, temp16);
909 /*------------------------------------------------------------------*/
910 /*------------------------------------------------------------------*/
911 /* z0 = y0 + y6 */
912 resq_r0_1 = _mm_add_epi16(temp1, temp7);
913 /* z1 = y1 + (y7 >> 2) */
914 resq_r1_1 = _mm_srai_epi16(temp8, 2);
915 resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
916 /* z2 = y2 + y4 */
917 resq_r2_1 = _mm_add_epi16(temp3, temp5);
918 /* z3 = y3 + (y5 >> 2) */
919 resq_r3_1 = _mm_srai_epi16(temp6, 2);
920 resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
921 /* z4 = y2 - y4 */
922 resq_r4_1 = _mm_sub_epi16(temp3, temp5);
923 /* z5 = (y3 >> 2) - y5 */
924 resq_r5_1 = _mm_srai_epi16(temp4, 2);
925 resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
926 /* z6 = y0 - y6 */
927 resq_r6_1 = _mm_sub_epi16(temp1, temp7);
928 /* z7 = y7 - (y1 >> 2) */
929 resq_r7_1 = _mm_srai_epi16(temp2, 2);
930 resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
931 /*------------------------------------------------------------------*/
932 /*------------------------------------------------------------------*/
933 /* x0 = z0 + z7 */
934 temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
935 /* x1 = z2 + z5 */
936 temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
937 /* x2 = z4 + z3 */
938 temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
939 /* x3 = z6 + z1 */
940 temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
941 /* x4 = z6 - z1 */
942 temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
943 /* x5 = z4 - z3 */
944 temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
945 /* x6 = z2 - z5 */
946 temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
947 /* x7 = z0 - z7 */
948 temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
949 /*------------------------------------------------------------------*/
950 // Matrix transpose
951 /*
952 * a0 b0 c0 d0 e0 f0 g0 h0
953 * a1 b1 c1 d1 e1 f1 g1 h1
954 * a2 b2 c2 d2 e2 f2 g2 h2
955 * a3 b3 c3 d3 e3 f3 g3 h3
956 */
957 temp17 = _mm_unpacklo_epi16(temp1, temp2); // a0 a1 b0 b1 c0 c1 d0 d1
958 temp19 = _mm_unpacklo_epi16(temp3, temp4); // a2 a3 b2 b3 c2 c3 d2 d3
959 temp18 = _mm_unpackhi_epi16(temp1, temp2); // e0 e1 f0 f1 g0 g1 h0 h1
960 temp20 = _mm_unpackhi_epi16(temp3, temp4); // e2 e3 f2 f3 g2 g3 h2 h3
961
962 resq_r0_1 = _mm_unpacklo_epi32(temp17, temp19); // a0 a1 a2 a3 b0 b1 b2 b3
963 resq_r1_1 = _mm_unpackhi_epi32(temp17, temp19); // c0 c1 c2 c3 d0 d1 d2 d3
964 resq_r2_1 = _mm_unpacklo_epi32(temp18, temp20); // e0 e1 e2 e3 f0 f1 f2 f3
    resq_r3_1 = _mm_unpackhi_epi32(temp18, temp20);  // g0 g1 g2 g3 h0 h1 h2 h3
966 /*
967 * a4 b4 c4 d4 e4 f4 g4 h4
968 * a5 b5 c5 d5 e5 f5 g5 h5
969 * a6 b6 c6 d6 e6 f6 g6 h6
970 * a7 b7 c7 d7 e7 f7 g7 h7
971 */
972 temp17 = _mm_unpacklo_epi16(temp5, temp6); // a4 a5 b4 b5 c4 c5 d4 d5
973 temp19 = _mm_unpacklo_epi16(temp7, temp8); // a6 a7 b6 b7 c6 c7 d6 d7
974 temp18 = _mm_unpackhi_epi16(temp5, temp6); // e4 e5 f4 f5 g4 g5 h4 h5
975 temp20 = _mm_unpackhi_epi16(temp7, temp8); // e6 e7 f6 f7 g6 g7 h6 h7
976
977 resq_r4_1 = _mm_unpacklo_epi32(temp17, temp19); // a4 a5 a6 a7 b4 b5 b6 b7
978 resq_r5_1 = _mm_unpackhi_epi32(temp17, temp19); // c4 c5 c6 c7 d4 d5 d6 d7
979 resq_r6_1 = _mm_unpacklo_epi32(temp18, temp20); // e4 e5 e6 e7 f4 f5 f6 f7
980 resq_r7_1 = _mm_unpackhi_epi32(temp18, temp20); // g4 g5 g6 g7 h4 h5 h6 h7
981 /* a0 a1 a2 a3 b0 b1 b2 b3
982 * c0 c1 c2 c3 d0 d1 d2 d3
983 * e0 e1 e2 e3 f0 f1 f2 f3
     * g0 g1 g2 g3 h0 h1 h2 h3
985 * a4 a5 a6 a7 b4 b5 b6 b7
986 * c4 c5 c6 c7 d4 d5 d6 d7
987 * e4 e5 e6 e7 f4 f5 f6 f7
988 * g4 g5 g6 g7 h4 h5 h6 h7
989 */
990 resq_r0_2 = _mm_unpacklo_epi64(resq_r0_1, resq_r4_1); // a0 a1 a2 a3 a4 a5 a6 a7
991 resq_r1_2 = _mm_unpackhi_epi64(resq_r0_1, resq_r4_1); // b0 b1 b2 b3 b4 b5 b6 b7
992 resq_r2_2 = _mm_unpacklo_epi64(resq_r1_1, resq_r5_1); // c0 c1 c2 c3 c4 c5 c6 c7
993 resq_r3_2 = _mm_unpackhi_epi64(resq_r1_1, resq_r5_1); // d0 d1 d2 d3 d4 d5 d6 d7
994 resq_r4_2 = _mm_unpacklo_epi64(resq_r2_1, resq_r6_1); // e0 e1 e2 e3 e4 e5 e6 e7
995 resq_r5_2 = _mm_unpackhi_epi64(resq_r2_1, resq_r6_1); // f0 f1 f2 f3 f4 f5 f6 f7
996 resq_r6_2 = _mm_unpacklo_epi64(resq_r3_1, resq_r7_1); // g0 g1 g2 g3 g4 g5 g6 g7
997 resq_r7_2 = _mm_unpackhi_epi64(resq_r3_1, resq_r7_1); // h0 h1 h2 h3 h4 h5 h6 h7
998
999 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r1_2);
1000 resq_r1_1 = _mm_unpacklo_epi16(resq_r1_2, sign_reg); // a1 b1 c1 d1 -- 32 bit
1001 resq_r1_2 = _mm_unpackhi_epi16(resq_r1_2, sign_reg); // e1 f1 g1 h1 -- 32 bit
1002 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r3_2);
1003 resq_r3_1 = _mm_unpacklo_epi16(resq_r3_2, sign_reg); // a3 b3 c3 d3 -- 32 bit
1004 resq_r3_2 = _mm_unpackhi_epi16(resq_r3_2, sign_reg); // e3 f3 g3 h3 -- 32 bit
1005 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r5_2);
1006 resq_r5_1 = _mm_unpacklo_epi16(resq_r5_2, sign_reg); // a5 b5 c5 d5 -- 32 bit
1007 resq_r5_2 = _mm_unpackhi_epi16(resq_r5_2, sign_reg); // e5 f5 g5 h5 -- 32 bit
1008 sign_reg = _mm_cmpgt_epi16(zero_8x16b, resq_r7_2);
1009 resq_r7_1 = _mm_unpacklo_epi16(resq_r7_2, sign_reg); // a7 b7 c7 d7 -- 32 bit
1010 resq_r7_2 = _mm_unpackhi_epi16(resq_r7_2, sign_reg); // e7 f7 g7 h7 -- 32 bit
1011
1012 /*--------------------------------------------------------------------*/
1013 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
1014 /* */
1015
1016 /* y0j = w0j + w4j */
1017 temp1 = _mm_add_epi16(resq_r0_2, resq_r4_2);
1018 /* y2j = w0j - w4j */
1019 temp3 = _mm_sub_epi16(resq_r0_2, resq_r4_2);
1020 /* y1j = -w3j + w5j - w7j - (w7j >> 1) */
1021 temp2 = _mm_sub_epi32(resq_r5_1, resq_r3_1); //-w3+w5
1022 temp10 = _mm_sub_epi32(resq_r5_2, resq_r3_2);
1023 temp4 = _mm_sub_epi32(temp2, resq_r7_1); //-w3+w5-w7
1024 temp12 = _mm_sub_epi32(temp10, resq_r7_2);
1025 temp5 = _mm_srai_epi32(resq_r7_1, 1); // w7>>1
1026 temp13 = _mm_srai_epi32(resq_r7_2, 1);
1027 temp2 = _mm_sub_epi32(temp4, temp5); //-w3+w5-w7 -(w7>>1)
1028 temp10 = _mm_sub_epi32(temp12, temp13);
1029 temp2 = _mm_packs_epi32(temp2, temp10);
1030 /* y3j = w1j + w7j - w3j - (w3j >> 1) */
1031 temp4 = _mm_add_epi32(resq_r1_1, resq_r7_1); // w1+w7
1032 temp12 = _mm_add_epi32(resq_r1_2, resq_r7_2);
1033 temp4 = _mm_sub_epi32(temp4, resq_r3_1); // w1+w7-w3
1034 temp12 = _mm_sub_epi32(temp12, resq_r3_2);
1035 temp5 = _mm_srai_epi32(resq_r3_1, 1); // w3>>1
1036 temp13 = _mm_srai_epi32(resq_r3_2, 1);
1037 temp4 = _mm_sub_epi32(temp4, temp5); // w1+w7-w3-(w3>>1)
1038 temp12 = _mm_sub_epi32(temp12, temp13);
1039 temp4 = _mm_packs_epi32(temp4, temp12);
1040 /* y4j = (w2j >> 1) - w6j */
1041 temp5 = _mm_srai_epi16(resq_r2_2, 1); // w2>>1
1042 temp5 = _mm_sub_epi16(temp5, resq_r6_2); //(w2>>1)-w6
1043 /* y5j = -w1j + w7j + w5j + (w5j >> 1) */
1044 temp6 = _mm_sub_epi32(resq_r7_1, resq_r1_1); // w7-w1
1045 temp14 = _mm_sub_epi32(resq_r7_2, resq_r1_2);
1046 temp6 = _mm_add_epi32(temp6, resq_r5_1); // w7-w1+w5
1047 temp14 = _mm_add_epi32(temp14, resq_r5_2);
1048 temp7 = _mm_srai_epi32(resq_r5_1, 1); // w5>>1
1049 temp15 = _mm_srai_epi32(resq_r5_2, 1);
    temp6 = _mm_add_epi32(temp6, temp7);  // w7-w1+w5+(w5>>1)
1051 temp14 = _mm_add_epi32(temp14, temp15);
1052 temp6 = _mm_packs_epi32(temp6, temp14);
1053 /* y6j = w2j + (w6j >> 1) */
1054 temp7 = _mm_srai_epi16(resq_r6_2, 1); // w6>>1
1055 temp7 = _mm_add_epi16(temp7, resq_r2_2); //(w6>>1)+w2
1056 /* y7j = w3j + w5j + w1j + (w1j >> 1) */
1057 temp8 = _mm_add_epi32(resq_r3_1, resq_r5_1); // w3+w5
1058 temp16 = _mm_add_epi32(resq_r3_2, resq_r5_2);
1059 temp8 = _mm_add_epi32(temp8, resq_r1_1); // w3+w5+w1
1060 temp16 = _mm_add_epi32(temp16, resq_r1_2);
1061 temp17 = _mm_srai_epi32(resq_r1_1, 1); // w1>>1
1062 temp18 = _mm_srai_epi32(resq_r1_2, 1);
1063 temp8 = _mm_add_epi32(temp8, temp17); // w3+w5+w1+(w1>>1)
1064 temp16 = _mm_add_epi32(temp16, temp18);
1065 temp8 = _mm_packs_epi32(temp8, temp16);
1066 /*------------------------------------------------------------------*/
1067 /*------------------------------------------------------------------*/
1068 /* z0j = y0j + y6j */
1069 resq_r0_1 = _mm_add_epi16(temp1, temp7);
1070 /* z1j = y1j + (y7j >> 2) */
1071 resq_r1_1 = _mm_srai_epi16(temp8, 2);
1072 resq_r1_1 = _mm_add_epi16(resq_r1_1, temp2);
1073 /* z2j = y2j + y4j */
1074 resq_r2_1 = _mm_add_epi16(temp3, temp5);
1075 /* z3j = y3j + (y5j >> 2) */
1076 resq_r3_1 = _mm_srai_epi16(temp6, 2);
1077 resq_r3_1 = _mm_add_epi16(resq_r3_1, temp4);
1078 /* z4j = y2j - y4j */
1079 resq_r4_1 = _mm_sub_epi16(temp3, temp5);
1080 /* z5j = (y3j >> 2) - y5j */
1081 resq_r5_1 = _mm_srai_epi16(temp4, 2);
1082 resq_r5_1 = _mm_sub_epi16(resq_r5_1, temp6);
1083 /* z6j = y0j - y6j */
1084 resq_r6_1 = _mm_sub_epi16(temp1, temp7);
1085 /* z7j = y7j - (y1j >> 2) */
1086 resq_r7_1 = _mm_srai_epi16(temp2, 2);
1087 resq_r7_1 = _mm_sub_epi16(temp8, resq_r7_1);
1088 /*------------------------------------------------------------------*/
1089
1090 /*------------------------------------------------------------------*/
1091 /* x0j = z0j + z7j */
1092 temp1 = _mm_add_epi16(resq_r0_1, resq_r7_1);
1093 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp1);
1094 temp10 = _mm_unpacklo_epi16(temp1, sign_reg);
1095 temp11 = _mm_unpackhi_epi16(temp1, sign_reg);
1096 temp10 = _mm_add_epi32(temp10, value_32);
1097 temp11 = _mm_add_epi32(temp11, value_32);
1098 temp10 = _mm_srai_epi32(temp10, 6);
1099 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1100 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1101 temp11 = _mm_srai_epi32(temp11, 6);
1102 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1103 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1104 temp10 = _mm_packs_epi32(temp10, temp11);
1105 temp1 = temp10; //_mm_add_epi16(temp10, pred_r0_1);
1106 /* x1j = z2j + z5j */
1107 temp2 = _mm_add_epi16(resq_r2_1, resq_r5_1);
1108 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp2);
1109 temp10 = _mm_unpacklo_epi16(temp2, sign_reg);
1110 temp11 = _mm_unpackhi_epi16(temp2, sign_reg);
1111 temp10 = _mm_add_epi32(temp10, value_32);
1112 temp11 = _mm_add_epi32(temp11, value_32);
1113 temp10 = _mm_srai_epi32(temp10, 6);
1114 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1115 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1116 temp11 = _mm_srai_epi32(temp11, 6);
1117 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1118 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1119 temp10 = _mm_packs_epi32(temp10, temp11);
1120 temp2 = temp10; //_mm_add_epi16(temp10, pred_r1_1);
1121 /* x2j = z4j + z3j */
1122 temp3 = _mm_add_epi16(resq_r4_1, resq_r3_1);
1123 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp3);
1124 temp10 = _mm_unpacklo_epi16(temp3, sign_reg);
1125 temp11 = _mm_unpackhi_epi16(temp3, sign_reg);
1126 temp10 = _mm_add_epi32(temp10, value_32);
1127 temp11 = _mm_add_epi32(temp11, value_32);
1128 temp10 = _mm_srai_epi32(temp10, 6);
1129 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1130 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1131 temp11 = _mm_srai_epi32(temp11, 6);
1132 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1133 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1134 temp10 = _mm_packs_epi32(temp10, temp11);
1135 temp3 = temp10; //_mm_add_epi16(temp10, pred_r2_1);
1136 /* x3j = z6j + z1j */
1137 temp4 = _mm_add_epi16(resq_r6_1, resq_r1_1);
1138 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp4);
1139 temp10 = _mm_unpacklo_epi16(temp4, sign_reg);
1140 temp11 = _mm_unpackhi_epi16(temp4, sign_reg);
1141 temp10 = _mm_add_epi32(temp10, value_32);
1142 temp11 = _mm_add_epi32(temp11, value_32);
1143 temp10 = _mm_srai_epi32(temp10, 6);
1144 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1145 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1146 temp11 = _mm_srai_epi32(temp11, 6);
1147 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1148 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1149 temp10 = _mm_packs_epi32(temp10, temp11);
1150 temp4 = temp10; //_mm_add_epi16(temp10, pred_r3_1);
1151 /* x4j = z6j - z1j */
1152 temp5 = _mm_sub_epi16(resq_r6_1, resq_r1_1);
1153 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp5);
1154 temp10 = _mm_unpacklo_epi16(temp5, sign_reg);
1155 temp11 = _mm_unpackhi_epi16(temp5, sign_reg);
1156 temp10 = _mm_add_epi32(temp10, value_32);
1157 temp11 = _mm_add_epi32(temp11, value_32);
1158 temp10 = _mm_srai_epi32(temp10, 6);
1159 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1160 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1161 temp11 = _mm_srai_epi32(temp11, 6);
1162 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1163 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1164 temp10 = _mm_packs_epi32(temp10, temp11);
1165 temp5 = temp10; //_mm_add_epi16(temp10, pred_r4_1);
1166 /* x5j = z4j - z3j */
1167 temp6 = _mm_sub_epi16(resq_r4_1, resq_r3_1);
1168 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp6);
1169 temp10 = _mm_unpacklo_epi16(temp6, sign_reg);
1170 temp11 = _mm_unpackhi_epi16(temp6, sign_reg);
1171 temp10 = _mm_add_epi32(temp10, value_32);
1172 temp11 = _mm_add_epi32(temp11, value_32);
1173 temp10 = _mm_srai_epi32(temp10, 6);
1174 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1175 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1176 temp11 = _mm_srai_epi32(temp11, 6);
1177 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1178 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1179 temp10 = _mm_packs_epi32(temp10, temp11);
1180 temp6 = temp10; //_mm_add_epi16(temp10, pred_r5_1);
1181 /* x6j = z2j - z5j */
1182 temp7 = _mm_sub_epi16(resq_r2_1, resq_r5_1);
1183 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp7);
1184 temp10 = _mm_unpacklo_epi16(temp7, sign_reg);
1185 temp11 = _mm_unpackhi_epi16(temp7, sign_reg);
1186 temp10 = _mm_add_epi32(temp10, value_32);
1187 temp11 = _mm_add_epi32(temp11, value_32);
1188 temp10 = _mm_srai_epi32(temp10, 6);
1189 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1190 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1191 temp11 = _mm_srai_epi32(temp11, 6);
1192 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1193 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1194 temp10 = _mm_packs_epi32(temp10, temp11);
1195 temp7 = temp10; //_mm_add_epi16(temp10, pred_r6_1);
1196 /* x7j = z0j - z7j */
1197 temp8 = _mm_sub_epi16(resq_r0_1, resq_r7_1);
1198 sign_reg = _mm_cmpgt_epi16(zero_8x16b, temp8);
1199 temp10 = _mm_unpacklo_epi16(temp8, sign_reg);
1200 temp11 = _mm_unpackhi_epi16(temp8, sign_reg);
1201 temp10 = _mm_add_epi32(temp10, value_32);
1202 temp11 = _mm_add_epi32(temp11, value_32);
1203 temp10 = _mm_srai_epi32(temp10, 6);
1204 temp10 = _mm_min_epi32(dupmax_4x32b, temp10);
1205 temp10 = _mm_max_epi32(dupmin_4x32b, temp10);
1206 temp11 = _mm_srai_epi32(temp11, 6);
1207 temp11 = _mm_min_epi32(dupmax_4x32b, temp11);
1208 temp11 = _mm_max_epi32(dupmin_4x32b, temp11);
1209 temp10 = _mm_packs_epi32(temp10, temp11);
1210 temp8 = temp10; //_mm_add_epi16(temp10, pred_r7_1);
1211
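    /* Store the eight clamped 16-bit residual rows to the strided output; the
       commented-out pred_rX_1 additions above appear to mark where prediction
       would be added in a reconstructing variant of this kernel. */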
1212 _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp1);
1213 _mm_storeu_si128((__m128i *) (&pi2_out[out_strd]), temp2);
1214 _mm_storeu_si128((__m128i *) (&pi2_out[2 * out_strd]), temp3);
1215 _mm_storeu_si128((__m128i *) (&pi2_out[3 * out_strd]), temp4);
1216 _mm_storeu_si128((__m128i *) (&pi2_out[4 * out_strd]), temp5);
1217 _mm_storeu_si128((__m128i *) (&pi2_out[5 * out_strd]), temp6);
1218 _mm_storeu_si128((__m128i *) (&pi2_out[6 * out_strd]), temp7);
1219 _mm_storeu_si128((__m128i *) (&pi2_out[7 * out_strd]), temp8);
1220 }
1221
1222 /*****************************************************************************/
1223 /* */
1224 /* Function Name : isvcd_iquant_itrans_chroma_4x4_sse42 */
1225 /* */
1226 /* Description : this function computes the inverse quantized and */
1227 /* inverse transformed output */
1228 /* */
1229 /* Inputs : */
1230 /* Globals : none */
1231 /* Processing : */
1232 /* */
1233 /* Outputs : none */
1234 /* Returns : none */
1235 /* */
1236 /* Issues : none */
1237 /* */
1238 /* Revision History: */
1239 /* */
1240 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1241 /* 25 11 2021 Kishore creation */
1242 /* */
1243 /*****************************************************************************/
1244
1245 void isvcd_iquant_itrans_chroma_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_out, WORD32 out_strd,
1246 const UWORD16 *pu2_iscal_mat,
1247 const UWORD16 *pu2_weigh_mat, UWORD32 u4_qp_div_6,
1248 WORD16 *pi2_tmp, WORD16 *pi2_dc_src)
1249 {
1250 __m128i src_r0_r1, src_r2_r3;
1251 __m128i src_r0, src_r1, src_r2, src_r3;
1252 __m128i scalemat_r0_r1, scalemat_r2_r3;
1253 __m128i dequant_r0_r1, dequant_r2_r3;
1254 __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
1255 __m128i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1256 __m128i resq_r0, resq_r1, resq_r2, resq_r3;
1257 __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
1258 __m128i value_32 = _mm_set1_epi32(32);
1259 __m128i dupmax_4x32b = _mm_set1_epi32(RSD_MAX);
1260 __m128i dupmin_4x32b = _mm_set1_epi32(RSD_MIN);
1261
1262 __m128i chroma_mask_even =
1263 _mm_set_epi16(0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff);
1264 __m128i chroma_mask_odd =
1265 _mm_set_epi16(0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000);
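    /* pi2_out holds interleaved chroma samples in alternate 16-bit lanes: the
       even-lane mask keeps the residuals computed by this call, while the
       odd-lane mask preserves the co-located samples of the other chroma plane
       already present in the output buffer. */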
1266
1267 UNUSED(pi2_tmp);
1268
1269 /*************************************************************/
1270     /* Dequantization of coefficients, implemented below with   */
1271     /* SSE4.2 integer SIMD operations                            */
1272 /*************************************************************/
1273 // a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
1274 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));
1275 // a20 a21 a22 a23 a30 a31 a32 a33 --the source matrix 2nd,3rd row
1276 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));
1277 // b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
1278 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat));
1279 // b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
1280 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_iscal_mat + 8));
1281 // q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
1282 dequant_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat));
1283 // q20 q21 q22 q23 q30 q31 q32 q33 -- all 16 bits
1284 dequant_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_weigh_mat + 8));
1285 // b00*q00 b01*q01 b02*q02 b03*q03 b10*q10 b11*q11 b12*q12 b13*q13 -- 16 bit result
1286 temp0 = _mm_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
1287     // b20*q20 b21*q21 b22*q22 b23*q23 b30*q30 b31*q31 b32*q32 b33*q33 -- 16 bit result
1288 temp1 = _mm_mullo_epi16(scalemat_r2_r3, dequant_r2_r3);
1289 // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
1290 temp4 = _mm_unpacklo_epi16(temp0, zero_8x16b);
1291 // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
1292 temp5 = _mm_unpackhi_epi16(temp0, zero_8x16b);
1293     // b20*q20 0 b21*q21 0 b22*q22 0 b23*q23 0 -- 16 bit long
1294 temp6 = _mm_unpacklo_epi16(temp1, zero_8x16b);
1295     // b30*q30 0 b31*q31 0 b32*q32 0 b33*q33 0 -- 16 bit long
1296 temp7 = _mm_unpackhi_epi16(temp1, zero_8x16b);
1297
1298 src_r0 = _mm_unpacklo_epi16(src_r0_r1, zero_8x16b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
1299 src_r1 = _mm_unpackhi_epi16(src_r0_r1, zero_8x16b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
1300 src_r2 = _mm_unpacklo_epi16(src_r2_r3, zero_8x16b); // a20 0 a21 0 a22 0 a23 0 -- 16 bit long
1301 src_r3 = _mm_unpackhi_epi16(src_r2_r3, zero_8x16b); // a30 0 a31 0 a32 0 a33 0 -- 16 bit long
1302
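    /* The operands were interleaved with zeros above so that _mm_madd_epi16
       yields one full 32-bit product per coefficient (src * iscal * weight)
       with no intermediate 16-bit overflow. */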
1303     // a00*b00*q00 a01*b01*q01 a02*b02*q02 a03*b03*q03 -- 32 bits long
1304 temp4 = _mm_madd_epi16(src_r0, temp4);
1305 temp5 = _mm_madd_epi16(src_r1, temp5);
1306 temp6 = _mm_madd_epi16(src_r2, temp6);
1307 temp7 = _mm_madd_epi16(src_r3, temp7);
1308
1309 if(u4_qp_div_6 >= 4)
1310 {
1311 resq_r0 = _mm_slli_epi32(temp4, u4_qp_div_6 - 4);
1312 resq_r1 = _mm_slli_epi32(temp5, u4_qp_div_6 - 4);
1313 resq_r2 = _mm_slli_epi32(temp6, u4_qp_div_6 - 4);
1314 resq_r3 = _mm_slli_epi32(temp7, u4_qp_div_6 - 4);
1315 }
1316 else
1317 {
1318 temp4 = _mm_add_epi32(temp4, add_rshift);
1319 temp5 = _mm_add_epi32(temp5, add_rshift);
1320 temp6 = _mm_add_epi32(temp6, add_rshift);
1321 temp7 = _mm_add_epi32(temp7, add_rshift);
1322 resq_r0 = _mm_srai_epi32(temp4, 4 - u4_qp_div_6);
1323 resq_r1 = _mm_srai_epi32(temp5, 4 - u4_qp_div_6);
1324 resq_r2 = _mm_srai_epi32(temp6, 4 - u4_qp_div_6);
1325 resq_r3 = _mm_srai_epi32(temp7, 4 - u4_qp_div_6);
1326 }
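    /* resq_r0-resq_r3 now hold the dequantized rows; per coefficient this is
       equivalent to the scalar sketch below (reference only, not executed):
           q = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
           q = (u4_qp_div_6 < 4) ? ((q + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6))
                                 : (q << (u4_qp_div_6 - 4)); */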
1327
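    /* The dequantized value at position (0, 0) is discarded and replaced with
       the DC coefficient supplied separately through pi2_dc_src. */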
1328 resq_r0 = _mm_insert_epi32(resq_r0, (WORD32) pi2_dc_src[0], 0);
1329 /* Perform Inverse transform */
1330 /*-------------------------------------------------------------*/
1331 /* IDCT [ Horizontal transformation ] */
1332 /*-------------------------------------------------------------*/
1333 // Matrix transpose
1334 /*
1335 * a0 a1 a2 a3
1336 * b0 b1 b2 b3
1337 * c0 c1 c2 c3
1338 * d0 d1 d2 d3
1339 */
1340 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); // a0 b0 a1 b1
1341 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); // c0 d0 c1 d1
1342 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); // a2 b2 a3 b3
1343 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); // c2 d2 c3 d3
1344 resq_r0 = _mm_unpacklo_epi64(temp1, temp3); // a0 b0 c0 d0
1345 resq_r1 = _mm_unpackhi_epi64(temp1, temp3); // a1 b1 c1 d1
1346 resq_r2 = _mm_unpacklo_epi64(temp2, temp4); // a2 b2 c2 d2
1347 resq_r3 = _mm_unpackhi_epi64(temp2, temp4); // a3 b3 c3 d3
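    /* After this transpose each register holds one column of the 4x4 block, so
       the horizontal 1-D transform below processes all four rows in parallel
       with element-wise register arithmetic. */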
1348 // Transform starts -- horizontal transform
1349 /*------------------------------------------------------------------*/
1350 /* z0 = w0 + w2 */
1351 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1352 /* z1 = w0 - w2 */
1353 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1354 /* z2 = (w1 >> 1) - w3 */
1355 temp2 = _mm_srai_epi32(resq_r1, 1); //(w1>>1)
1356 temp2 = _mm_sub_epi32(temp2, resq_r3); //(w1>>1) - w3
1357 /* z3 = w1 + (w3 >> 1) */
1358     temp3 = _mm_srai_epi32(resq_r3, 1); //(w3>>1)
1359     temp3 = _mm_add_epi32(temp3, resq_r1); // w1 + (w3>>1)
1360 /*----------------------------------------------------------*/
1361 /* x0 = z0 + z3 */
1362 resq_r0 = _mm_add_epi32(temp0, temp3);
1363 /* x1 = z1 + z2 */
1364 resq_r1 = _mm_add_epi32(temp1, temp2);
1365 /* x2 = z1 - z2 */
1366 resq_r2 = _mm_sub_epi32(temp1, temp2);
1367 /* x3 = z0 - z3 */
1368 resq_r3 = _mm_sub_epi32(temp0, temp3);
1369 // Matrix transpose
1370 /*
1371 * a0 b0 c0 d0
1372 * a1 b1 c1 d1
1373 * a2 b2 c2 d2
1374 * a3 b3 c3 d3
1375 */
1376 temp1 = _mm_unpacklo_epi32(resq_r0, resq_r1); // a0 a1 b0 b1
1377 temp3 = _mm_unpacklo_epi32(resq_r2, resq_r3); // a2 a3 b2 b3
1378 temp2 = _mm_unpackhi_epi32(resq_r0, resq_r1); // c0 c1 d0 d1
1379 temp4 = _mm_unpackhi_epi32(resq_r2, resq_r3); // c2 c3 d2 d3
1380 resq_r0 = _mm_unpacklo_epi64(temp1, temp3); // a0 a1 a2 a3
1381 resq_r1 = _mm_unpackhi_epi64(temp1, temp3); // b0 b1 b2 b3
1382 resq_r2 = _mm_unpacklo_epi64(temp2, temp4); // c0 c1 c2 c3
1383 resq_r3 = _mm_unpackhi_epi64(temp2, temp4); // d0 d1 d2 d3
1384 // Transform ends -- horizontal transform
1385
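    /* Each register again holds one row after the transpose above, so the
       vertical 1-D transform that follows processes all four columns in
       parallel. */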
1386 /*--------------------------------------------------------------*/
1387 /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
1388     /* Merge the residual into the interleaved output buffer       */
1389 /*--------------------------------------------------------------*/
1390 /* z0j = y0j + y2j */
1391 temp0 = _mm_add_epi32(resq_r0, resq_r2);
1392 /* z1j = y0j - y2j */
1393 temp1 = _mm_sub_epi32(resq_r0, resq_r2);
1394 /* z2j = (y1j>>1) - y3j */
1395 temp2 = _mm_srai_epi32(resq_r1, 1); //(y1j>>1)
1396 temp2 = _mm_sub_epi32(temp2, resq_r3);
1397 /* z3j = y1j + (y3j>>1) */
1398 temp3 = _mm_srai_epi32(resq_r3, 1); //(y3j>>1)
1399 temp3 = _mm_add_epi32(temp3, resq_r1);
1400
1401 /* x0j = z0j + z3j */
1402 temp4 = _mm_add_epi32(temp0, temp3);
1403 temp4 = _mm_add_epi32(temp4, value_32);
1404 temp4 = _mm_srai_epi32(temp4, 6);
1405 temp4 = _mm_min_epi32(dupmax_4x32b, temp4);
1406 temp4 = _mm_max_epi32(dupmin_4x32b, temp4);
1407
1408 /* x1j = z1j + z2j */
1409 temp5 = _mm_add_epi32(temp1, temp2);
1410 temp5 = _mm_add_epi32(temp5, value_32);
1411 temp5 = _mm_srai_epi32(temp5, 6);
1412 temp5 = _mm_min_epi32(dupmax_4x32b, temp5);
1413 temp5 = _mm_max_epi32(dupmin_4x32b, temp5);
1414
1415 /* x2j = z1j - z2j */
1416 temp6 = _mm_sub_epi32(temp1, temp2);
1417 temp6 = _mm_add_epi32(temp6, value_32);
1418 temp6 = _mm_srai_epi32(temp6, 6);
1419 temp6 = _mm_min_epi32(dupmax_4x32b, temp6);
1420 temp6 = _mm_max_epi32(dupmin_4x32b, temp6);
1421
1422 /* x3j = z0j - z3j */
1423 temp7 = _mm_sub_epi32(temp0, temp3);
1424 temp7 = _mm_add_epi32(temp7, value_32);
1425 temp7 = _mm_srai_epi32(temp7, 6);
1426 temp7 = _mm_min_epi32(dupmax_4x32b, temp7);
1427 temp7 = _mm_max_epi32(dupmin_4x32b, temp7);
1428
1429 // 32-bit to 16-bit conversion
1430 temp0 = _mm_packs_epi32(temp4, temp5);
1431 temp1 = _mm_packs_epi32(temp6, temp7);
1432
1433 resq_r0 = temp0;
1434 resq_r1 = _mm_srli_si128(temp0, 8);
1435 resq_r2 = temp1;
1436 resq_r3 = _mm_srli_si128(temp1, 8);
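    /* Note: the 16-bit values packed into temp0/temp1 and resq_r0-resq_r3 above
       are not used further; the merge below works directly on the clamped
       32-bit results in temp4-temp7. */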
1437
1438     // row 0 of the interleaved chroma output buffer
1439 src_r0 = _mm_loadu_si128((__m128i *) (pi2_out));
1440     // rows 1, 2 and 3 of the interleaved chroma output buffer
1441 src_r1 = _mm_loadu_si128((__m128i *) (pi2_out + (1 * out_strd)));
1442
1443 src_r2 = _mm_loadu_si128((__m128i *) (pi2_out + (2 * out_strd)));
1444 src_r3 = _mm_loadu_si128((__m128i *) (pi2_out + (3 * out_strd)));
1445
1446 resq_r0 = _mm_and_si128(temp4, chroma_mask_even); // macro 0 macro 0 ..
1447 resq_r1 = _mm_and_si128(temp5, chroma_mask_even);
1448 resq_r2 = _mm_and_si128(temp6, chroma_mask_even);
1449 resq_r3 = _mm_and_si128(temp7, chroma_mask_even);
1450
1451 src_r0 = _mm_and_si128(src_r0, chroma_mask_odd); // 0 src1 0 src2 0 ...
1452 src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
1453 src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
1454 src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);
1455
1456 src_r0 = _mm_add_epi16(src_r0, resq_r0); // macro src1 macro src2 macro ...
1457 src_r1 = _mm_add_epi16(src_r1, resq_r1);
1458 src_r2 = _mm_add_epi16(src_r2, resq_r2);
1459 src_r3 = _mm_add_epi16(src_r3, resq_r3);
1460
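    /* Even 16-bit lanes now carry the new chroma residuals and odd lanes the
       untouched samples of the other plane; write the merged rows back. */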
1461 _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0);
1462 _mm_storeu_si128((__m128i *) (&pi2_out[out_strd]), src_r1);
1463 _mm_storeu_si128((__m128i *) (&pi2_out[2 * out_strd]), src_r2);
1464 _mm_storeu_si128((__m128i *) (&pi2_out[3 * out_strd]), src_r3);
1465 }
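/* Illustrative call of the function above (buffer and stride names are
   assumptions made for this sketch, not identifiers from this file):

       isvcd_iquant_itrans_chroma_4x4_sse42(pi2_coeffs, pi2_rsd_buf, rsd_strd,
                                            pu2_iscal_mat, pu2_weigh_mat,
                                            u4_qp_div_6, pi2_scratch, pi2_dc_val);

   pi2_dc_val supplies the separately transformed chroma DC coefficient and
   pi2_scratch is unused by this variant. */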
1466