/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_pred_residual_recon_sse42.c
 *
 * @brief
 *  Contains function definitions for prediction + residual reconstruction
 *  and residual NNZ computation
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_pred_residual_recon_4x4_sse42()
 *  - isvcd_pred_residual_recon_8x8_sse42()
 *  - isvcd_pred_residual_recon_16x16_sse42()
 *  - isvcd_pred_residual_recon_chroma_4x4_sse42()
 *  - isvcd_pred_residual_recon_chroma_8x8_sse42()
 *  - isvcd_residual_luma_4x4_sse42()
 *  - isvcd_residual_luma_8x8_sse42()
 *  - isvcd_residual_luma_16x16_sse42()
 *  - isvcd_residual_chroma_cb_cr_8x8_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/* User include files */
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_pred_residual_recon.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_4x4_sse42                      */
/*                                                                           */
/*  Description   : this function computes the recon from the residual      */
/*                  and pred buffers                                         */
/*  Inputs        : pu1_pred - prediction buffer                             */
/*                  pi2_rsd - residual buffer                                */
/*                  pu1_out - output (recon) buffer                          */
/*                  pred_strd, rsd_strd, out_strd - buffer strides           */
/*  Globals       : none                                                     */
/*  Processing    : adds pred and residual with unsigned 8-bit saturation    */
/*                  and tests the residual rows for non-zero coefficients    */
/*  Outputs       : recon written to pu1_out                                 */
/*  Returns       : nnz (1 if any residual coefficient is non-zero)          */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

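/* Scalar reference for this kernel (a sketch; CLIP_U8, the 0..255 clip     */
/* macro, is assumed to come from ih264_platform_macros.h):                  */
/*                                                                           */
/*     for(i = 0; i < 4; i++)                                                */
/*         for(j = 0; j < 4; j++)                                            */
/*             pu1_out[i * out_strd + j] =                                   */
/*                 CLIP_U8(pu1_pred[i * pred_strd + j] +                     */
/*                         pi2_rsd[i * rsd_strd + j]);                       */
/*                                                                           */
/* The SIMD version widens each pred row to 16 bit, adds the residual row,   */
/* saturate-packs back to 8 bit and derives nnz by testing the residual      */
/* rows against zero.                                                        */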
WORD32 isvcd_pred_residual_recon_4x4_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                           WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i pred_16x8b_0, pred_8x16b_0, rsd_8x16b_0, out_8x16b_0, out_16x8b_0;
    __m128i pred_16x8b_1, pred_8x16b_1, rsd_8x16b_1, out_8x16b_1, out_16x8b_1;
    __m128i pred_16x8b_2, pred_8x16b_2, rsd_8x16b_2, out_8x16b_2, out_16x8b_2;
    __m128i pred_16x8b_3, pred_8x16b_3, rsd_8x16b_3, out_8x16b_3, out_16x8b_3;
    __m128i rsd_8x16b_01, rsd_8x16b_23;

    __m128i zero_8x16b = _mm_setzero_si128();
    WORD32 i4_nnz, row_01, row_23;

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + (pred_strd << 1)));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + (pred_strd << 1) + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1)));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1) + rsd_strd));

    rsd_8x16b_01 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);

    row_01 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01, zero_8x16b));  // return 1 if all zeros, else 0
    row_23 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);

    *((WORD32 *) (pu1_out)) = _mm_cvtsi128_si32(out_16x8b_0);
    *((WORD32 *) (pu1_out + out_strd)) = _mm_cvtsi128_si32(out_16x8b_1);
    *((WORD32 *) (pu1_out + (out_strd << 1))) = _mm_cvtsi128_si32(out_16x8b_2);
    *((WORD32 *) (pu1_out + (out_strd * 3))) = _mm_cvtsi128_si32(out_16x8b_3);
    i4_nnz = !(row_01 && row_23);

    return i4_nnz;
}
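
/* Usage sketch (illustrative buffers and strides; note that the kernel      */
/* loads 16 pred bytes and 8 residual coefficients per row, so the buffers   */
/* must be large enough for that over-read):                                 */
/*                                                                           */
/*     UWORD8 au1_pred[4 * 16], au1_out[4 * 16];                             */
/*     WORD16 ai2_rsd[4 * 16];                                               */
/*     WORD32 i4_nnz =                                                       */
/*         isvcd_pred_residual_recon_4x4_sse42(au1_pred, ai2_rsd, au1_out,   */
/*                                             16, 16, 16);                  */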

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_8x8_sse42                      */
/*                                                                           */
/*  Description   : this function computes the recon from the residual      */
/*                  and pred buffers                                         */
/*  Inputs        : pu1_pred - prediction buffer                             */
/*                  pi2_rsd - residual buffer                                */
/*                  pu1_out - output (recon) buffer                          */
/*                  pred_strd, rsd_strd, out_strd - buffer strides           */
/*  Globals       : none                                                     */
/*  Processing    : adds pred and residual with unsigned 8-bit saturation    */
/*                  and tests each 4x4 sub-block of the residual for nnz     */
/*  Outputs       : recon written to pu1_out                                 */
/*  Returns       : nnz flags for the four 4x4 sub-blocks                    */
/*                  (bits 0, 1, 4 and 5)                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

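/* The four 4x4 sub-blocks of the 8x8 block are tracked separately: b0       */
/* (top-left) and b1 (top-right) come from the low and high halves of rows   */
/* 0-3, b2 (bottom-left) and b3 (bottom-right) from rows 4-7. Their nnz      */
/* flags are returned at bit positions 0, 1, 4 and 5, i.e. at the raster     */
/* indices these sub-blocks occupy in a 16x16 macroblock's 4x4 grid.         */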
WORD32 isvcd_pred_residual_recon_8x8_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                           WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i pred_16x8b_0, pred_8x16b_0, rsd_8x16b_0, out_8x16b_0, out_16x8b_0;
    __m128i pred_16x8b_1, pred_8x16b_1, rsd_8x16b_1, out_8x16b_1, out_16x8b_1;
    __m128i pred_16x8b_2, pred_8x16b_2, rsd_8x16b_2, out_8x16b_2, out_16x8b_2;
    __m128i pred_16x8b_3, pred_8x16b_3, rsd_8x16b_3, out_8x16b_3, out_16x8b_3;
    __m128i pred_16x8b_4, pred_8x16b_4, rsd_8x16b_4, out_8x16b_4, out_16x8b_4;
    __m128i pred_16x8b_5, pred_8x16b_5, rsd_8x16b_5, out_8x16b_5, out_16x8b_5;
    __m128i pred_16x8b_6, pred_8x16b_6, rsd_8x16b_6, out_8x16b_6, out_16x8b_6;
    __m128i pred_16x8b_7, pred_8x16b_7, rsd_8x16b_7, out_8x16b_7, out_16x8b_7;
    __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
    __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;

    WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
    WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
    WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;

    __m128i zero_8x16b = _mm_setzero_si128();

    WORD32 pred_strd2 = (pred_strd << 1);
    WORD32 pred_strd4 = (pred_strd << 2);
    WORD32 rsd_strd2 = (rsd_strd << 1);
    WORD32 rsd_strd4 = (rsd_strd << 2);
    WORD32 out_strd2 = (out_strd << 1);
    WORD32 out_strd4 = (out_strd << 2);

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;

    i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_16x16_sse42                    */
/*                                                                           */
/*  Description   : this function computes the recon from the residual      */
/*                  and pred buffers                                         */
/*  Inputs        : pu1_pred - prediction buffer                             */
/*                  pi2_rsd - residual buffer                                */
/*                  pu1_out - output (recon) buffer                          */
/*                  pred_strd, rsd_strd, out_strd - buffer strides           */
/*  Globals       : none                                                     */
/*  Processing    : adds pred and residual with unsigned 8-bit saturation    */
/*                  and tests each 4x4 sub-block of the residual for nnz     */
/*  Outputs       : recon written to pu1_out                                 */
/*  Returns       : nnz flags for the sixteen 4x4 sub-blocks (bits 0-15)     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

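/* The 16x16 block is handled as four 8x8 quadrants (top-left, top-right,    */
/* bottom-left, bottom-right), each processed exactly like the 8x8 kernel    */
/* above. The return value carries one nnz flag per 4x4 sub-block, with      */
/* bit k corresponding to the sub-block at raster position k in the 16x16    */
/* block: top-left quadrant -> bits 0, 1, 4, 5; top-right -> 2, 3, 6, 7;     */
/* bottom-left -> 8, 9, 12, 13; bottom-right -> 10, 11, 14, 15.              */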
WORD32 isvcd_pred_residual_recon_16x16_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                             WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i pred_16x8b_0, pred_8x16b_0, rsd_8x16b_0, out_8x16b_0, out_16x8b_0;
    __m128i pred_16x8b_1, pred_8x16b_1, rsd_8x16b_1, out_8x16b_1, out_16x8b_1;
    __m128i pred_16x8b_2, pred_8x16b_2, rsd_8x16b_2, out_8x16b_2, out_16x8b_2;
    __m128i pred_16x8b_3, pred_8x16b_3, rsd_8x16b_3, out_8x16b_3, out_16x8b_3;
    __m128i pred_16x8b_4, pred_8x16b_4, rsd_8x16b_4, out_8x16b_4, out_16x8b_4;
    __m128i pred_16x8b_5, pred_8x16b_5, rsd_8x16b_5, out_8x16b_5, out_16x8b_5;
    __m128i pred_16x8b_6, pred_8x16b_6, rsd_8x16b_6, out_8x16b_6, out_16x8b_6;
    __m128i pred_16x8b_7, pred_8x16b_7, rsd_8x16b_7, out_8x16b_7, out_16x8b_7;
    __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
    __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;

    WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
    WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
    WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;

    __m128i zero_8x16b = _mm_setzero_si128();

    WORD32 pred_strd2 = (pred_strd << 1);
    WORD32 pred_strd4 = (pred_strd << 2);
    WORD32 rsd_strd2 = (rsd_strd << 1);
    WORD32 rsd_strd4 = (rsd_strd << 2);
    WORD32 out_strd2 = (out_strd << 1);
    WORD32 out_strd4 = (out_strd << 2);

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;

    i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);

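    /* top-right 8x8 quadrant: repeat on columns 8-15 of rows 0-7 */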
    pu1_pred += 8;
    pi2_rsd += 8;
    pu1_out += 8;

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 2;
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 3;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 6;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 7;

    i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);

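    /* bottom-left 8x8 quadrant: back to column 0, then down 8 rows */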
    pu1_pred -= 8;
    pi2_rsd -= 8;
    pu1_out -= 8;

    pu1_pred += (pred_strd << 3);
    pi2_rsd += (rsd_strd << 3);
    pu1_out += (out_strd << 3);

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 8;
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 9;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 12;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 13;

    i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);

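    /* bottom-right 8x8 quadrant: columns 8-15 of rows 8-15 */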
    pu1_pred += 8;
    pi2_rsd += 8;
    pu1_out += 8;

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 10;
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 11;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 14;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 15;

    i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_chroma_4x4_sse42               */
/*                                                                           */
/*  Description   : this function computes the recon from the residual      */
/*                  and pred buffers for one interleaved chroma component    */
/*  Inputs        : pu1_pred - prediction buffer (interleaved Cb/Cr)         */
/*                  pi2_rsd - residual buffer (interleaved)                  */
/*                  pu1_out - output (recon) buffer (interleaved Cb/Cr)      */
/*                  pred_strd, rsd_strd, out_strd - buffer strides           */
/*  Globals       : none                                                     */
/*  Processing    : adds pred and residual with unsigned 8-bit saturation    */
/*                  and merges the result into the even byte lanes of        */
/*                  pu1_out, preserving the odd (other component) lanes      */
/*  Outputs       : recon interleaved into pu1_out                           */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

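/* The buffers hold interleaved Cb/Cr samples and one call reconstructs      */
/* only one component: the recon values are masked onto the even byte        */
/* lanes, while the odd byte lanes of pu1_out (the other component) are      */
/* preserved. A scalar sketch of the per-row merge, assuming the CLIP_U8     */
/* macro from ih264_platform_macros.h:                                       */
/*                                                                           */
/*     for(j = 0; j < 4; j++)                                                */
/*         pu1_out[2 * j] = CLIP_U8(pu1_pred[2 * j] + pi2_rsd[2 * j]);       */
/*     (the bytes pu1_out[2 * j + 1] are left untouched)                     */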
void isvcd_pred_residual_recon_chroma_4x4_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                                WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i pred0, pred1, pred2, pred3;
    __m128i rsd_r0, rsd_r1, rsd_r2, rsd_r3;
    __m128i zero_16x8b;  // all bits reset to zero
    __m128i chroma_mask_even;
    __m128i chroma_mask_odd;

    zero_16x8b = _mm_setzero_si128();

    rsd_r0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_r1 = _mm_loadu_si128((__m128i *) (pi2_rsd + (1 * rsd_strd)));
    rsd_r2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (2 * rsd_strd)));
    rsd_r3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (3 * rsd_strd)));

    pred_r0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_r1 = _mm_loadu_si128((__m128i *) (pu1_pred + (1 * pred_strd)));
    pred_r2 = _mm_loadu_si128((__m128i *) (pu1_pred + (2 * pred_strd)));
    pred_r3 = _mm_loadu_si128((__m128i *) (pu1_pred + (3 * pred_strd)));

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_out));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (1 * out_strd)));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_out + (2 * out_strd)));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_out + (3 * out_strd)));

    pred0 = _mm_cvtepu8_epi16(pred_r0);
    pred1 = _mm_cvtepu8_epi16(pred_r1);
    pred2 = _mm_cvtepu8_epi16(pred_r2);
    pred3 = _mm_cvtepu8_epi16(pred_r3);

    pred0 = _mm_add_epi16(pred0, rsd_r0);
    pred1 = _mm_add_epi16(pred1, rsd_r1);
    pred2 = _mm_add_epi16(pred2, rsd_r2);
    pred3 = _mm_add_epi16(pred3, rsd_r3);

    pred0 = _mm_packus_epi16(pred0, zero_16x8b);
    pred1 = _mm_packus_epi16(pred1, zero_16x8b);
    pred2 = _mm_packus_epi16(pred2, zero_16x8b);
    pred3 = _mm_packus_epi16(pred3, zero_16x8b);

    chroma_mask_even = _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
                                    0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
    chroma_mask_odd = _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff,
                                   0x00, 0xff, 0x00, 0xff, 0x00);

    src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
    src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
    src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
    src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);

    pred0 = _mm_and_si128(pred0, chroma_mask_even);  // val 0 val 0 ..
    pred1 = _mm_and_si128(pred1, chroma_mask_even);
    pred2 = _mm_and_si128(pred2, chroma_mask_even);
    pred3 = _mm_and_si128(pred3, chroma_mask_even);

    src_r0 = _mm_add_epi8(src_r0, pred0);  // macro src1 macro src2 macro ...
    src_r1 = _mm_add_epi8(src_r1, pred1);
    src_r2 = _mm_add_epi8(src_r2, pred2);
    src_r3 = _mm_add_epi8(src_r3, pred3);

    _mm_storel_epi64((__m128i *) (&pu1_out[0]), src_r0);
    _mm_storel_epi64((__m128i *) (&pu1_out[out_strd]), src_r1);
    _mm_storel_epi64((__m128i *) (&pu1_out[2 * out_strd]), src_r2);
    _mm_storel_epi64((__m128i *) (&pu1_out[3 * out_strd]), src_r3);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_chroma_8x8_sse42               */
/*                                                                           */
/*  Description   : this function computes the recon from the residual      */
/*                  and pred buffers for one interleaved chroma component    */
/*  Inputs        : pu1_pred - prediction buffer (interleaved Cb/Cr)         */
/*                  pi2_rsd - residual buffer (interleaved)                  */
/*                  pu1_out - output (recon) buffer (interleaved Cb/Cr)      */
/*                  pred_strd, rsd_strd, out_strd - buffer strides           */
/*  Globals       : none                                                     */
/*  Processing    : adds pred and residual with unsigned 8-bit saturation    */
/*                  and merges the result into the even byte lanes of        */
/*                  pu1_out, preserving the odd (other component) lanes      */
/*  Outputs       : recon interleaved into pu1_out                           */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

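/* Same interleaved-chroma merge as the 4x4 kernel above, applied to 8       */
/* rows of 16 interleaved bytes; each row is processed in two halves of 8    */
/* bytes (4 chroma pairs), hence the second pass at offset +8.               */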
void isvcd_pred_residual_recon_chroma_8x8_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                                WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i src_r0, src_r1, src_r2, src_r3, src_r4, src_r5, src_r6, src_r7;
    __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
    __m128i rsd_r0, rsd_r1, rsd_r2, rsd_r3, rsd_r4, rsd_r5, rsd_r6, rsd_r7;
    __m128i zero_16x8b;  // all bits reset to zero
    __m128i chroma_mask_even;
    __m128i chroma_mask_odd;

    zero_16x8b = _mm_setzero_si128();

    rsd_r0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_r1 = _mm_loadu_si128((__m128i *) (pi2_rsd + (1 * rsd_strd)));
    rsd_r2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (2 * rsd_strd)));
    rsd_r3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (3 * rsd_strd)));
    rsd_r4 = _mm_loadu_si128((__m128i *) (pi2_rsd + (4 * rsd_strd)));
    rsd_r5 = _mm_loadu_si128((__m128i *) (pi2_rsd + (5 * rsd_strd)));
    rsd_r6 = _mm_loadu_si128((__m128i *) (pi2_rsd + (6 * rsd_strd)));
    rsd_r7 = _mm_loadu_si128((__m128i *) (pi2_rsd + (7 * rsd_strd)));

    pred0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred1 = _mm_loadu_si128((__m128i *) (pu1_pred + (1 * pred_strd)));
    pred2 = _mm_loadu_si128((__m128i *) (pu1_pred + (2 * pred_strd)));
    pred3 = _mm_loadu_si128((__m128i *) (pu1_pred + (3 * pred_strd)));
    pred4 = _mm_loadu_si128((__m128i *) (pu1_pred + (4 * pred_strd)));
    pred5 = _mm_loadu_si128((__m128i *) (pu1_pred + (5 * pred_strd)));
    pred6 = _mm_loadu_si128((__m128i *) (pu1_pred + (6 * pred_strd)));
    pred7 = _mm_loadu_si128((__m128i *) (pu1_pred + (7 * pred_strd)));

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_out));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (1 * out_strd)));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_out + (2 * out_strd)));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_out + (3 * out_strd)));
    src_r4 = _mm_loadu_si128((__m128i *) (pu1_out + (4 * out_strd)));
    src_r5 = _mm_loadu_si128((__m128i *) (pu1_out + (5 * out_strd)));
    src_r6 = _mm_loadu_si128((__m128i *) (pu1_out + (6 * out_strd)));
    src_r7 = _mm_loadu_si128((__m128i *) (pu1_out + (7 * out_strd)));

    pred0 = _mm_cvtepu8_epi16(pred0);
    pred1 = _mm_cvtepu8_epi16(pred1);
    pred2 = _mm_cvtepu8_epi16(pred2);
    pred3 = _mm_cvtepu8_epi16(pred3);
    pred4 = _mm_cvtepu8_epi16(pred4);
    pred5 = _mm_cvtepu8_epi16(pred5);
    pred6 = _mm_cvtepu8_epi16(pred6);
    pred7 = _mm_cvtepu8_epi16(pred7);

    pred0 = _mm_add_epi16(pred0, rsd_r0);
    pred1 = _mm_add_epi16(pred1, rsd_r1);
    pred2 = _mm_add_epi16(pred2, rsd_r2);
    pred3 = _mm_add_epi16(pred3, rsd_r3);
    pred4 = _mm_add_epi16(pred4, rsd_r4);
    pred5 = _mm_add_epi16(pred5, rsd_r5);
    pred6 = _mm_add_epi16(pred6, rsd_r6);
    pred7 = _mm_add_epi16(pred7, rsd_r7);

    pred0 = _mm_packus_epi16(pred0, zero_16x8b);
    pred1 = _mm_packus_epi16(pred1, zero_16x8b);
    pred2 = _mm_packus_epi16(pred2, zero_16x8b);
    pred3 = _mm_packus_epi16(pred3, zero_16x8b);
    pred4 = _mm_packus_epi16(pred4, zero_16x8b);
    pred5 = _mm_packus_epi16(pred5, zero_16x8b);
    pred6 = _mm_packus_epi16(pred6, zero_16x8b);
    pred7 = _mm_packus_epi16(pred7, zero_16x8b);

    chroma_mask_even = _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
                                    0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
    chroma_mask_odd = _mm_set_epi8(0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
                                   0x00, 0xff, 0x00, 0xff, 0x00);

    src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
    src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
    src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
    src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);
    src_r4 = _mm_and_si128(src_r4, chroma_mask_odd);
    src_r5 = _mm_and_si128(src_r5, chroma_mask_odd);
    src_r6 = _mm_and_si128(src_r6, chroma_mask_odd);
    src_r7 = _mm_and_si128(src_r7, chroma_mask_odd);

    pred0 = _mm_and_si128(pred0, chroma_mask_even);  // val 0 val 0 ..
    pred1 = _mm_and_si128(pred1, chroma_mask_even);
    pred2 = _mm_and_si128(pred2, chroma_mask_even);
    pred3 = _mm_and_si128(pred3, chroma_mask_even);
    pred4 = _mm_and_si128(pred4, chroma_mask_even);
    pred5 = _mm_and_si128(pred5, chroma_mask_even);
    pred6 = _mm_and_si128(pred6, chroma_mask_even);
    pred7 = _mm_and_si128(pred7, chroma_mask_even);

    src_r0 = _mm_add_epi8(src_r0, pred0);  // macro src1 macro src2 macro ...
    src_r1 = _mm_add_epi8(src_r1, pred1);
    src_r2 = _mm_add_epi8(src_r2, pred2);
    src_r3 = _mm_add_epi8(src_r3, pred3);
    src_r4 = _mm_add_epi8(src_r4, pred4);
    src_r5 = _mm_add_epi8(src_r5, pred5);
    src_r6 = _mm_add_epi8(src_r6, pred6);
    src_r7 = _mm_add_epi8(src_r7, pred7);

    _mm_storel_epi64((__m128i *) (&pu1_out[0]), src_r0);
    _mm_storel_epi64((__m128i *) (&pu1_out[out_strd]), src_r1);
    _mm_storel_epi64((__m128i *) (&pu1_out[2 * out_strd]), src_r2);
    _mm_storel_epi64((__m128i *) (&pu1_out[3 * out_strd]), src_r3);
    _mm_storel_epi64((__m128i *) (&pu1_out[4 * out_strd]), src_r4);
    _mm_storel_epi64((__m128i *) (&pu1_out[5 * out_strd]), src_r5);
    _mm_storel_epi64((__m128i *) (&pu1_out[6 * out_strd]), src_r6);
    _mm_storel_epi64((__m128i *) (&pu1_out[7 * out_strd]), src_r7);

    /* load and repeat for the remaining 4 chroma pairs (bytes 8-15) of each row */

    rsd_r0 = _mm_loadu_si128((__m128i *) (pi2_rsd + 8));
    rsd_r1 = _mm_loadu_si128((__m128i *) (pi2_rsd + (1 * rsd_strd) + 8));
    rsd_r2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (2 * rsd_strd) + 8));
    rsd_r3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (3 * rsd_strd) + 8));
    rsd_r4 = _mm_loadu_si128((__m128i *) (pi2_rsd + (4 * rsd_strd) + 8));
    rsd_r5 = _mm_loadu_si128((__m128i *) (pi2_rsd + (5 * rsd_strd) + 8));
    rsd_r6 = _mm_loadu_si128((__m128i *) (pi2_rsd + (6 * rsd_strd) + 8));
    rsd_r7 = _mm_loadu_si128((__m128i *) (pi2_rsd + (7 * rsd_strd) + 8));

    pred0 = _mm_loadu_si128((__m128i *) (pu1_pred + 8));
    pred1 = _mm_loadu_si128((__m128i *) (pu1_pred + (1 * pred_strd) + 8));
    pred2 = _mm_loadu_si128((__m128i *) (pu1_pred + (2 * pred_strd) + 8));
    pred3 = _mm_loadu_si128((__m128i *) (pu1_pred + (3 * pred_strd) + 8));
    pred4 = _mm_loadu_si128((__m128i *) (pu1_pred + (4 * pred_strd) + 8));
    pred5 = _mm_loadu_si128((__m128i *) (pu1_pred + (5 * pred_strd) + 8));
    pred6 = _mm_loadu_si128((__m128i *) (pu1_pred + (6 * pred_strd) + 8));
    pred7 = _mm_loadu_si128((__m128i *) (pu1_pred + (7 * pred_strd) + 8));

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_out + 8));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (1 * out_strd) + 8));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_out + (2 * out_strd) + 8));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_out + (3 * out_strd) + 8));
    src_r4 = _mm_loadu_si128((__m128i *) (pu1_out + (4 * out_strd) + 8));
    src_r5 = _mm_loadu_si128((__m128i *) (pu1_out + (5 * out_strd) + 8));
    src_r6 = _mm_loadu_si128((__m128i *) (pu1_out + (6 * out_strd) + 8));
    src_r7 = _mm_loadu_si128((__m128i *) (pu1_out + (7 * out_strd) + 8));

    pred0 = _mm_cvtepu8_epi16(pred0);
    pred1 = _mm_cvtepu8_epi16(pred1);
    pred2 = _mm_cvtepu8_epi16(pred2);
    pred3 = _mm_cvtepu8_epi16(pred3);
    pred4 = _mm_cvtepu8_epi16(pred4);
    pred5 = _mm_cvtepu8_epi16(pred5);
    pred6 = _mm_cvtepu8_epi16(pred6);
    pred7 = _mm_cvtepu8_epi16(pred7);

    pred0 = _mm_add_epi16(pred0, rsd_r0);
    pred1 = _mm_add_epi16(pred1, rsd_r1);
    pred2 = _mm_add_epi16(pred2, rsd_r2);
    pred3 = _mm_add_epi16(pred3, rsd_r3);
    pred4 = _mm_add_epi16(pred4, rsd_r4);
    pred5 = _mm_add_epi16(pred5, rsd_r5);
    pred6 = _mm_add_epi16(pred6, rsd_r6);
    pred7 = _mm_add_epi16(pred7, rsd_r7);

    pred0 = _mm_packus_epi16(pred0, zero_16x8b);
    pred1 = _mm_packus_epi16(pred1, zero_16x8b);
    pred2 = _mm_packus_epi16(pred2, zero_16x8b);
    pred3 = _mm_packus_epi16(pred3, zero_16x8b);
    pred4 = _mm_packus_epi16(pred4, zero_16x8b);
    pred5 = _mm_packus_epi16(pred5, zero_16x8b);
    pred6 = _mm_packus_epi16(pred6, zero_16x8b);
    pred7 = _mm_packus_epi16(pred7, zero_16x8b);

    src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
    src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
    src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
    src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);
    src_r4 = _mm_and_si128(src_r4, chroma_mask_odd);
    src_r5 = _mm_and_si128(src_r5, chroma_mask_odd);
    src_r6 = _mm_and_si128(src_r6, chroma_mask_odd);
    src_r7 = _mm_and_si128(src_r7, chroma_mask_odd);

    pred0 = _mm_and_si128(pred0, chroma_mask_even);  // val 0 val 0 ..
    pred1 = _mm_and_si128(pred1, chroma_mask_even);
    pred2 = _mm_and_si128(pred2, chroma_mask_even);
    pred3 = _mm_and_si128(pred3, chroma_mask_even);
    pred4 = _mm_and_si128(pred4, chroma_mask_even);
    pred5 = _mm_and_si128(pred5, chroma_mask_even);
    pred6 = _mm_and_si128(pred6, chroma_mask_even);
    pred7 = _mm_and_si128(pred7, chroma_mask_even);

    src_r0 = _mm_add_epi8(src_r0, pred0);  // macro src1 macro src2 macro ...
    src_r1 = _mm_add_epi8(src_r1, pred1);
    src_r2 = _mm_add_epi8(src_r2, pred2);
    src_r3 = _mm_add_epi8(src_r3, pred3);
    src_r4 = _mm_add_epi8(src_r4, pred4);
    src_r5 = _mm_add_epi8(src_r5, pred5);
    src_r6 = _mm_add_epi8(src_r6, pred6);
    src_r7 = _mm_add_epi8(src_r7, pred7);

    _mm_storel_epi64((__m128i *) (&pu1_out[0] + 8), src_r0);
    _mm_storel_epi64((__m128i *) (&pu1_out[out_strd] + 8), src_r1);
    _mm_storel_epi64((__m128i *) (&pu1_out[(2 * out_strd)] + 8), src_r2);
    _mm_storel_epi64((__m128i *) (&pu1_out[(3 * out_strd)] + 8), src_r3);
    _mm_storel_epi64((__m128i *) (&pu1_out[(4 * out_strd)] + 8), src_r4);
    _mm_storel_epi64((__m128i *) (&pu1_out[(5 * out_strd)] + 8), src_r5);
    _mm_storel_epi64((__m128i *) (&pu1_out[(6 * out_strd)] + 8), src_r6);
    _mm_storel_epi64((__m128i *) (&pu1_out[(7 * out_strd)] + 8), src_r7);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_4x4_sse42                            */
/*                                                                           */
/*  Description   : this function computes the nnz from the residual buffer */
/*                                                                           */
/*  Inputs        : pi2_rsd - residual buffer                                */
/*                  rsd_strd - residual stride                               */
/*  Globals       : none                                                     */
/*  Processing    : tests the 4x4 residual block for non-zero coefficients   */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz (1 if any residual coefficient is non-zero)          */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

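/* nnz-only variant: nothing is reconstructed, the residual block is only    */
/* scanned for non-zero coefficients. Equivalent scalar logic (a sketch):    */
/*                                                                           */
/*     nnz = 0;                                                              */
/*     for(i = 0; i < 4; i++)                                                */
/*         for(j = 0; j < 4; j++)                                            */
/*             nnz |= (pi2_rsd[i * rsd_strd + j] != 0);                      */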
WORD32 isvcd_residual_luma_4x4_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    __m128i rsd_8x16b_0;
    __m128i rsd_8x16b_1;
    __m128i rsd_8x16b_2;
    __m128i rsd_8x16b_3;
    __m128i rsd_8x16b_01, rsd_8x16b_23;

    __m128i zero_8x16b = _mm_setzero_si128();
    WORD32 i4_nnz, row_01, row_23;

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1)));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1) + rsd_strd));

    rsd_8x16b_01 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);

    row_01 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01, zero_8x16b));  // return 1 if all zeros, else 0
    row_23 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23, zero_8x16b));

    i4_nnz = !(row_01 && row_23);
    return i4_nnz;
}
/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_8x8_sse42                            */
/*                                                                           */
/*  Description   : this function computes the nnz from the residual buffer */
/*                                                                           */
/*  Inputs        : pi2_rsd - residual buffer                                */
/*                  rsd_strd - residual stride                               */
/*  Globals       : none                                                     */
/*  Processing    : tests each 4x4 sub-block of the residual for nnz         */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz flags for the four 4x4 sub-blocks                    */
/*                  (bits 0, 1, 4 and 5)                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

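/* Returns one nnz flag per 4x4 sub-block at bit positions 0, 1, 4 and 5,    */
/* matching the bit layout of isvcd_pred_residual_recon_8x8_sse42().         */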
WORD32 isvcd_residual_luma_8x8_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    __m128i rsd_8x16b_0;
    __m128i rsd_8x16b_1;
    __m128i rsd_8x16b_2;
    __m128i rsd_8x16b_3;
    __m128i rsd_8x16b_4;
    __m128i rsd_8x16b_5;
    __m128i rsd_8x16b_6;
    __m128i rsd_8x16b_7;
    __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
    __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;

    WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
    WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
    WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;

    __m128i zero_8x16b = _mm_setzero_si128();

    WORD32 rsd_strd2 = (rsd_strd << 1);
    WORD32 rsd_strd4 = (rsd_strd << 2);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;

    i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_luma_16x16_sse42                          */
/*                                                                           */
/*  Description   : this function computes the nnz from the residual buffer */
/*                                                                           */
/*  Inputs        : pi2_rsd - residual buffer                                */
/*                  rsd_strd - residual stride                               */
/*  Globals       : none                                                     */
/*  Processing    : tests each 4x4 sub-block of the residual for nnz         */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz flags for the sixteen 4x4 sub-blocks (bits 0-15)     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

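/* Computes the same 16-bit nnz map as isvcd_pred_residual_recon_16x16_sse42 */
/* (bit k = 4x4 sub-block at raster position k) from the residual alone.     */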
WORD32 isvcd_residual_luma_16x16_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    __m128i rsd_8x16b_0;
    __m128i rsd_8x16b_1;
    __m128i rsd_8x16b_2;
    __m128i rsd_8x16b_3;
    __m128i rsd_8x16b_4;
    __m128i rsd_8x16b_5;
    __m128i rsd_8x16b_6;
    __m128i rsd_8x16b_7;
    __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
    __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;

    WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
    WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
    WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;

    __m128i zero_8x16b = _mm_setzero_si128();

    WORD32 rsd_strd2 = (rsd_strd << 1);
    WORD32 rsd_strd4 = (rsd_strd << 2);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;

    i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);

1179 pi2_rsd += 8;
1180
1181 rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1182 rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1183 rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1184 rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1185 rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1186 rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1187 rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1188 rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1189
1190 rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1191 rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1192 rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1193 rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1194
1195 rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1196 rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1197 rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1198 rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1199
1200 row_01_b0 = _mm_test_all_ones(
1201 _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b)); // return 1 if all zeros, else 0
1202 row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1203 row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1204 row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1205 row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1206 row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1207 row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1208 row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1209
1210 i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 2;
1211 i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 3;
1212 i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 6;
1213 i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 7;
1214
1215 i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1216
1217 pi2_rsd -= 8;
1218 pi2_rsd += (rsd_strd << 3);
1219
1220 rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1221 rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1222 rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1223 rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1224 rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1225 rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1226 rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1227 rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1228
1229 rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1230 rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1231 rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1232 rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1233
1234 rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1235 rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1236 rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1237 rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1238
1239 row_01_b0 = _mm_test_all_ones(
1240 _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b)); // return 1 if all zeros, else 0
1241 row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1242 row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1243 row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1244 row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1245 row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1246 row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1247 row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1248
1249 i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 8;
1250 i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 9;
1251 i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 12;
1252 i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 13;
1253
1254 i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1255
1256 pi2_rsd += 8;
1257
1258 rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1259 rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1260 rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1261 rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1262 rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1263 rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1264 rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1265 rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1266
1267 rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1268 rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1269 rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1270 rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1271
1272 rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1273 rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1274 rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1275 rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1276
1277 row_01_b0 = _mm_test_all_ones(
1278 _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b)); // return 1 if all zeros, else 0
1279 row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1280 row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1281 row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1282 row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1283 row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1284 row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1285 row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1286
1287 i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 10;
1288 i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 11;
1289 i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 14;
1290 i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 15;
1291
1292 i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1293 return i4_nnz;
1294 }
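
/* Illustrative only, not part of the original sources: a minimal scalar    */
/* sketch of the 16x16 nnz map built by the SIMD code above. Bit            */
/* (4 * block_row + block_col) is set when the corresponding 4x4 residual   */
/* block contains a non-zero coefficient. The helper name                   */
/* isvcd_residual_luma_16x16_ref is hypothetical.                           */
static WORD32 isvcd_residual_luma_16x16_ref(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 i4_nnz = 0;
    WORD32 blk_y, blk_x, y, x;

    for(blk_y = 0; blk_y < 4; blk_y++)
    {
        for(blk_x = 0; blk_x < 4; blk_x++)
        {
            WORD16 *pi2_blk = pi2_rsd + (blk_y * 4) * rsd_strd + (blk_x * 4);
            WORD32 nonzero = 0;

            /* a 4x4 block contributes a 1 if any of its 16 residuals is non-zero */
            for(y = 0; y < 4; y++)
            {
                for(x = 0; x < 4; x++)
                {
                    nonzero |= (pi2_blk[y * rsd_strd + x] != 0);
                }
            }
            i4_nnz |= (nonzero << (blk_y * 4 + blk_x));
        }
    }
    return i4_nnz;
}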

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_residual_chroma_cb_cr_8x8_sse42                    */
/*                                                                           */
/*  Description   : this function computes the nnz map from the             */
/*                  interleaved Cb/Cr residual buffer                        */
/*                                                                           */
/*  Inputs        : pi2_rsd  - pointer to the interleaved Cb/Cr residuals    */
/*                  rsd_strd - stride of the residual buffer                 */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/

WORD32 isvcd_residual_chroma_cb_cr_8x8_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    __m128i rsd_8x16b_r0_0, rsd_8x16b_r0_1, mix_8x16b_r01_0_l, mix_8x16b_r01_1_l,
        rsd_8x16b_r01_b0_cb, rsd_8x16b_r01_b1_cb;
    __m128i rsd_8x16b_r1_0, rsd_8x16b_r1_1, mix_8x16b_r23_0_l, mix_8x16b_r23_1_l,
        rsd_8x16b_r01_b0_cr, rsd_8x16b_r01_b1_cr;
    __m128i rsd_8x16b_r2_0, rsd_8x16b_r2_1, mix_8x16b_r45_0_l, mix_8x16b_r45_1_l,
        rsd_8x16b_r23_b0_cb, rsd_8x16b_r23_b1_cb;
    __m128i rsd_8x16b_r3_0, rsd_8x16b_r3_1, mix_8x16b_r67_0_l, mix_8x16b_r67_1_l,
        rsd_8x16b_r23_b0_cr, rsd_8x16b_r23_b1_cr;
    __m128i rsd_8x16b_r4_0, rsd_8x16b_r4_1, mix_8x16b_r01_0_h, mix_8x16b_r01_1_h,
        rsd_8x16b_r45_b2_cb, rsd_8x16b_r45_b3_cb;
    __m128i rsd_8x16b_r5_0, rsd_8x16b_r5_1, mix_8x16b_r23_0_h, mix_8x16b_r23_1_h,
        rsd_8x16b_r45_b2_cr, rsd_8x16b_r45_b3_cr;
    __m128i rsd_8x16b_r6_0, rsd_8x16b_r6_1, mix_8x16b_r45_0_h, mix_8x16b_r45_1_h,
        rsd_8x16b_r67_b2_cb, rsd_8x16b_r67_b3_cb;
    __m128i rsd_8x16b_r7_0, rsd_8x16b_r7_1, mix_8x16b_r67_0_h, mix_8x16b_r67_1_h,
        rsd_8x16b_r67_b2_cr, rsd_8x16b_r67_b3_cr;

    WORD32 r01_b0_cb, r01_b0_cr;
    WORD32 r23_b0_cb, r23_b0_cr;
    WORD32 r01_b1_cb, r01_b1_cr;
    WORD32 r23_b1_cb, r23_b1_cr;
    WORD32 r45_b2_cb, r45_b2_cr;
    WORD32 r67_b2_cb, r67_b2_cr;
    WORD32 r45_b3_cb, r45_b3_cr;
    WORD32 r67_b3_cb, r67_b3_cr;

    WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;

    __m128i zero_8x16b = _mm_setzero_si128();

    WORD32 rsd_strd2 = (rsd_strd << 1);
    WORD32 rsd_strd4 = (rsd_strd << 2);

    /* rows 0-7 of the interleaved Cb/Cr block: interleaved samples 0-7 */
    rsd_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_r1_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    /* rows 0-7 of the interleaved Cb/Cr block: interleaved samples 8-15 */
    rsd_8x16b_r0_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + 8));
    rsd_8x16b_r1_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd + 8));
    rsd_8x16b_r2_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + 8));
    rsd_8x16b_r3_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd + 8));
    rsd_8x16b_r4_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + 8));
    rsd_8x16b_r5_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd + 8));
    rsd_8x16b_r6_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + 8));
    rsd_8x16b_r7_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd + 8));

    /* interleave pairs of rows to begin separating Cb (even) and Cr (odd) samples */
    mix_8x16b_r01_0_l =
        _mm_unpacklo_epi16(rsd_8x16b_r0_0, rsd_8x16b_r1_0);  // a0 b0 a1 b1 a2 b2 a3 b3
    mix_8x16b_r23_0_l = _mm_unpacklo_epi16(rsd_8x16b_r2_0, rsd_8x16b_r3_0);
    mix_8x16b_r45_0_l = _mm_unpacklo_epi16(rsd_8x16b_r4_0, rsd_8x16b_r5_0);
    mix_8x16b_r67_0_l = _mm_unpacklo_epi16(rsd_8x16b_r6_0, rsd_8x16b_r7_0);
    mix_8x16b_r01_0_h =
        _mm_unpackhi_epi16(rsd_8x16b_r0_0, rsd_8x16b_r1_0);  // a4 b4 a5 b5 a6 b6 a7 b7
    mix_8x16b_r23_0_h = _mm_unpackhi_epi16(rsd_8x16b_r2_0, rsd_8x16b_r3_0);
    mix_8x16b_r45_0_h = _mm_unpackhi_epi16(rsd_8x16b_r4_0, rsd_8x16b_r5_0);
    mix_8x16b_r67_0_h = _mm_unpackhi_epi16(rsd_8x16b_r6_0, rsd_8x16b_r7_0);

    mix_8x16b_r01_1_l =
        _mm_unpacklo_epi16(rsd_8x16b_r0_1, rsd_8x16b_r1_1);  // a8 b8 a9 b9 a10 b10 a11 b11
    mix_8x16b_r23_1_l = _mm_unpacklo_epi16(rsd_8x16b_r2_1, rsd_8x16b_r3_1);
    mix_8x16b_r45_1_l = _mm_unpacklo_epi16(rsd_8x16b_r4_1, rsd_8x16b_r5_1);
    mix_8x16b_r67_1_l = _mm_unpacklo_epi16(rsd_8x16b_r6_1, rsd_8x16b_r7_1);
    mix_8x16b_r01_1_h =
        _mm_unpackhi_epi16(rsd_8x16b_r0_1, rsd_8x16b_r1_1);  // a12 b12 a13 b13 a14 b14 a15 b15
    mix_8x16b_r23_1_h = _mm_unpackhi_epi16(rsd_8x16b_r2_1, rsd_8x16b_r3_1);
    mix_8x16b_r45_1_h = _mm_unpackhi_epi16(rsd_8x16b_r4_1, rsd_8x16b_r5_1);
    mix_8x16b_r67_1_h = _mm_unpackhi_epi16(rsd_8x16b_r6_1, rsd_8x16b_r7_1);

    /* reorder 32-bit lanes so even (Cb) and odd (Cr) pairs land in each half */
    mix_8x16b_r01_0_l = _mm_shuffle_epi32(mix_8x16b_r01_0_l, 0b11011000);  // a0b0 a2b2 a1b1 a3b3
    mix_8x16b_r23_0_l = _mm_shuffle_epi32(mix_8x16b_r23_0_l, 0b11011000);  // c0d0
    mix_8x16b_r45_0_l = _mm_shuffle_epi32(mix_8x16b_r45_0_l, 0b11011000);  // e0f0
    mix_8x16b_r67_0_l = _mm_shuffle_epi32(mix_8x16b_r67_0_l, 0b11011000);  // g0h0
    mix_8x16b_r01_0_h = _mm_shuffle_epi32(mix_8x16b_r01_0_h, 0b11011000);  // a4b4 a6b6 a5b5 a7b7
    mix_8x16b_r23_0_h = _mm_shuffle_epi32(mix_8x16b_r23_0_h, 0b11011000);  // c4d4
    mix_8x16b_r45_0_h = _mm_shuffle_epi32(mix_8x16b_r45_0_h, 0b11011000);  // e4f4
    mix_8x16b_r67_0_h = _mm_shuffle_epi32(mix_8x16b_r67_0_h, 0b11011000);  // g4h4

    mix_8x16b_r01_1_l = _mm_shuffle_epi32(mix_8x16b_r01_1_l, 0b11011000);
    mix_8x16b_r23_1_l = _mm_shuffle_epi32(mix_8x16b_r23_1_l, 0b11011000);
    mix_8x16b_r45_1_l = _mm_shuffle_epi32(mix_8x16b_r45_1_l, 0b11011000);
    mix_8x16b_r67_1_l = _mm_shuffle_epi32(mix_8x16b_r67_1_l, 0b11011000);
    mix_8x16b_r01_1_h = _mm_shuffle_epi32(mix_8x16b_r01_1_h, 0b11011000);
    mix_8x16b_r23_1_h = _mm_shuffle_epi32(mix_8x16b_r23_1_h, 0b11011000);
    mix_8x16b_r45_1_h = _mm_shuffle_epi32(mix_8x16b_r45_1_h, 0b11011000);
    mix_8x16b_r67_1_h = _mm_shuffle_epi32(mix_8x16b_r67_1_h, 0b11011000);

    /* final deinterleave: low 64-bit halves hold Cb, high halves hold Cr */
    rsd_8x16b_r01_b0_cb =
        _mm_unpacklo_epi64(mix_8x16b_r01_0_l, mix_8x16b_r01_0_h);  // a0b0 a2b2 a4b4 a6b6
    rsd_8x16b_r01_b0_cr =
        _mm_unpackhi_epi64(mix_8x16b_r01_0_l, mix_8x16b_r01_0_h);  // a1b1 a3b3 a5b5 a7b7
    rsd_8x16b_r23_b0_cb = _mm_unpacklo_epi64(mix_8x16b_r23_0_l, mix_8x16b_r23_0_h);
    rsd_8x16b_r23_b0_cr = _mm_unpackhi_epi64(mix_8x16b_r23_0_l, mix_8x16b_r23_0_h);
    rsd_8x16b_r45_b2_cb = _mm_unpacklo_epi64(mix_8x16b_r45_0_l, mix_8x16b_r45_0_h);
    rsd_8x16b_r45_b2_cr = _mm_unpackhi_epi64(mix_8x16b_r45_0_l, mix_8x16b_r45_0_h);
    rsd_8x16b_r67_b2_cb = _mm_unpacklo_epi64(mix_8x16b_r67_0_l, mix_8x16b_r67_0_h);
    rsd_8x16b_r67_b2_cr = _mm_unpackhi_epi64(mix_8x16b_r67_0_l, mix_8x16b_r67_0_h);

    rsd_8x16b_r01_b1_cb =
        _mm_unpacklo_epi64(mix_8x16b_r01_1_l, mix_8x16b_r01_1_h);  // a8b8 a10b10 a12b12 a14b14
    rsd_8x16b_r01_b1_cr =
        _mm_unpackhi_epi64(mix_8x16b_r01_1_l, mix_8x16b_r01_1_h);  // a9b9 a11b11 a13b13 a15b15
    rsd_8x16b_r23_b1_cb = _mm_unpacklo_epi64(mix_8x16b_r23_1_l, mix_8x16b_r23_1_h);
    rsd_8x16b_r23_b1_cr = _mm_unpackhi_epi64(mix_8x16b_r23_1_l, mix_8x16b_r23_1_h);
    rsd_8x16b_r45_b3_cb = _mm_unpacklo_epi64(mix_8x16b_r45_1_l, mix_8x16b_r45_1_h);
    rsd_8x16b_r45_b3_cr = _mm_unpackhi_epi64(mix_8x16b_r45_1_l, mix_8x16b_r45_1_h);
    rsd_8x16b_r67_b3_cb = _mm_unpacklo_epi64(mix_8x16b_r67_1_l, mix_8x16b_r67_1_h);
    rsd_8x16b_r67_b3_cr = _mm_unpackhi_epi64(mix_8x16b_r67_1_l, mix_8x16b_r67_1_h);

    r01_b0_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b0_cb, zero_8x16b));
    r23_b0_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b0_cb, zero_8x16b));
    r01_b1_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b1_cb, zero_8x16b));
    r23_b1_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b1_cb, zero_8x16b));
    r45_b2_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b2_cb, zero_8x16b));
    r67_b2_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b2_cb, zero_8x16b));
    r45_b3_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b3_cb, zero_8x16b));
    r67_b3_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b3_cb, zero_8x16b));

    r01_b0_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b0_cr, zero_8x16b));
    r23_b0_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b0_cr, zero_8x16b));
    r01_b1_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b1_cr, zero_8x16b));
    r23_b1_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b1_cr, zero_8x16b));
    r45_b2_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b2_cr, zero_8x16b));
    r67_b2_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b2_cr, zero_8x16b));
    r45_b3_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b3_cr, zero_8x16b));
    r67_b3_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b3_cr, zero_8x16b));

    /* Cr fills bits 4-7 of the nnz map and Cb bits 0-3; within each nibble */
    /* the bit index of the 4x4 block at (block_row, block_col) is          */
    /* (2 * block_row + block_col)                                          */
    i4_nnz_b0 = (!(r01_b0_cr && r23_b0_cr));
    i4_nnz_b1 = (!(r01_b1_cr && r23_b1_cr)) << 1;
    i4_nnz_b2 = (!(r45_b2_cr && r67_b2_cr)) << 2;
    i4_nnz_b3 = (!(r45_b3_cr && r67_b3_cr)) << 3;

    i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
    i4_nnz = i4_nnz << 4;

    i4_nnz_b0 = (!(r01_b0_cb && r23_b0_cb));
    i4_nnz_b1 = (!(r01_b1_cb && r23_b1_cb)) << 1;
    i4_nnz_b2 = (!(r45_b2_cb && r67_b2_cb)) << 2;
    i4_nnz_b3 = (!(r45_b3_cb && r67_b3_cb)) << 3;

    i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
    return i4_nnz;
}
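
/* Illustrative only, not part of the original sources: a minimal scalar    */
/* sketch of the chroma nnz packing above, assuming pi2_rsd holds Cb/Cr     */
/* residuals interleaved sample by sample (Cb at even positions). Each      */
/* plane contributes a 4-bit map whose bit index for the 4x4 block at       */
/* (block_row, block_col) is (2 * block_row + block_col); Cb fills bits     */
/* 0-3 and Cr bits 4-7. The helper name isvcd_residual_chroma_ref is        */
/* hypothetical.                                                            */
static WORD32 isvcd_residual_chroma_ref(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 i4_nnz = 0;
    WORD32 plane, blk_y, blk_x, y, x;

    for(plane = 0; plane < 2; plane++) /* 0 -> Cb, 1 -> Cr */
    {
        for(blk_y = 0; blk_y < 2; blk_y++)
        {
            for(blk_x = 0; blk_x < 2; blk_x++)
            {
                /* plane sample (x, y) lives at offset (2 * x + plane) in each row */
                WORD16 *pi2_blk = pi2_rsd + (blk_y * 4) * rsd_strd + (blk_x * 8) + plane;
                WORD32 nonzero = 0;

                for(y = 0; y < 4; y++)
                {
                    for(x = 0; x < 4; x++)
                    {
                        nonzero |= (pi2_blk[y * rsd_strd + 2 * x] != 0);
                    }
                }
                i4_nnz |= (nonzero << (plane * 4 + blk_y * 2 + blk_x));
            }
        }
    }
    return i4_nnz;
}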