/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_pred_residual_recon_sse42.c
 *
 * @brief
 *  Contains function definitions for pred_residual and recon transform
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_pred_residual_recon_4x4_sse42()
 *  - isvcd_pred_residual_recon_8x8_sse42()
 *  - isvcd_pred_residual_recon_16x16_sse42()
 *  - isvcd_pred_residual_recon_chroma_4x4_sse42()
 *  - isvcd_pred_residual_recon_chroma_8x8_sse42()
 *  - isvcd_residual_luma_4x4_sse42()
 *  - isvcd_residual_luma_8x8_sse42()
 *  - isvcd_residual_luma_16x16_sse42()
 *  - isvcd_residual_chroma_cb_cr_8x8_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/* User include files */
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "isvcd_pred_residual_recon.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_4x4_sse42                       */
/*                                                                           */
/*  Description   : this function computes the recon from                    */
/*                  the residual and pred buffer                             */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

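/* Usage sketch (illustrative only; the buffer sizes and strides below are
 * hypothetical, not part of this file):
 *     UWORD8 au1_pred[4 * 16], au1_out[4 * 16];
 *     WORD16 ai2_rsd[4 * 16];
 *     WORD32 i4_nnz =
 *         isvcd_pred_residual_recon_4x4_sse42(au1_pred, ai2_rsd, au1_out, 16, 16, 16);
 *     // i4_nnz is 1 if any of the 16 residual samples is non-zero, else 0
 */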
WORD32 isvcd_pred_residual_recon_4x4_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                           WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i pred_16x8b_0, pred_8x16b_0, rsd_8x16b_0, out_8x16b_0, out_16x8b_0;
    __m128i pred_16x8b_1, pred_8x16b_1, rsd_8x16b_1, out_8x16b_1, out_16x8b_1;
    __m128i pred_16x8b_2, pred_8x16b_2, rsd_8x16b_2, out_8x16b_2, out_16x8b_2;
    __m128i pred_16x8b_3, pred_8x16b_3, rsd_8x16b_3, out_8x16b_3, out_16x8b_3;
    __m128i rsd_8x16b_01, rsd_8x16b_23;

    __m128i zero_8x16b = _mm_setzero_si128();
    WORD32 i4_nnz, row_01, row_23;

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + (pred_strd << 1)));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + (pred_strd << 1) + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1)));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1) + rsd_strd));

    rsd_8x16b_01 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);

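    /* All-zero test: _mm_cmpeq_epi16 sets a 16-bit lane to all ones where the
     * residual is zero, so _mm_test_all_ones returns 1 iff every residual in
     * the two packed rows is zero */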
    row_01 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01, zero_8x16b));  // return 1 if all zeros, else 0
    row_23 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);

    *((WORD32 *) (pu1_out)) = _mm_cvtsi128_si32(out_16x8b_0);
    *((WORD32 *) (pu1_out + out_strd)) = _mm_cvtsi128_si32(out_16x8b_1);
    *((WORD32 *) (pu1_out + (out_strd << 1))) = _mm_cvtsi128_si32(out_16x8b_2);
    *((WORD32 *) (pu1_out + (out_strd * 3))) = _mm_cvtsi128_si32(out_16x8b_3);
    i4_nnz = !(row_01 && row_23);

    return i4_nnz;
}
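
/* A scalar model of the recon above (sketch only, not part of the build;
 * _mm_packus_epi16 provides the CLIP_U8-style saturation to [0, 255]):
 *     for(i = 0; i < 4; i++)
 *         for(j = 0; j < 4; j++)
 *             pu1_out[i * out_strd + j] =
 *                 CLIP_U8(pu1_pred[i * pred_strd + j] + pi2_rsd[i * rsd_strd + j]);
 */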

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_8x8_sse42                       */
/*                                                                           */
/*  Description   : this function computes the recon from                    */
/*                  the residual and pred buffer                             */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

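/* Unlike the 4x4 kernel, this variant returns nnz as a bitmap with one bit
 * per 4x4 sub-block; see the comment above the i4_nnz_b* shifts below. */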
WORD32 isvcd_pred_residual_recon_8x8_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                           WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i pred_16x8b_0, pred_8x16b_0, rsd_8x16b_0, out_8x16b_0, out_16x8b_0;
    __m128i pred_16x8b_1, pred_8x16b_1, rsd_8x16b_1, out_8x16b_1, out_16x8b_1;
    __m128i pred_16x8b_2, pred_8x16b_2, rsd_8x16b_2, out_8x16b_2, out_16x8b_2;
    __m128i pred_16x8b_3, pred_8x16b_3, rsd_8x16b_3, out_8x16b_3, out_16x8b_3;
    __m128i pred_16x8b_4, pred_8x16b_4, rsd_8x16b_4, out_8x16b_4, out_16x8b_4;
    __m128i pred_16x8b_5, pred_8x16b_5, rsd_8x16b_5, out_8x16b_5, out_16x8b_5;
    __m128i pred_16x8b_6, pred_8x16b_6, rsd_8x16b_6, out_8x16b_6, out_16x8b_6;
    __m128i pred_16x8b_7, pred_8x16b_7, rsd_8x16b_7, out_8x16b_7, out_16x8b_7;
    __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
    __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;

    WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
    WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
    WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;

    __m128i zero_8x16b = _mm_setzero_si128();

    WORD32 pred_strd2 = (pred_strd << 1);
    WORD32 pred_strd4 = (pred_strd << 2);
    WORD32 rsd_strd2 = (rsd_strd << 1);
    WORD32 rsd_strd4 = (rsd_strd << 2);
    WORD32 out_strd2 = (out_strd << 1);
    WORD32 out_strd4 = (out_strd << 2);

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

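    /* Fold the per-4x4 all-zero flags into an nnz bitmap. The shift amounts
     * (0, 1, 4, 5) match bit = 4 * blk_row + blk_col for each 4x4 sub-block
     * in a 16x16 grid, so a top-left 8x8 occupies bits 0, 1, 4 and 5 */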
    i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;

    i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_16x16_sse42                     */
/*                                                                           */
/*  Description   : this function computes the recon from                    */
/*                  the residual and pred buffer                             */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : nnz                                                      */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

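/* The 16x16 block is processed as four 8x8 quadrants (top-left, top-right,
 * bottom-left, bottom-right); each quadrant contributes one nnz bit per 4x4
 * sub-block at bit position 4 * blk_row + blk_col. */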
WORD32 isvcd_pred_residual_recon_16x16_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                             WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i pred_16x8b_0, pred_8x16b_0, rsd_8x16b_0, out_8x16b_0, out_16x8b_0;
    __m128i pred_16x8b_1, pred_8x16b_1, rsd_8x16b_1, out_8x16b_1, out_16x8b_1;
    __m128i pred_16x8b_2, pred_8x16b_2, rsd_8x16b_2, out_8x16b_2, out_16x8b_2;
    __m128i pred_16x8b_3, pred_8x16b_3, rsd_8x16b_3, out_8x16b_3, out_16x8b_3;
    __m128i pred_16x8b_4, pred_8x16b_4, rsd_8x16b_4, out_8x16b_4, out_16x8b_4;
    __m128i pred_16x8b_5, pred_8x16b_5, rsd_8x16b_5, out_8x16b_5, out_16x8b_5;
    __m128i pred_16x8b_6, pred_8x16b_6, rsd_8x16b_6, out_8x16b_6, out_16x8b_6;
    __m128i pred_16x8b_7, pred_8x16b_7, rsd_8x16b_7, out_8x16b_7, out_16x8b_7;
    __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
    __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;

    WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
    WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
    WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;

    __m128i zero_8x16b = _mm_setzero_si128();

    WORD32 pred_strd2 = (pred_strd << 1);
    WORD32 pred_strd4 = (pred_strd << 2);
    WORD32 rsd_strd2 = (rsd_strd << 1);
    WORD32 rsd_strd4 = (rsd_strd << 2);
    WORD32 out_strd2 = (out_strd << 1);
    WORD32 out_strd4 = (out_strd << 2);

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;

    i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);

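    /* advance 8 columns to the top-right quadrant (nnz bits 2, 3, 6, 7) */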
    pu1_pred += 8;
    pi2_rsd += 8;
    pu1_out += 8;

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 2;
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 3;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 6;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 7;

    i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);

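    /* back to column 0, then down 8 rows to the bottom-left quadrant
     * (nnz bits 8, 9, 12, 13) */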
    pu1_pred -= 8;
    pi2_rsd -= 8;
    pu1_out -= 8;

    pu1_pred += (pred_strd << 3);
    pi2_rsd += (rsd_strd << 3);
    pu1_out += (out_strd << 3);

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 8;
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 9;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 12;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 13;

    i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);

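    /* advance 8 columns again to the bottom-right quadrant (nnz bits 10, 11, 14, 15) */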
    pu1_pred += 8;
    pi2_rsd += 8;
    pu1_out += 8;

    pred_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd));
    pred_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2));
    pred_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd2 + pred_strd));
    pred_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4));
    pred_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd));
    pred_16x8b_6 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2));
    pred_16x8b_7 = _mm_loadu_si128((__m128i *) (pu1_pred + pred_strd4 + pred_strd2 + pred_strd));

    pred_8x16b_0 = _mm_cvtepu8_epi16(pred_16x8b_0);
    pred_8x16b_1 = _mm_cvtepu8_epi16(pred_16x8b_1);
    pred_8x16b_2 = _mm_cvtepu8_epi16(pred_16x8b_2);
    pred_8x16b_3 = _mm_cvtepu8_epi16(pred_16x8b_3);
    pred_8x16b_4 = _mm_cvtepu8_epi16(pred_16x8b_4);
    pred_8x16b_5 = _mm_cvtepu8_epi16(pred_16x8b_5);
    pred_8x16b_6 = _mm_cvtepu8_epi16(pred_16x8b_6);
    pred_8x16b_7 = _mm_cvtepu8_epi16(pred_16x8b_7);

    rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
    rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
    rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
    rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
    rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
    rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
    rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));

    rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
    rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
    rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);

    rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
    rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
    rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);

    row_01_b0 = _mm_test_all_ones(
        _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
    row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
    row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
    row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
    row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
    row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
    row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
    row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));

    out_8x16b_0 = _mm_add_epi16(pred_8x16b_0, rsd_8x16b_0);
    out_8x16b_1 = _mm_add_epi16(pred_8x16b_1, rsd_8x16b_1);
    out_8x16b_2 = _mm_add_epi16(pred_8x16b_2, rsd_8x16b_2);
    out_8x16b_3 = _mm_add_epi16(pred_8x16b_3, rsd_8x16b_3);
    out_8x16b_4 = _mm_add_epi16(pred_8x16b_4, rsd_8x16b_4);
    out_8x16b_5 = _mm_add_epi16(pred_8x16b_5, rsd_8x16b_5);
    out_8x16b_6 = _mm_add_epi16(pred_8x16b_6, rsd_8x16b_6);
    out_8x16b_7 = _mm_add_epi16(pred_8x16b_7, rsd_8x16b_7);

    out_16x8b_0 = _mm_packus_epi16(out_8x16b_0, zero_8x16b);
    out_16x8b_1 = _mm_packus_epi16(out_8x16b_1, zero_8x16b);
    out_16x8b_2 = _mm_packus_epi16(out_8x16b_2, zero_8x16b);
    out_16x8b_3 = _mm_packus_epi16(out_8x16b_3, zero_8x16b);
    out_16x8b_4 = _mm_packus_epi16(out_8x16b_4, zero_8x16b);
    out_16x8b_5 = _mm_packus_epi16(out_8x16b_5, zero_8x16b);
    out_16x8b_6 = _mm_packus_epi16(out_8x16b_6, zero_8x16b);
    out_16x8b_7 = _mm_packus_epi16(out_8x16b_7, zero_8x16b);

    _mm_storel_epi64((__m128i *) (pu1_out), out_16x8b_0);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd), out_16x8b_1);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2), out_16x8b_2);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd2 + out_strd), out_16x8b_3);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4), out_16x8b_4);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd), out_16x8b_5);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2), out_16x8b_6);
    _mm_storel_epi64((__m128i *) (pu1_out + out_strd4 + out_strd2 + out_strd), out_16x8b_7);

    i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 10;
    i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 11;
    i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 14;
    i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 15;

    i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
    return i4_nnz;
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_chroma_4x4_sse42                */
/*                                                                           */
/*  Description   : this function computes the recon from                    */
/*                  the residual and pred buffer                             */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

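/* The chroma planes are stored interleaved (alternating Cb/Cr bytes), and one
 * call reconstructs only one of the two components: even-indexed bytes of each
 * output row receive pred + residual, while odd-indexed bytes are preserved
 * from the existing output so the co-sited samples of the other component are
 * left untouched. */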
void isvcd_pred_residual_recon_chroma_4x4_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                                WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i pred0, pred1, pred2, pred3;
    __m128i rsd_r0, rsd_r1, rsd_r2, rsd_r3;
    __m128i zero_16x8b;  // all bits reset to zero
    __m128i chroma_mask_even;
    __m128i chroma_mask_odd;

    zero_16x8b = _mm_setzero_si128();

    rsd_r0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_r1 = _mm_loadu_si128((__m128i *) (pi2_rsd + (1 * rsd_strd)));
    rsd_r2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (2 * rsd_strd)));
    rsd_r3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (3 * rsd_strd)));

    pred_r0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred_r1 = _mm_loadu_si128((__m128i *) (pu1_pred + (1 * pred_strd)));
    pred_r2 = _mm_loadu_si128((__m128i *) (pu1_pred + (2 * pred_strd)));
    pred_r3 = _mm_loadu_si128((__m128i *) (pu1_pred + (3 * pred_strd)));

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_out));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (1 * out_strd)));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_out + (2 * out_strd)));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_out + (3 * out_strd)));

    pred0 = _mm_cvtepu8_epi16(pred_r0);
    pred1 = _mm_cvtepu8_epi16(pred_r1);
    pred2 = _mm_cvtepu8_epi16(pred_r2);
    pred3 = _mm_cvtepu8_epi16(pred_r3);

    pred0 = _mm_add_epi16(pred0, rsd_r0);
    pred1 = _mm_add_epi16(pred1, rsd_r1);
    pred2 = _mm_add_epi16(pred2, rsd_r2);
    pred3 = _mm_add_epi16(pred3, rsd_r3);

    pred0 = _mm_packus_epi16(pred0, zero_16x8b);
    pred1 = _mm_packus_epi16(pred1, zero_16x8b);
    pred2 = _mm_packus_epi16(pred2, zero_16x8b);
    pred3 = _mm_packus_epi16(pred3, zero_16x8b);

    chroma_mask_even = _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff,
                                    0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
    chroma_mask_odd = _mm_set_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0xff,
                                   0x00, 0xff, 0x00, 0xff, 0x00);

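    /* masked merge: keep the odd-indexed bytes of the existing output and the
     * even-indexed bytes of the new reconstruction; the masked-out lanes are
     * zero, so _mm_add_epi8 combines the two rows without carries */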
    src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
    src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
    src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
    src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);

    pred0 = _mm_and_si128(pred0, chroma_mask_even);  // val 0 val 0 ..
    pred1 = _mm_and_si128(pred1, chroma_mask_even);
    pred2 = _mm_and_si128(pred2, chroma_mask_even);
    pred3 = _mm_and_si128(pred3, chroma_mask_even);

    src_r0 = _mm_add_epi8(src_r0, pred0);  // macro  src1 macro src2 macro ...
    src_r1 = _mm_add_epi8(src_r1, pred1);
    src_r2 = _mm_add_epi8(src_r2, pred2);
    src_r3 = _mm_add_epi8(src_r3, pred3);

    _mm_storel_epi64((__m128i *) (&pu1_out[0]), src_r0);
    _mm_storel_epi64((__m128i *) (&pu1_out[out_strd]), src_r1);
    _mm_storel_epi64((__m128i *) (&pu1_out[2 * out_strd]), src_r2);
    _mm_storel_epi64((__m128i *) (&pu1_out[3 * out_strd]), src_r3);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_pred_residual_recon_chroma_8x8_sse42                */
/*                                                                           */
/*  Description   : this function computes the recon from                    */
/*                  the residual and pred buffer                             */
/*  Inputs        :                                                          */
/*  Globals       : none                                                     */
/*  Processing    :                                                          */
/*                                                                           */
/*  Outputs       : none                                                     */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         25 11 2021   Kishore               creation                       */
/*                                                                           */
/*****************************************************************************/

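/* Same masked-merge scheme as the 4x4 chroma kernel above, applied to an 8x8
 * chroma block: each 16-byte interleaved row is processed in two 8-byte halves. */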
void isvcd_pred_residual_recon_chroma_8x8_sse42(UWORD8 *pu1_pred, WORD16 *pi2_rsd, UWORD8 *pu1_out,
                                                WORD32 pred_strd, WORD32 rsd_strd, WORD32 out_strd)
{
    __m128i src_r0, src_r1, src_r2, src_r3, src_r4, src_r5, src_r6, src_r7;
    __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
    __m128i rsd_r0, rsd_r1, rsd_r2, rsd_r3, rsd_r4, rsd_r5, rsd_r6, rsd_r7;
    __m128i zero_16x8b;  // all bits reset to zero
    __m128i chroma_mask_even;
    __m128i chroma_mask_odd;

    zero_16x8b = _mm_setzero_si128();

    rsd_r0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
    rsd_r1 = _mm_loadu_si128((__m128i *) (pi2_rsd + (1 * rsd_strd)));
    rsd_r2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (2 * rsd_strd)));
    rsd_r3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (3 * rsd_strd)));
    rsd_r4 = _mm_loadu_si128((__m128i *) (pi2_rsd + (4 * rsd_strd)));
    rsd_r5 = _mm_loadu_si128((__m128i *) (pi2_rsd + (5 * rsd_strd)));
    rsd_r6 = _mm_loadu_si128((__m128i *) (pi2_rsd + (6 * rsd_strd)));
    rsd_r7 = _mm_loadu_si128((__m128i *) (pi2_rsd + (7 * rsd_strd)));

    pred0 = _mm_loadu_si128((__m128i *) (pu1_pred));
    pred1 = _mm_loadu_si128((__m128i *) (pu1_pred + (1 * pred_strd)));
    pred2 = _mm_loadu_si128((__m128i *) (pu1_pred + (2 * pred_strd)));
    pred3 = _mm_loadu_si128((__m128i *) (pu1_pred + (3 * pred_strd)));
    pred4 = _mm_loadu_si128((__m128i *) (pu1_pred + (4 * pred_strd)));
    pred5 = _mm_loadu_si128((__m128i *) (pu1_pred + (5 * pred_strd)));
    pred6 = _mm_loadu_si128((__m128i *) (pu1_pred + (6 * pred_strd)));
    pred7 = _mm_loadu_si128((__m128i *) (pu1_pred + (7 * pred_strd)));

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_out));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (1 * out_strd)));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_out + (2 * out_strd)));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_out + (3 * out_strd)));
    src_r4 = _mm_loadu_si128((__m128i *) (pu1_out + (4 * out_strd)));
    src_r5 = _mm_loadu_si128((__m128i *) (pu1_out + (5 * out_strd)));
    src_r6 = _mm_loadu_si128((__m128i *) (pu1_out + (6 * out_strd)));
    src_r7 = _mm_loadu_si128((__m128i *) (pu1_out + (7 * out_strd)));

    pred0 = _mm_cvtepu8_epi16(pred0);
    pred1 = _mm_cvtepu8_epi16(pred1);
    pred2 = _mm_cvtepu8_epi16(pred2);
    pred3 = _mm_cvtepu8_epi16(pred3);
    pred4 = _mm_cvtepu8_epi16(pred4);
    pred5 = _mm_cvtepu8_epi16(pred5);
    pred6 = _mm_cvtepu8_epi16(pred6);
    pred7 = _mm_cvtepu8_epi16(pred7);

    pred0 = _mm_add_epi16(pred0, rsd_r0);
    pred1 = _mm_add_epi16(pred1, rsd_r1);
    pred2 = _mm_add_epi16(pred2, rsd_r2);
    pred3 = _mm_add_epi16(pred3, rsd_r3);
    pred4 = _mm_add_epi16(pred4, rsd_r4);
    pred5 = _mm_add_epi16(pred5, rsd_r5);
    pred6 = _mm_add_epi16(pred6, rsd_r6);
    pred7 = _mm_add_epi16(pred7, rsd_r7);

    pred0 = _mm_packus_epi16(pred0, zero_16x8b);
    pred1 = _mm_packus_epi16(pred1, zero_16x8b);
    pred2 = _mm_packus_epi16(pred2, zero_16x8b);
    pred3 = _mm_packus_epi16(pred3, zero_16x8b);
    pred4 = _mm_packus_epi16(pred4, zero_16x8b);
    pred5 = _mm_packus_epi16(pred5, zero_16x8b);
    pred6 = _mm_packus_epi16(pred6, zero_16x8b);
    pred7 = _mm_packus_epi16(pred7, zero_16x8b);

    chroma_mask_even = _mm_set_epi8(0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
                                    0x00, 0xff, 0x00, 0xff, 0x00, 0xff);
    chroma_mask_odd = _mm_set_epi8(0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff,
                                   0x00, 0xff, 0x00, 0xff, 0x00);

    src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
    src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
    src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
    src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);
    src_r4 = _mm_and_si128(src_r4, chroma_mask_odd);
    src_r5 = _mm_and_si128(src_r5, chroma_mask_odd);
    src_r6 = _mm_and_si128(src_r6, chroma_mask_odd);
    src_r7 = _mm_and_si128(src_r7, chroma_mask_odd);

    pred0 = _mm_and_si128(pred0, chroma_mask_even);  // val 0 val 0 ..
    pred1 = _mm_and_si128(pred1, chroma_mask_even);
    pred2 = _mm_and_si128(pred2, chroma_mask_even);
    pred3 = _mm_and_si128(pred3, chroma_mask_even);
    pred4 = _mm_and_si128(pred4, chroma_mask_even);
    pred5 = _mm_and_si128(pred5, chroma_mask_even);
    pred6 = _mm_and_si128(pred6, chroma_mask_even);
    pred7 = _mm_and_si128(pred7, chroma_mask_even);

    src_r0 = _mm_add_epi8(src_r0, pred0);  // macro  src1 macro src2 macro ...
    src_r1 = _mm_add_epi8(src_r1, pred1);
    src_r2 = _mm_add_epi8(src_r2, pred2);
    src_r3 = _mm_add_epi8(src_r3, pred3);
    src_r4 = _mm_add_epi8(src_r4, pred4);
    src_r5 = _mm_add_epi8(src_r5, pred5);
    src_r6 = _mm_add_epi8(src_r6, pred6);
    src_r7 = _mm_add_epi8(src_r7, pred7);

    _mm_storel_epi64((__m128i *) (&pu1_out[0]), src_r0);
    _mm_storel_epi64((__m128i *) (&pu1_out[out_strd]), src_r1);
    _mm_storel_epi64((__m128i *) (&pu1_out[2 * out_strd]), src_r2);
    _mm_storel_epi64((__m128i *) (&pu1_out[3 * out_strd]), src_r3);
    _mm_storel_epi64((__m128i *) (&pu1_out[4 * out_strd]), src_r4);
    _mm_storel_epi64((__m128i *) (&pu1_out[5 * out_strd]), src_r5);
    _mm_storel_epi64((__m128i *) (&pu1_out[6 * out_strd]), src_r6);
    _mm_storel_epi64((__m128i *) (&pu1_out[7 * out_strd]), src_r7);

    /* load and repeat for the last 4 interleaved sample pairs (bytes 8..15) of each row */

    rsd_r0 = _mm_loadu_si128((__m128i *) (pi2_rsd + 8));
    rsd_r1 = _mm_loadu_si128((__m128i *) (pi2_rsd + (1 * rsd_strd) + 8));
    rsd_r2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (2 * rsd_strd) + 8));
    rsd_r3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (3 * rsd_strd) + 8));
    rsd_r4 = _mm_loadu_si128((__m128i *) (pi2_rsd + (4 * rsd_strd) + 8));
    rsd_r5 = _mm_loadu_si128((__m128i *) (pi2_rsd + (5 * rsd_strd) + 8));
    rsd_r6 = _mm_loadu_si128((__m128i *) (pi2_rsd + (6 * rsd_strd) + 8));
    rsd_r7 = _mm_loadu_si128((__m128i *) (pi2_rsd + (7 * rsd_strd) + 8));

    pred0 = _mm_loadu_si128((__m128i *) (pu1_pred + 8));
    pred1 = _mm_loadu_si128((__m128i *) (pu1_pred + (1 * pred_strd) + 8));
    pred2 = _mm_loadu_si128((__m128i *) (pu1_pred + (2 * pred_strd) + 8));
    pred3 = _mm_loadu_si128((__m128i *) (pu1_pred + (3 * pred_strd) + 8));
    pred4 = _mm_loadu_si128((__m128i *) (pu1_pred + (4 * pred_strd) + 8));
    pred5 = _mm_loadu_si128((__m128i *) (pu1_pred + (5 * pred_strd) + 8));
    pred6 = _mm_loadu_si128((__m128i *) (pu1_pred + (6 * pred_strd) + 8));
    pred7 = _mm_loadu_si128((__m128i *) (pu1_pred + (7 * pred_strd) + 8));

    src_r0 = _mm_loadu_si128((__m128i *) (pu1_out + 8));
    src_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (1 * out_strd) + 8));
    src_r2 = _mm_loadu_si128((__m128i *) (pu1_out + (2 * out_strd) + 8));
    src_r3 = _mm_loadu_si128((__m128i *) (pu1_out + (3 * out_strd) + 8));
    src_r4 = _mm_loadu_si128((__m128i *) (pu1_out + (4 * out_strd) + 8));
    src_r5 = _mm_loadu_si128((__m128i *) (pu1_out + (5 * out_strd) + 8));
    src_r6 = _mm_loadu_si128((__m128i *) (pu1_out + (6 * out_strd) + 8));
    src_r7 = _mm_loadu_si128((__m128i *) (pu1_out + (7 * out_strd) + 8));

    pred0 = _mm_cvtepu8_epi16(pred0);
    pred1 = _mm_cvtepu8_epi16(pred1);
    pred2 = _mm_cvtepu8_epi16(pred2);
    pred3 = _mm_cvtepu8_epi16(pred3);
    pred4 = _mm_cvtepu8_epi16(pred4);
    pred5 = _mm_cvtepu8_epi16(pred5);
    pred6 = _mm_cvtepu8_epi16(pred6);
    pred7 = _mm_cvtepu8_epi16(pred7);

    pred0 = _mm_add_epi16(pred0, rsd_r0);
    pred1 = _mm_add_epi16(pred1, rsd_r1);
    pred2 = _mm_add_epi16(pred2, rsd_r2);
    pred3 = _mm_add_epi16(pred3, rsd_r3);
    pred4 = _mm_add_epi16(pred4, rsd_r4);
    pred5 = _mm_add_epi16(pred5, rsd_r5);
    pred6 = _mm_add_epi16(pred6, rsd_r6);
    pred7 = _mm_add_epi16(pred7, rsd_r7);

    pred0 = _mm_packus_epi16(pred0, zero_16x8b);
    pred1 = _mm_packus_epi16(pred1, zero_16x8b);
    pred2 = _mm_packus_epi16(pred2, zero_16x8b);
    pred3 = _mm_packus_epi16(pred3, zero_16x8b);
    pred4 = _mm_packus_epi16(pred4, zero_16x8b);
    pred5 = _mm_packus_epi16(pred5, zero_16x8b);
    pred6 = _mm_packus_epi16(pred6, zero_16x8b);
    pred7 = _mm_packus_epi16(pred7, zero_16x8b);

    src_r0 = _mm_and_si128(src_r0, chroma_mask_odd);  // 0 src1 0 src2 0 ...
    src_r1 = _mm_and_si128(src_r1, chroma_mask_odd);
    src_r2 = _mm_and_si128(src_r2, chroma_mask_odd);
    src_r3 = _mm_and_si128(src_r3, chroma_mask_odd);
    src_r4 = _mm_and_si128(src_r4, chroma_mask_odd);
    src_r5 = _mm_and_si128(src_r5, chroma_mask_odd);
    src_r6 = _mm_and_si128(src_r6, chroma_mask_odd);
    src_r7 = _mm_and_si128(src_r7, chroma_mask_odd);

    pred0 = _mm_and_si128(pred0, chroma_mask_even);  // val 0 val 0 ..
    pred1 = _mm_and_si128(pred1, chroma_mask_even);
    pred2 = _mm_and_si128(pred2, chroma_mask_even);
    pred3 = _mm_and_si128(pred3, chroma_mask_even);
    pred4 = _mm_and_si128(pred4, chroma_mask_even);
    pred5 = _mm_and_si128(pred5, chroma_mask_even);
    pred6 = _mm_and_si128(pred6, chroma_mask_even);
    pred7 = _mm_and_si128(pred7, chroma_mask_even);

    src_r0 = _mm_add_epi8(src_r0, pred0);  // macro  src1 macro src2 macro ...
    src_r1 = _mm_add_epi8(src_r1, pred1);
    src_r2 = _mm_add_epi8(src_r2, pred2);
    src_r3 = _mm_add_epi8(src_r3, pred3);
    src_r4 = _mm_add_epi8(src_r4, pred4);
    src_r5 = _mm_add_epi8(src_r5, pred5);
    src_r6 = _mm_add_epi8(src_r6, pred6);
    src_r7 = _mm_add_epi8(src_r7, pred7);

    _mm_storel_epi64((__m128i *) (&pu1_out[0] + 8), src_r0);
    _mm_storel_epi64((__m128i *) (&pu1_out[out_strd] + 8), src_r1);
    _mm_storel_epi64((__m128i *) (&pu1_out[(2 * out_strd)] + 8), src_r2);
    _mm_storel_epi64((__m128i *) (&pu1_out[(3 * out_strd)] + 8), src_r3);
    _mm_storel_epi64((__m128i *) (&pu1_out[(4 * out_strd)] + 8), src_r4);
    _mm_storel_epi64((__m128i *) (&pu1_out[(5 * out_strd)] + 8), src_r5);
    _mm_storel_epi64((__m128i *) (&pu1_out[(6 * out_strd)] + 8), src_r6);
    _mm_storel_epi64((__m128i *) (&pu1_out[(7 * out_strd)] + 8), src_r7);
}
969 
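/* The store sequence above relies on a small SIMD idiom: when two byte
 * vectors are masked with complementary (disjoint) masks, _mm_add_epi8 is
 * equivalent to a bitwise OR, since every lane is zero in at least one
 * operand. A minimal, illustrative sketch (not part of the build; the
 * helper name is hypothetical):
 */
#if 0
static __m128i merge_even_odd_bytes(__m128i even_vals, __m128i odd_vals,
                                    __m128i even_mask, __m128i odd_mask)
{
    __m128i e = _mm_and_si128(even_vals, even_mask); /* val 0 val 0 ... */
    __m128i o = _mm_and_si128(odd_vals, odd_mask);   /* 0 val 0 val ... */
    return _mm_or_si128(e, o); /* same result as _mm_add_epi8(e, o) */
}
#endif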
970 /*****************************************************************************/
971 /*                                                                           */
972 /*  Function Name : isvcd_residual_luma_4x4_sse42                            */
973 /*                                                                           */
974 /*  Description   : this function computes the nnz from the residual         */
975 /*                                                                           */
976 /*  Inputs        :                                                          */
977 /*  Globals       : none                                                     */
978 /*  Processing    :                                                          */
979 /*                                                                           */
980 /*  Outputs       : none                                                     */
981 /*  Returns       : nnz                                                      */
982 /*                                                                           */
983 /*  Issues        : none                                                     */
984 /*                                                                           */
985 /*  Revision History:                                                        */
986 /*                                                                           */
987 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
988 /*         25 11 2021   Kishore               creation                       */
989 /*                                                                           */
990 /*****************************************************************************/
991 
992 WORD32 isvcd_residual_luma_4x4_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
993 {
994     __m128i rsd_8x16b_0;
995     __m128i rsd_8x16b_1;
996     __m128i rsd_8x16b_2;
997     __m128i rsd_8x16b_3;
998     __m128i rsd_8x16b_01, rsd_8x16b_23;
999 
1000     __m128i zero_8x16b = _mm_setzero_si128();
1001     WORD32 i4_nnz, row_01, row_23;
1002 
1003     rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1004     rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1005     rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1)));
1006     rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + (rsd_strd << 1) + rsd_strd));
1007 
1008     rsd_8x16b_01 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1009     rsd_8x16b_23 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1010 
1011     row_01 = _mm_test_all_ones(
1012         _mm_cmpeq_epi16(rsd_8x16b_01, zero_8x16b));  // return 1 if all zeros, else 0
1013     row_23 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23, zero_8x16b));
1014 
1015     i4_nnz = !(row_01 && row_23);
1016     return i4_nnz;
1017 }
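/* For reference, a scalar sketch equivalent to the SSE4.2 routine above
 * (illustrative only, not part of the build; the helper name is
 * hypothetical): it returns 1 if any of the 16 residual samples is
 * non-zero, else 0.
 */
#if 0
static WORD32 isvcd_residual_luma_4x4_ref(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 i, j;
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            if(pi2_rsd[i * rsd_strd + j] != 0) return 1;
        }
    }
    return 0;
}
#endif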
1018 /*****************************************************************************/
1019 /*                                                                           */
1020 /*  Function Name : isvcd_residual_luma_8x8_sse42                            */
1021 /*                                                                           */
1022 /*  Description   : this function computes the nnz from the residual         */
1023 /*                                                                           */
1024 /*  Inputs        :                                                          */
1025 /*  Globals       : none                                                     */
1026 /*  Processing    :                                                          */
1027 /*                                                                           */
1028 /*  Outputs       : none                                                     */
1029 /*  Returns       : nnz                                                      */
1030 /*                                                                           */
1031 /*  Issues        : none                                                     */
1032 /*                                                                           */
1033 /*  Revision History:                                                        */
1034 /*                                                                           */
1035 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1036 /*         25 11 2021   Kishore               creation                       */
1037 /*                                                                           */
1038 /*****************************************************************************/
1039 
1040 WORD32 isvcd_residual_luma_8x8_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
1041 {
1042     __m128i rsd_8x16b_0;
1043     __m128i rsd_8x16b_1;
1044     __m128i rsd_8x16b_2;
1045     __m128i rsd_8x16b_3;
1046     __m128i rsd_8x16b_4;
1047     __m128i rsd_8x16b_5;
1048     __m128i rsd_8x16b_6;
1049     __m128i rsd_8x16b_7;
1050     __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
1051     __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;
1052 
1053     WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
1054     WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
1055     WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;
1056 
1057     __m128i zero_8x16b = _mm_setzero_si128();
1058 
1059     WORD32 rsd_strd2 = (rsd_strd << 1);
1060     WORD32 rsd_strd4 = (rsd_strd << 2);
1061 
1062     rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1063     rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1064     rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1065     rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1066     rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1067     rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1068     rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1069     rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1070 
1071     rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1072     rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1073     rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1074     rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1075 
1076     rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1077     rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1078     rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1079     rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1080 
1081     row_01_b0 = _mm_test_all_ones(
1082         _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
1083     row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1084     row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1085     row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1086     row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1087     row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1088     row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1089     row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1090 
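    /* Each 4x4 sub-block contributes one nnz flag; the bit positions
     * (0, 1, 4, 5) match the raster index (4 * y4 + x4) the sub-blocks
     * would have inside a 16x16 macroblock, so this result can be OR-ed
     * directly with the other quadrants (see the 16x16 variant below). */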
1091     i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
1092     i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
1093     i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
1094     i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;
1095 
1096     i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1097     return i4_nnz;
1098 }
1099 
1100 /*****************************************************************************/
1101 /*                                                                           */
1102 /*  Function Name : isvcd_residual_luma_16x16_sse42                          */
1103 /*                                                                           */
1104 /*  Description   : this function computes the nnz from the residual         */
1105 /*                                                                           */
1106 /*  Inputs        :                                                          */
1107 /*  Globals       : none                                                     */
1108 /*  Processing    :                                                          */
1109 /*                                                                           */
1110 /*  Outputs       : none                                                     */
1111 /*  Returns       : nnz                                                      */
1112 /*                                                                           */
1113 /*  Issues        : none                                                     */
1114 /*                                                                           */
1115 /*  Revision History:                                                        */
1116 /*                                                                           */
1117 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1118 /*         25 11 2021   Kishore               creation                       */
1119 /*                                                                           */
1120 /*****************************************************************************/
1121 
1122 WORD32 isvcd_residual_luma_16x16_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
1123 {
1124     __m128i rsd_8x16b_0;
1125     __m128i rsd_8x16b_1;
1126     __m128i rsd_8x16b_2;
1127     __m128i rsd_8x16b_3;
1128     __m128i rsd_8x16b_4;
1129     __m128i rsd_8x16b_5;
1130     __m128i rsd_8x16b_6;
1131     __m128i rsd_8x16b_7;
1132     __m128i rsd_8x16b_01_b0, rsd_8x16b_23_b0, rsd_8x16b_45_b2, rsd_8x16b_67_b2;
1133     __m128i rsd_8x16b_01_b1, rsd_8x16b_23_b1, rsd_8x16b_45_b3, rsd_8x16b_67_b3;
1134 
1135     WORD32 row_01_b0, row_23_b0, row_45_b2, row_67_b2;
1136     WORD32 row_01_b1, row_23_b1, row_45_b3, row_67_b3;
1137     WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;
1138 
1139     __m128i zero_8x16b = _mm_setzero_si128();
1140 
1141     WORD32 rsd_strd2 = (rsd_strd << 1);
1142     WORD32 rsd_strd4 = (rsd_strd << 2);
1143 
1144     rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1145     rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1146     rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1147     rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1148     rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1149     rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1150     rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1151     rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1152 
1153     rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1154     rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1155     rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1156     rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1157     rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1158     rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1159     rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1160     rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1161 
1162     row_01_b0 = _mm_test_all_ones(
1163         _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
1164     row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1165     row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1166     row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1167     row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1168     row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1169     row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1170     row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1171 
1172     i4_nnz_b0 = (!(row_01_b0 && row_23_b0));
1173     i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 1;
1174     i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 4;
1175     i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 5;
1176 
1177     i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1178 
1179     pi2_rsd += 8;
1180 
1181     rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1182     rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1183     rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1184     rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1185     rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1186     rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1187     rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1188     rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1189 
1190     rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1191     rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1192     rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1193     rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1194 
1195     rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1196     rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1197     rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1198     rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1199 
1200     row_01_b0 = _mm_test_all_ones(
1201         _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
1202     row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1203     row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1204     row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1205     row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1206     row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1207     row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1208     row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1209 
1210     i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 2;
1211     i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 3;
1212     i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 6;
1213     i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 7;
1214 
1215     i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1216 
1217     pi2_rsd -= 8;
1218     pi2_rsd += (rsd_strd << 3);
1219 
1220     rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1221     rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1222     rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1223     rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1224     rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1225     rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1226     rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1227     rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1228 
1229     rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1230     rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1231     rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1232     rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1233 
1234     rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1235     rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1236     rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1237     rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1238 
1239     row_01_b0 = _mm_test_all_ones(
1240         _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
1241     row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1242     row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1243     row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1244     row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1245     row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1246     row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1247     row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1248 
1249     i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 8;
1250     i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 9;
1251     i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 12;
1252     i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 13;
1253 
1254     i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1255 
1256     pi2_rsd += 8;
1257 
1258     rsd_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1259     rsd_8x16b_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1260     rsd_8x16b_2 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1261     rsd_8x16b_3 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1262     rsd_8x16b_4 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1263     rsd_8x16b_5 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1264     rsd_8x16b_6 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1265     rsd_8x16b_7 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1266 
1267     rsd_8x16b_01_b0 = _mm_unpacklo_epi64(rsd_8x16b_0, rsd_8x16b_1);
1268     rsd_8x16b_23_b0 = _mm_unpacklo_epi64(rsd_8x16b_2, rsd_8x16b_3);
1269     rsd_8x16b_01_b1 = _mm_unpackhi_epi64(rsd_8x16b_0, rsd_8x16b_1);
1270     rsd_8x16b_23_b1 = _mm_unpackhi_epi64(rsd_8x16b_2, rsd_8x16b_3);
1271 
1272     rsd_8x16b_45_b2 = _mm_unpacklo_epi64(rsd_8x16b_4, rsd_8x16b_5);
1273     rsd_8x16b_67_b2 = _mm_unpacklo_epi64(rsd_8x16b_6, rsd_8x16b_7);
1274     rsd_8x16b_45_b3 = _mm_unpackhi_epi64(rsd_8x16b_4, rsd_8x16b_5);
1275     rsd_8x16b_67_b3 = _mm_unpackhi_epi64(rsd_8x16b_6, rsd_8x16b_7);
1276 
1277     row_01_b0 = _mm_test_all_ones(
1278         _mm_cmpeq_epi16(rsd_8x16b_01_b0, zero_8x16b));  // return 1 if all zeros, else 0
1279     row_23_b0 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b0, zero_8x16b));
1280     row_01_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_01_b1, zero_8x16b));
1281     row_23_b1 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_23_b1, zero_8x16b));
1282     row_45_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b2, zero_8x16b));
1283     row_67_b2 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b2, zero_8x16b));
1284     row_45_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_45_b3, zero_8x16b));
1285     row_67_b3 = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_67_b3, zero_8x16b));
1286 
1287     i4_nnz_b0 = (!(row_01_b0 && row_23_b0)) << 10;
1288     i4_nnz_b1 = (!(row_01_b1 && row_23_b1)) << 11;
1289     i4_nnz_b2 = (!(row_45_b2 && row_67_b2)) << 14;
1290     i4_nnz_b3 = (!(row_45_b3 && row_67_b3)) << 15;
1291 
1292     i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1293     return i4_nnz;
1294 }
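/* Bit layout of the returned nnz word: bit (4 * y4 + x4) is set when the
 * 4x4 sub-block at raster position (x4, y4) has a non-zero residual:
 *
 *        x4:   0    1    2    3
 *   y4 = 0    b0   b1   b2   b3
 *   y4 = 1    b4   b5   b6   b7
 *   y4 = 2    b8   b9   b10  b11
 *   y4 = 3    b12  b13  b14  b15
 *
 * A scalar sketch of the same computation (illustrative only, not part of
 * the build; the helper name is hypothetical):
 */
#if 0
static WORD32 isvcd_residual_luma_16x16_ref(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 x4, y4, i, j, i4_nnz = 0;
    for(y4 = 0; y4 < 4; y4++)
    {
        for(x4 = 0; x4 < 4; x4++)
        {
            WORD16 *pi2_blk = pi2_rsd + (y4 * 4) * rsd_strd + (x4 * 4);
            for(i = 0; i < 4; i++)
            {
                for(j = 0; j < 4; j++)
                {
                    if(pi2_blk[i * rsd_strd + j] != 0)
                        i4_nnz |= 1 << (4 * y4 + x4);
                }
            }
        }
    }
    return i4_nnz;
}
#endif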
1295 
1296 /*****************************************************************************/
1297 /*                                                                           */
1298 /*  Function Name : isvcd_residual_chroma_cb_cr_8x8_sse42                    */
1299 /*                                                                           */
1300 /*  Description   : this function computes the nnz from the residual         */
1301 /*                                                                           */
1302 /*  Inputs        :                                                          */
1303 /*  Globals       : none                                                     */
1304 /*  Processing    :                                                          */
1305 /*                                                                           */
1306 /*  Outputs       : none                                                     */
1307 /*  Returns       : nnz                                                      */
1308 /*                                                                           */
1309 /*  Issues        : none                                                     */
1310 /*                                                                           */
1311 /*  Revision History:                                                        */
1312 /*                                                                           */
1313 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1314 /*         25 11 2021   Kishore               creation                       */
1315 /*                                                                           */
1316 /*****************************************************************************/
1317 
1318 WORD32 isvcd_residual_chroma_cb_cr_8x8_sse42(WORD16 *pi2_rsd, WORD32 rsd_strd)
1319 {
1320     __m128i rsd_8x16b_r0_0, rsd_8x16b_r0_1, mix_8x16b_r01_0_l, mix_8x16b_r01_1_l,
1321         rsd_8x16b_r01_b0_cb, rsd_8x16b_r01_b1_cb;
1322     __m128i rsd_8x16b_r1_0, rsd_8x16b_r1_1, mix_8x16b_r23_0_l, mix_8x16b_r23_1_l,
1323         rsd_8x16b_r01_b0_cr, rsd_8x16b_r01_b1_cr;
1324     __m128i rsd_8x16b_r2_0, rsd_8x16b_r2_1, mix_8x16b_r45_0_l, mix_8x16b_r45_1_l,
1325         rsd_8x16b_r23_b0_cb, rsd_8x16b_r23_b1_cb;
1326     __m128i rsd_8x16b_r3_0, rsd_8x16b_r3_1, mix_8x16b_r67_0_l, mix_8x16b_r67_1_l,
1327         rsd_8x16b_r23_b0_cr, rsd_8x16b_r23_b1_cr;
1328     __m128i rsd_8x16b_r4_0, rsd_8x16b_r4_1, mix_8x16b_r01_0_h, mix_8x16b_r01_1_h,
1329         rsd_8x16b_r45_b2_cb, rsd_8x16b_r45_b3_cb;
1330     __m128i rsd_8x16b_r5_0, rsd_8x16b_r5_1, mix_8x16b_r23_0_h, mix_8x16b_r23_1_h,
1331         rsd_8x16b_r45_b2_cr, rsd_8x16b_r45_b3_cr;
1332     __m128i rsd_8x16b_r6_0, rsd_8x16b_r6_1, mix_8x16b_r45_0_h, mix_8x16b_r45_1_h,
1333         rsd_8x16b_r67_b2_cb, rsd_8x16b_r67_b3_cb;
1334     __m128i rsd_8x16b_r7_0, rsd_8x16b_r7_1, mix_8x16b_r67_0_h, mix_8x16b_r67_1_h,
1335         rsd_8x16b_r67_b2_cr, rsd_8x16b_r67_b3_cr;
1336 
1337     WORD32 r01_b0_cb, r01_b0_cr;
1338     WORD32 r23_b0_cb, r23_b0_cr;
1339     WORD32 r01_b1_cb, r01_b1_cr;
1340     WORD32 r23_b1_cb, r23_b1_cr;
1341     WORD32 r45_b2_cb, r45_b2_cr;
1342     WORD32 r67_b2_cb, r67_b2_cr;
1343     WORD32 r45_b3_cb, r45_b3_cr;
1344     WORD32 r67_b3_cb, r67_b3_cr;
1345 
1346     WORD32 i4_nnz, i4_nnz_b0, i4_nnz_b1, i4_nnz_b2, i4_nnz_b3;
1347 
1348     __m128i zero_8x16b = _mm_setzero_si128();
1349 
1350     WORD32 rsd_strd2 = (rsd_strd << 1);
1351     WORD32 rsd_strd4 = (rsd_strd << 2);
1352 
1353     rsd_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pi2_rsd));
1354     rsd_8x16b_r1_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd));
1355     rsd_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2));
1356     rsd_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd));
1357     rsd_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4));
1358     rsd_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd));
1359     rsd_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2));
1360     rsd_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd));
1361 
1362     rsd_8x16b_r0_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + 8));
1363     rsd_8x16b_r1_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd + 8));
1364     rsd_8x16b_r2_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + 8));
1365     rsd_8x16b_r3_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd2 + rsd_strd + 8));
1366     rsd_8x16b_r4_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + 8));
1367     rsd_8x16b_r5_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd + 8));
1368     rsd_8x16b_r6_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + 8));
1369     rsd_8x16b_r7_1 = _mm_loadu_si128((__m128i *) (pi2_rsd + rsd_strd4 + rsd_strd2 + rsd_strd + 8));
1370 
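    /* The residual buffer holds Cb and Cr interleaved in alternate 16-bit
     * lanes. The unpack/shuffle sequence below separates them: rows are
     * first interleaved pairwise, the 32-bit shuffle groups even- and
     * odd-lane pairs, and the final 64-bit unpacks yield per-plane 4x4
     * sub-blocks that can be tested for non-zero coefficients. */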
1371     mix_8x16b_r01_0_l =
1372         _mm_unpacklo_epi16(rsd_8x16b_r0_0, rsd_8x16b_r1_0);  // a0 b0 a1 b1 a2 b2 a3 b3
1373     mix_8x16b_r23_0_l = _mm_unpacklo_epi16(rsd_8x16b_r2_0, rsd_8x16b_r3_0);
1374     mix_8x16b_r45_0_l = _mm_unpacklo_epi16(rsd_8x16b_r4_0, rsd_8x16b_r5_0);
1375     mix_8x16b_r67_0_l = _mm_unpacklo_epi16(rsd_8x16b_r6_0, rsd_8x16b_r7_0);
1376     mix_8x16b_r01_0_h =
1377         _mm_unpackhi_epi16(rsd_8x16b_r0_0, rsd_8x16b_r1_0);  // a4 b4 a5 b5 a6 b6 a7 b7
1378     mix_8x16b_r23_0_h = _mm_unpackhi_epi16(rsd_8x16b_r2_0, rsd_8x16b_r3_0);
1379     mix_8x16b_r45_0_h = _mm_unpackhi_epi16(rsd_8x16b_r4_0, rsd_8x16b_r5_0);
1380     mix_8x16b_r67_0_h = _mm_unpackhi_epi16(rsd_8x16b_r6_0, rsd_8x16b_r7_0);
1381 
1382     mix_8x16b_r01_1_l =
1383         _mm_unpacklo_epi16(rsd_8x16b_r0_1, rsd_8x16b_r1_1);  // a8 b8 a9 b9 a10 b10 a11 b11
1384     mix_8x16b_r23_1_l = _mm_unpacklo_epi16(rsd_8x16b_r2_1, rsd_8x16b_r3_1);
1385     mix_8x16b_r45_1_l = _mm_unpacklo_epi16(rsd_8x16b_r4_1, rsd_8x16b_r5_1);
1386     mix_8x16b_r67_1_l = _mm_unpacklo_epi16(rsd_8x16b_r6_1, rsd_8x16b_r7_1);
1387     mix_8x16b_r01_1_h =
1388         _mm_unpackhi_epi16(rsd_8x16b_r0_1, rsd_8x16b_r1_1);  // a12 b12 a13 b13 a14 b14 a15 b15
1389     mix_8x16b_r23_1_h = _mm_unpackhi_epi16(rsd_8x16b_r2_1, rsd_8x16b_r3_1);
1390     mix_8x16b_r45_1_h = _mm_unpackhi_epi16(rsd_8x16b_r4_1, rsd_8x16b_r5_1);
1391     mix_8x16b_r67_1_h = _mm_unpackhi_epi16(rsd_8x16b_r6_1, rsd_8x16b_r7_1);
1392 
1393     mix_8x16b_r01_0_l = _mm_shuffle_epi32(mix_8x16b_r01_0_l, 0b11011000);  // a0b0 a2b2 a1b1 a3b3
1394     mix_8x16b_r23_0_l = _mm_shuffle_epi32(mix_8x16b_r23_0_l, 0b11011000);  // c0d0
1395     mix_8x16b_r45_0_l = _mm_shuffle_epi32(mix_8x16b_r45_0_l, 0b11011000);  // e0f0
1396     mix_8x16b_r67_0_l = _mm_shuffle_epi32(mix_8x16b_r67_0_l, 0b11011000);  // g0h0
1397     mix_8x16b_r01_0_h = _mm_shuffle_epi32(mix_8x16b_r01_0_h, 0b11011000);  // a4b4 a6b6 a5b5 a7b7
1398     mix_8x16b_r23_0_h = _mm_shuffle_epi32(mix_8x16b_r23_0_h, 0b11011000);  // c4d4
1399     mix_8x16b_r45_0_h = _mm_shuffle_epi32(mix_8x16b_r45_0_h, 0b11011000);  // e4f4
1400     mix_8x16b_r67_0_h = _mm_shuffle_epi32(mix_8x16b_r67_0_h, 0b11011000);  // g4h4
1401 
1402     mix_8x16b_r01_1_l = _mm_shuffle_epi32(mix_8x16b_r01_1_l, 0b11011000);
1403     mix_8x16b_r23_1_l = _mm_shuffle_epi32(mix_8x16b_r23_1_l, 0b11011000);
1404     mix_8x16b_r45_1_l = _mm_shuffle_epi32(mix_8x16b_r45_1_l, 0b11011000);
1405     mix_8x16b_r67_1_l = _mm_shuffle_epi32(mix_8x16b_r67_1_l, 0b11011000);
1406     mix_8x16b_r01_1_h = _mm_shuffle_epi32(mix_8x16b_r01_1_h, 0b11011000);
1407     mix_8x16b_r23_1_h = _mm_shuffle_epi32(mix_8x16b_r23_1_h, 0b11011000);
1408     mix_8x16b_r45_1_h = _mm_shuffle_epi32(mix_8x16b_r45_1_h, 0b11011000);
1409     mix_8x16b_r67_1_h = _mm_shuffle_epi32(mix_8x16b_r67_1_h, 0b11011000);
1410 
1411     rsd_8x16b_r01_b0_cb =
1412         _mm_unpacklo_epi64(mix_8x16b_r01_0_l, mix_8x16b_r01_0_h);  // a0b0 a2b2 a4b4 a6b6
1413     rsd_8x16b_r01_b0_cr =
1414         _mm_unpackhi_epi64(mix_8x16b_r01_0_l, mix_8x16b_r01_0_h);  // a1b1 a3b3 a5b5 a7b7
1415     rsd_8x16b_r23_b0_cb = _mm_unpacklo_epi64(mix_8x16b_r23_0_l, mix_8x16b_r23_0_h);  // c0d0 c2d2 c4d4 c6d6
1416     rsd_8x16b_r23_b0_cr = _mm_unpackhi_epi64(mix_8x16b_r23_0_l, mix_8x16b_r23_0_h);
1417     rsd_8x16b_r45_b2_cb = _mm_unpacklo_epi64(mix_8x16b_r45_0_l, mix_8x16b_r45_0_h);
1418     rsd_8x16b_r45_b2_cr = _mm_unpackhi_epi64(mix_8x16b_r45_0_l, mix_8x16b_r45_0_h);
1419     rsd_8x16b_r67_b2_cb = _mm_unpacklo_epi64(mix_8x16b_r67_0_l, mix_8x16b_r67_0_h);
1420     rsd_8x16b_r67_b2_cr = _mm_unpackhi_epi64(mix_8x16b_r67_0_l, mix_8x16b_r67_0_h);
1421 
1422     rsd_8x16b_r01_b1_cb =
1423         _mm_unpacklo_epi64(mix_8x16b_r01_1_l, mix_8x16b_r01_1_h);  // a8b8 a10b10 a12b12 a14b14
1424     rsd_8x16b_r01_b1_cr =
1425         _mm_unpackhi_epi64(mix_8x16b_r01_1_l, mix_8x16b_r01_1_h);  // a9b9 a11b11 a13b13 a15b15
1426     rsd_8x16b_r23_b1_cb = _mm_unpacklo_epi64(mix_8x16b_r23_1_l, mix_8x16b_r23_1_h);
1427     rsd_8x16b_r23_b1_cr = _mm_unpackhi_epi64(mix_8x16b_r23_1_l, mix_8x16b_r23_1_h);
1428     rsd_8x16b_r45_b3_cb = _mm_unpacklo_epi64(mix_8x16b_r45_1_l, mix_8x16b_r45_1_h);
1429     rsd_8x16b_r45_b3_cr = _mm_unpackhi_epi64(mix_8x16b_r45_1_l, mix_8x16b_r45_1_h);
1430     rsd_8x16b_r67_b3_cb = _mm_unpacklo_epi64(mix_8x16b_r67_1_l, mix_8x16b_r67_1_h);
1431     rsd_8x16b_r67_b3_cr = _mm_unpackhi_epi64(mix_8x16b_r67_1_l, mix_8x16b_r67_1_h);
1432 
1433     r01_b0_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b0_cb, zero_8x16b));
1434     r23_b0_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b0_cb, zero_8x16b));
1435     r01_b1_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b1_cb, zero_8x16b));
1436     r23_b1_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b1_cb, zero_8x16b));
1437     r45_b2_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b2_cb, zero_8x16b));
1438     r67_b2_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b2_cb, zero_8x16b));
1439     r45_b3_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b3_cb, zero_8x16b));
1440     r67_b3_cb = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b3_cb, zero_8x16b));
1441 
1442     r01_b0_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b0_cr, zero_8x16b));
1443     r23_b0_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b0_cr, zero_8x16b));
1444     r01_b1_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r01_b1_cr, zero_8x16b));
1445     r23_b1_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r23_b1_cr, zero_8x16b));
1446     r45_b2_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b2_cr, zero_8x16b));
1447     r67_b2_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b2_cr, zero_8x16b));
1448     r45_b3_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r45_b3_cr, zero_8x16b));
1449     r67_b3_cr = _mm_test_all_ones(_mm_cmpeq_epi16(rsd_8x16b_r67_b3_cr, zero_8x16b));
1450 
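    /* Pack the per-plane 4x4 flags: Cr occupies bits 4..7 and Cb bits
     * 0..3, with bit index (2 * y4 + x4) inside each nibble. */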
1451     i4_nnz_b0 = (!(r01_b0_cr && r23_b0_cr));
1452     i4_nnz_b1 = (!(r01_b1_cr && r23_b1_cr)) << 1;
1453     i4_nnz_b2 = (!(r45_b2_cr && r67_b2_cr)) << 2;
1454     i4_nnz_b3 = (!(r45_b3_cr && r67_b3_cr)) << 3;
1455 
1456     i4_nnz = (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1457     i4_nnz = i4_nnz << 4;
1458 
1459     i4_nnz_b0 = (!(r01_b0_cb && r23_b0_cb));
1460     i4_nnz_b1 = (!(r01_b1_cb && r23_b1_cb)) << 1;
1461     i4_nnz_b2 = (!(r45_b2_cb && r67_b2_cb)) << 2;
1462     i4_nnz_b3 = (!(r45_b3_cb && r67_b3_cb)) << 3;
1463 
1464     i4_nnz |= (i4_nnz_b0 | i4_nnz_b1 | i4_nnz_b2 | i4_nnz_b3);
1465     return i4_nnz;
1466 }
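/* A scalar sketch of the interleaved-chroma nnz computation above
 * (illustrative only, not part of the build; the helper name is
 * hypothetical, and it assumes Cb in the even and Cr in the odd 16-bit
 * lanes, as the variable naming above suggests):
 */
#if 0
static WORD32 isvcd_residual_chroma_cb_cr_8x8_ref(WORD16 *pi2_rsd, WORD32 rsd_strd)
{
    WORD32 x4, y4, i, j, i4_nnz = 0;
    for(y4 = 0; y4 < 2; y4++)
    {
        for(x4 = 0; x4 < 2; x4++)
        {
            for(i = 0; i < 4; i++)
            {
                for(j = 0; j < 4; j++)
                {
                    WORD16 *pi2_uv =
                        pi2_rsd + (y4 * 4 + i) * rsd_strd + (x4 * 4 + j) * 2;
                    if(pi2_uv[0] != 0) /* Cb: even lane */
                        i4_nnz |= 1 << (2 * y4 + x4);
                    if(pi2_uv[1] != 0) /* Cr: odd lane */
                        i4_nnz |= 1 << (4 + 2 * y4 + x4);
                }
            }
        }
    }
    return i4_nnz;
}
#endif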
1467