xref: /aosp_15_r20/external/libavc/decoder/x86/svc/isvcd_residual_resamp_sse42.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 /**
21  *******************************************************************************
22  * @file
23  *  isvcd_residual_resamp_sse42.c
24  *
25  * @brief
 26  *  Contains function definitions for residual resampling functions
27  *
28  * @author
29  *  Kishore
30  *
31  * @par List of Functions:
32  *  - isvcd_interpolate_residual_sse42
33  *  - isvcd_residual_luma_dyadic_sse42
34  *  - isvcd_residual_reflayer_const_non_boundary_mb_sse42
35  *
36  * @remarks
37  *  None
38  *
39  *******************************************************************************
40  */
41 #include <immintrin.h>
42 #include <smmintrin.h>
43 #include <emmintrin.h>
44 /* User include files */
45 #include "ih264_typedefs.h"
46 #include "isvcd_structs.h"
47 
48 /*****************************************************************************/
49 /*                                                                           */
50 /*  Function Name : isvcd_residual_luma_dyadic_sse42                          */
51 /*                                                                           */
52 /*  Description   :                                                          */
53 /*                                                                           */
54 /*  Inputs        :                                                          */
55 /*  Globals       : none                                                     */
56 /*  Processing    :                                                          */
57 /*                                                                           */
58 /*  Outputs       : none                                                     */
59 /*  Returns       : none                                                     */
60 /*                                                                           */
61 /*  Issues        : none                                                     */
62 /*                                                                           */
63 /*  Revision History:                                                        */
64 /*                                                                           */
65 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
66 /*         25 11 2021   Kishore         creation                             */
67 /*                                                                           */
68 /*****************************************************************************/
isvcd_residual_luma_dyadic_sse42(void * pv_residual_samp_ctxt,WORD16 * pi2_inp_data,WORD32 i4_inp_data_stride,WORD16 * pi2_out_res,WORD32 i4_out_res_stride,mem_element_t * ps_ref_mb_mode,UWORD16 u2_mb_x,UWORD16 u2_mb_y,WORD32 i4_ref_nnz,WORD32 i4_ref_tx_size)69 void isvcd_residual_luma_dyadic_sse42(void *pv_residual_samp_ctxt, WORD16 *pi2_inp_data,
70                                       WORD32 i4_inp_data_stride, WORD16 *pi2_out_res,
71                                       WORD32 i4_out_res_stride, mem_element_t *ps_ref_mb_mode,
72                                       UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_ref_nnz,
73                                       WORD32 i4_ref_tx_size)
74 
75 {
76     WORD16 *pi2_refarray_buffer;
77     WORD32 i4_blk_ctr;
78     residual_sampling_ctxt_t *ps_ctxt;
79 
80     UNUSED(ps_ref_mb_mode);
81     UNUSED(u2_mb_x);
82     UNUSED(u2_mb_y);
83 
84     ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
85     pi2_refarray_buffer = ps_ctxt->pi2_refarray_buffer;
86 
87     /* based on transform size the counter and interpolation width and */
88     /* height are intialised as follows                                */
89 
90     if((i4_ref_tx_size) && (0 != i4_ref_nnz))
91     {
92         WORD16 *pi2_ref_data_byte;
93         WORD32 i4_i, i4_j;
94         WORD16 *pi2_refarray_buffer_tmp = pi2_refarray_buffer;
95 
96         __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
97         __m128i res_8x16b_r1_0, res_8x16b_r1_1;
98         __m128i final_res_8x16b_r1_0, final_res_8x16b_r1_1;
99 
100         __m128i coeff_add_8x16b_r1;
101 
102         __m128i coeff_add_8x16b_r2;
103         __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
104         __m128i res_8x16b_r2_0, res_8x16b_r2_1;
105         __m128i final_res_8x16b_r2_0, final_res_8x16b_r2_1;
106 
107         pi2_ref_data_byte = pi2_inp_data;
108 
109         /* ----------- Horizontal Interpolation ---------------- */
110         for(i4_i = 0; i4_i < BLOCK_HEIGHT; i4_i += 2)
111         {
112             i2_coeff_8x16b_r1_0 =
113                 _mm_loadu_si128((__m128i *) pi2_ref_data_byte);         // a0 a1 a2 a3 a4 a5 a6 a7
114             i2_coeff_8x16b_r2_0 = _mm_loadu_si128(
115                 (__m128i *) (pi2_ref_data_byte + i4_inp_data_stride));  // b0 b1 b2 b3 b4 b5 b6 b7
116 
117             i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0, 2);  // a1 a2 a3 a4 a5 a6 a7 0
118             i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0, 2);  // b1 b2 b3 b4 b5 b6 b7 0
119 
120             coeff_add_8x16b_r1 = _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
121             coeff_add_8x16b_r2 = _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);
122 
123             i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
124             i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);
125 
126             i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
127             i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);
128 
129             res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
130             res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);
131 
132             res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
133             res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);
134 
135             final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
136             final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);
137 
138             final_res_8x16b_r1_1 = _mm_unpackhi_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
139             final_res_8x16b_r2_1 = _mm_unpackhi_epi16(res_8x16b_r2_0, res_8x16b_r2_1);
140 
141             _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1), final_res_8x16b_r1_0);
142             _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9), final_res_8x16b_r1_1);
143 
144             _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17), final_res_8x16b_r2_0);
145             _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25), final_res_8x16b_r2_1);
146 
147             pi2_refarray_buffer[0] = (pi2_ref_data_byte[0] << 2);
148             pi2_refarray_buffer[15] = (pi2_ref_data_byte[7] << 2);
149             pi2_ref_data_byte += i4_inp_data_stride;
150             pi2_refarray_buffer[16] = (pi2_ref_data_byte[0] << 2);
151             pi2_refarray_buffer[31] = (pi2_ref_data_byte[7] << 2);
152 
153             /* vertical loop uopdates */
154             pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
155             pi2_refarray_buffer += 32;
156         }
157 
158         /* ----------- Vertical Interpolation ---------------- */
159         pi2_refarray_buffer = pi2_refarray_buffer_tmp;
160 
161         {
162             __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r1_3,
163                 i4_horz_samp_4x32b_r1_4;
164             __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r2_3,
165                 i4_horz_samp_4x32b_r2_4;
166             __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2, i4_res_samp_4x32b_r1_3,
167                 i4_res_samp_4x32b_r1_4;
168             __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2, i4_res_samp_4x32b_r2_3,
169                 i4_res_samp_4x32b_r2_4;
170             __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2, horz_add_4x32b_r2_3,
171                 horz_add_4x32b_r2_4;
172 
173             __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r2_1;
174             __m128i i4_horz_samp_8x16b_r1_2, i4_horz_samp_8x16b_r2_2;
175             __m128i i4_horz_samp_8x16b_r1_3, i4_horz_samp_8x16b_r2_3;
176             __m128i i4_horz_samp_8x16b_r1_4, i4_horz_samp_8x16b_r2_4;
177 
178             __m128i twos = _mm_set1_epi32(2);
179             __m128i eights = _mm_set1_epi32(8);
180 
181             WORD16 *pi2_out;
182 
183             pi2_out = pi2_out_res;
184 
185             i4_horz_samp_8x16b_r1_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
186             i4_horz_samp_8x16b_r1_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
187             i4_horz_samp_8x16b_r1_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
188             i4_horz_samp_8x16b_r1_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));
189 
190             i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
191             i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
192             i4_horz_samp_4x32b_r1_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_3);
193             i4_horz_samp_4x32b_r1_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_4);
194 
195             /* populate the first inter sample */
196             i4_res_samp_4x32b_r1_1 =
197                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
198             i4_res_samp_4x32b_r1_2 =
199                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
200             i4_res_samp_4x32b_r1_3 =
201                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
202             i4_res_samp_4x32b_r1_4 =
203                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);
204 
205             _mm_storeu_si128((__m128i *) pi2_out,
206                              _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
207             _mm_storeu_si128((__m128i *) (pi2_out + 8),
208                              _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
209             pi2_out += i4_out_res_stride;
210 
211             for(i4_j = 0; i4_j < 14; i4_j += 2)
212             {
213                 pi2_refarray_buffer += MB_WIDTH;
214 
215                 i4_horz_samp_8x16b_r2_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
216                 i4_horz_samp_8x16b_r2_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
217                 i4_horz_samp_8x16b_r2_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
218                 i4_horz_samp_8x16b_r2_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));
219 
220                 i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
221                 i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
222                 i4_horz_samp_4x32b_r2_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_3);
223                 i4_horz_samp_4x32b_r2_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_4);
224 
225                 horz_add_4x32b_r2_1 =
226                     _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
227                 horz_add_4x32b_r2_2 =
228                     _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
229                 horz_add_4x32b_r2_3 =
230                     _mm_add_epi32(i4_horz_samp_4x32b_r1_3, i4_horz_samp_4x32b_r2_3);
231                 horz_add_4x32b_r2_4 =
232                     _mm_add_epi32(i4_horz_samp_4x32b_r1_4, i4_horz_samp_4x32b_r2_4);
233 
234                 i4_res_samp_4x32b_r1_1 =
235                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
236                 i4_res_samp_4x32b_r1_2 =
237                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
238                 i4_res_samp_4x32b_r1_3 =
239                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_3, 1), horz_add_4x32b_r2_3);
240                 i4_res_samp_4x32b_r1_4 =
241                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_4, 1), horz_add_4x32b_r2_4);
242 
243                 i4_res_samp_4x32b_r2_1 =
244                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
245                 i4_res_samp_4x32b_r2_2 =
246                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
247                 i4_res_samp_4x32b_r2_3 =
248                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_3, 1), horz_add_4x32b_r2_3);
249                 i4_res_samp_4x32b_r2_4 =
250                     _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_4, 1), horz_add_4x32b_r2_4);
251 
252                 i4_res_samp_4x32b_r1_1 =
253                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
254                 i4_res_samp_4x32b_r1_2 =
255                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
256                 i4_res_samp_4x32b_r1_3 =
257                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_3, eights), 4);
258                 i4_res_samp_4x32b_r1_4 =
259                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_4, eights), 4);
260 
261                 i4_res_samp_4x32b_r2_1 =
262                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
263                 i4_res_samp_4x32b_r2_2 =
264                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
265                 i4_res_samp_4x32b_r2_3 =
266                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_3, eights), 4);
267                 i4_res_samp_4x32b_r2_4 =
268                     _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_4, eights), 4);
269 
270                 /* populate 2 samples based on current coeffs */
271                 _mm_storeu_si128((__m128i *) pi2_out,
272                                  _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
273                 _mm_storeu_si128((__m128i *) (pi2_out + 8),
274                                  _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
275                 pi2_out += i4_out_res_stride;
276 
277                 _mm_storeu_si128((__m128i *) pi2_out,
278                                  _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
279                 _mm_storeu_si128((__m128i *) (pi2_out + 8),
280                                  _mm_packs_epi32(i4_res_samp_4x32b_r2_3, i4_res_samp_4x32b_r2_4));
281                 pi2_out += i4_out_res_stride;
282 
283                 /* store the coeff 2 to coeff 1 */
284                 /* (used in next iteration)     */
285                 i4_horz_samp_4x32b_r1_1 = i4_horz_samp_4x32b_r2_1;
286                 i4_horz_samp_4x32b_r1_2 = i4_horz_samp_4x32b_r2_2;
287                 i4_horz_samp_4x32b_r1_3 = i4_horz_samp_4x32b_r2_3;
288                 i4_horz_samp_4x32b_r1_4 = i4_horz_samp_4x32b_r2_4;
289             }
290 
291             i4_res_samp_4x32b_r1_1 =
292                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
293             i4_res_samp_4x32b_r1_2 =
294                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
295             i4_res_samp_4x32b_r1_3 =
296                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
297             i4_res_samp_4x32b_r1_4 =
298                 _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);
299 
300             _mm_storeu_si128((__m128i *) pi2_out,
301                              _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
302             _mm_storeu_si128((__m128i *) (pi2_out + 8),
303                              _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
304         }
305     }
306     else
307     {
308         /* ----------------------------------------------------------------- */
309         /* LOOP over number of blocks                                        */
310         /* ----------------------------------------------------------------- */
311         for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
312         {
313             /* if reference layer is not coded then no processing */
314             if(0 != (i4_ref_nnz & 0x1))
315             {
316                 __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
317                 __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
318                 __m128i i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1;
319                 __m128i i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1;
320 
321                 __m128i res_8x16b_r1_0, res_8x16b_r1_1;
322                 __m128i res_8x16b_r2_0, res_8x16b_r2_1;
323                 __m128i res_8x16b_r3_0, res_8x16b_r3_1;
324                 __m128i res_8x16b_r4_0, res_8x16b_r4_1;
325                 __m128i final_res_8x16b_r1_0;
326                 __m128i final_res_8x16b_r2_0;
327                 __m128i final_res_8x16b_r3_0;
328                 __m128i final_res_8x16b_r4_0;
329 
330                 __m128i coeff_add_8x16b_r1;
331                 __m128i coeff_add_8x16b_r2;
332                 __m128i coeff_add_8x16b_r3;
333                 __m128i coeff_add_8x16b_r4;
334 
335                 /* ----------- Horizontal Interpolation ---------------- */
336 
337                 i2_coeff_8x16b_r1_0 =
338                     _mm_loadu_si128((__m128i *) pi2_inp_data);         // a0 a1 a2 a3 a4 a5 a6 a7
339                 i2_coeff_8x16b_r2_0 = _mm_loadu_si128(
340                     (__m128i *) (pi2_inp_data + i4_inp_data_stride));  // b0 b1 b2 b3 b4 b5 b6 b7
341                 i2_coeff_8x16b_r3_0 =
342                     _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride << 1)));
343                 i2_coeff_8x16b_r4_0 =
344                     _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride * 3)));
345 
346                 i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0,
347                                                      2);  // a1 a2 a3 a4 a5 a6 a7 0
348                 i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0,
349                                                      2);  // b1 b2 b3 b4 b5 b6 b7 0
350                 i2_coeff_8x16b_r3_1 = _mm_srli_si128(i2_coeff_8x16b_r3_0, 2);
351                 i2_coeff_8x16b_r4_1 = _mm_srli_si128(i2_coeff_8x16b_r4_0, 2);
352 
353                 coeff_add_8x16b_r1 = _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
354                 coeff_add_8x16b_r2 = _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);
355                 coeff_add_8x16b_r3 = _mm_add_epi16(i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1);
356                 coeff_add_8x16b_r4 = _mm_add_epi16(i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1);
357 
358                 i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
359                 i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);
360                 i2_coeff_8x16b_r3_0 = _mm_slli_epi16(i2_coeff_8x16b_r3_0, 1);
361                 i2_coeff_8x16b_r4_0 = _mm_slli_epi16(i2_coeff_8x16b_r4_0, 1);
362 
363                 i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
364                 i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);
365                 i2_coeff_8x16b_r3_1 = _mm_slli_epi16(i2_coeff_8x16b_r3_1, 1);
366                 i2_coeff_8x16b_r4_1 = _mm_slli_epi16(i2_coeff_8x16b_r4_1, 1);
367 
368                 res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
369                 res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);
370                 res_8x16b_r3_0 = _mm_add_epi16(i2_coeff_8x16b_r3_0, coeff_add_8x16b_r3);
371                 res_8x16b_r4_0 = _mm_add_epi16(i2_coeff_8x16b_r4_0, coeff_add_8x16b_r4);
372 
373                 res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
374                 res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);
375                 res_8x16b_r3_1 = _mm_add_epi16(i2_coeff_8x16b_r3_1, coeff_add_8x16b_r3);
376                 res_8x16b_r4_1 = _mm_add_epi16(i2_coeff_8x16b_r4_1, coeff_add_8x16b_r4);
377 
378                 final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
379                 final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);
380                 final_res_8x16b_r3_0 = _mm_unpacklo_epi16(res_8x16b_r3_0, res_8x16b_r3_1);
381                 final_res_8x16b_r4_0 = _mm_unpacklo_epi16(res_8x16b_r4_0, res_8x16b_r4_1);
382 
383                 _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1), final_res_8x16b_r1_0);
384                 _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9), final_res_8x16b_r2_0);
385                 _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17), final_res_8x16b_r3_0);
386                 _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25), final_res_8x16b_r4_0);
387 
388                 pi2_refarray_buffer[0] = (pi2_inp_data[0] << 2);
389                 pi2_refarray_buffer[7] = (pi2_inp_data[3] << 2);
390                 pi2_refarray_buffer[8] = (pi2_inp_data[i4_inp_data_stride] << 2);
391                 pi2_refarray_buffer[15] = (pi2_inp_data[i4_inp_data_stride + 3] << 2);
392                 pi2_refarray_buffer[16] = (pi2_inp_data[(i4_inp_data_stride << 1)] << 2);
393                 pi2_refarray_buffer[23] = (pi2_inp_data[(i4_inp_data_stride << 1) + 3] << 2);
394                 pi2_refarray_buffer[24] = (pi2_inp_data[(i4_inp_data_stride * 3)] << 2);
395                 pi2_refarray_buffer[31] = (pi2_inp_data[(i4_inp_data_stride * 3) + 3] << 2);
396 
397                 /* ----------- Vertical Interpolation ---------------- */
398                 {
399                     __m128i i4_horz_samp_8x16b_r0_1, i4_horz_samp_8x16b_r0_2;
400                     __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r1_2;
401                     __m128i i4_horz_samp_8x16b_r2_1, i4_horz_samp_8x16b_r2_2;
402                     __m128i i4_horz_samp_8x16b_r3_1, i4_horz_samp_8x16b_r3_2;
403 
404                     __m128i i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r0_2;
405                     __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2;
406                     __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2;
407                     __m128i i4_horz_samp_4x32b_r3_1, i4_horz_samp_4x32b_r3_2;
408 
409                     __m128i i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2;
410                     __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2;
411                     __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2;
412                     __m128i i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2;
413                     __m128i i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2;
414                     __m128i i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2;
415                     __m128i i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2;
416                     __m128i i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2;
417 
418                     __m128i horz_add_4x32b_r1_1, horz_add_4x32b_r1_2;
419                     __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2;
420                     __m128i horz_add_4x32b_r3_1, horz_add_4x32b_r3_2;
421 
422                     __m128i twos = _mm_set1_epi32(2);
423                     __m128i eights = _mm_set1_epi32(8);
424 
425                     i4_horz_samp_8x16b_r0_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
426                     i4_horz_samp_8x16b_r0_2 =
427                         _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
428                     i4_horz_samp_8x16b_r1_1 =
429                         _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLOCK_WIDTH));
430                     i4_horz_samp_8x16b_r1_2 =
431                         _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLOCK_WIDTH + 4));
432                     i4_horz_samp_8x16b_r2_1 =
433                         _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH << 1)));
434                     i4_horz_samp_8x16b_r2_2 =
435                         _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH << 1) + 4));
436                     i4_horz_samp_8x16b_r3_1 =
437                         _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH * 3)));
438                     i4_horz_samp_8x16b_r3_2 =
439                         _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH * 3) + 4));
440 
441                     i4_horz_samp_4x32b_r0_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_1);
442                     i4_horz_samp_4x32b_r0_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_2);
443                     i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
444                     i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
445                     i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
446                     i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
447                     i4_horz_samp_4x32b_r3_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_1);
448                     i4_horz_samp_4x32b_r3_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_2);
449 
450                     horz_add_4x32b_r1_1 =
451                         _mm_add_epi32(i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r1_1);
452                     horz_add_4x32b_r2_1 =
453                         _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
454                     horz_add_4x32b_r3_1 =
455                         _mm_add_epi32(i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r3_1);
456 
457                     horz_add_4x32b_r1_2 =
458                         _mm_add_epi32(i4_horz_samp_4x32b_r0_2, i4_horz_samp_4x32b_r1_2);
459                     horz_add_4x32b_r2_2 =
460                         _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
461                     horz_add_4x32b_r3_2 =
462                         _mm_add_epi32(i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r3_2);
463 
464                     i4_res_samp_4x32b_r1_1 = _mm_add_epi32(
465                         _mm_slli_epi32(i4_horz_samp_4x32b_r0_1, 1), horz_add_4x32b_r1_1);
466                     i4_res_samp_4x32b_r2_1 = _mm_add_epi32(
467                         _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r1_1);
468                     i4_res_samp_4x32b_r3_1 = _mm_add_epi32(
469                         _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
470                     i4_res_samp_4x32b_r4_1 = _mm_add_epi32(
471                         _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
472                     i4_res_samp_4x32b_r5_1 = _mm_add_epi32(
473                         _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r3_1);
474                     i4_res_samp_4x32b_r6_1 = _mm_add_epi32(
475                         _mm_slli_epi32(i4_horz_samp_4x32b_r3_1, 1), horz_add_4x32b_r3_1);
476 
477                     i4_res_samp_4x32b_r1_2 = _mm_add_epi32(
478                         _mm_slli_epi32(i4_horz_samp_4x32b_r0_2, 1), horz_add_4x32b_r1_2);
479                     i4_res_samp_4x32b_r2_2 = _mm_add_epi32(
480                         _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r1_2);
481                     i4_res_samp_4x32b_r3_2 = _mm_add_epi32(
482                         _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
483                     i4_res_samp_4x32b_r4_2 = _mm_add_epi32(
484                         _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
485                     i4_res_samp_4x32b_r5_2 = _mm_add_epi32(
486                         _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r3_2);
487                     i4_res_samp_4x32b_r6_2 = _mm_add_epi32(
488                         _mm_slli_epi32(i4_horz_samp_4x32b_r3_2, 1), horz_add_4x32b_r3_2);
489 
490                     i4_res_samp_4x32b_r0_1 =
491                         _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_1, twos), 2);
492                     i4_res_samp_4x32b_r1_1 =
493                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
494                     i4_res_samp_4x32b_r2_1 =
495                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
496                     i4_res_samp_4x32b_r3_1 =
497                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_1, eights), 4);
498                     i4_res_samp_4x32b_r4_1 =
499                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_1, eights), 4);
500                     i4_res_samp_4x32b_r5_1 =
501                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_1, eights), 4);
502                     i4_res_samp_4x32b_r6_1 =
503                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_1, eights), 4);
504                     i4_res_samp_4x32b_r7_1 =
505                         _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_1, twos), 2);
506 
507                     i4_res_samp_4x32b_r0_2 =
508                         _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_2, twos), 2);
509                     i4_res_samp_4x32b_r1_2 =
510                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
511                     i4_res_samp_4x32b_r2_2 =
512                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
513                     i4_res_samp_4x32b_r3_2 =
514                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_2, eights), 4);
515                     i4_res_samp_4x32b_r4_2 =
516                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_2, eights), 4);
517                     i4_res_samp_4x32b_r5_2 =
518                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_2, eights), 4);
519                     i4_res_samp_4x32b_r6_2 =
520                         _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_2, eights), 4);
521                     i4_res_samp_4x32b_r7_2 =
522                         _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_2, twos), 2);
523 
524                     /* populate 2 samples based on current coeffs */
525                     _mm_storeu_si128(
526                         (__m128i *) pi2_out_res,
527                         _mm_packs_epi32(i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2));
528                     _mm_storeu_si128(
529                         (__m128i *) (pi2_out_res + i4_out_res_stride),
530                         _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
531                     _mm_storeu_si128(
532                         (__m128i *) (pi2_out_res + (i4_out_res_stride << 1)),
533                         _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
534                     _mm_storeu_si128(
535                         (__m128i *) (pi2_out_res + (i4_out_res_stride * 3)),
536                         _mm_packs_epi32(i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2));
537                     _mm_storeu_si128(
538                         (__m128i *) (pi2_out_res + (i4_out_res_stride << 2)),
539                         _mm_packs_epi32(i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2));
540                     _mm_storeu_si128(
541                         (__m128i *) (pi2_out_res + (i4_out_res_stride * 5)),
542                         _mm_packs_epi32(i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2));
543                     _mm_storeu_si128(
544                         (__m128i *) (pi2_out_res + (i4_out_res_stride * 6)),
545                         _mm_packs_epi32(i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2));
546                     _mm_storeu_si128(
547                         (__m128i *) (pi2_out_res + (i4_out_res_stride * 7)),
548                         _mm_packs_epi32(i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2));
549 
550                     pi2_out_res += BLOCK_WIDTH;
551                 }
552             }
553             else
554             {
555                 pi2_out_res += BLOCK_WIDTH;
556             }
557 
558             /* Block level loop updates */
559             if(1 == i4_blk_ctr)
560             {
561                 pi2_inp_data -= SUB_BLOCK_WIDTH;
562                 pi2_inp_data += (i4_inp_data_stride * SUB_BLOCK_HEIGHT);
563                 pi2_out_res -= MB_WIDTH;
564                 pi2_out_res += (i4_out_res_stride * BLOCK_HEIGHT);
565                 i4_ref_nnz >>= 2;
566             }
567             else
568             {
569                 pi2_inp_data += SUB_BLOCK_WIDTH;
570             }
571 
572             i4_ref_nnz >>= 1;
573         } /* end of loop over all the blocks */
574     }
575     return;
576 }
577 
578 /*****************************************************************************/
579 /*                                                                           */
580 /*  Function Name : isvcd_interpolate_residual_sse42                          */
581 /*                                                                           */
582 /*  Description   : Interpolates residual samples from the reference array  */
583 /*                  into the output buffer (luma or chroma)                 */
584 /*  Inputs        : residual sampling context, output buffer and stride,    */
585 /*  Globals       : none                                                    */
586 /*  Processing    : horizontal and vertical interpolation using the x/y     */
587 /*                  phase and reference-position maps of the current layer  */
588 /*  Outputs       : interpolated residual samples written to pi2_out        */
589 /*  Returns       : none                                                    */
590 /*                                                                           */
591 /*  Issues        : none                                                     */
592 /*                                                                           */
593 /*  Revision History:                                                        */
594 /*                                                                           */
595 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
596 /*         25 11 2021   Kishore         creation                             */
597 /*                                                                           */
598 /*****************************************************************************/
599 
isvcd_interpolate_residual_sse42(void * pv_residual_samp_ctxt,WORD16 * pi2_out,WORD32 i4_out_stride,WORD32 i4_refarray_wd,UWORD16 u2_mb_x,UWORD16 u2_mb_y,WORD32 i4_chroma_flag)600 void isvcd_interpolate_residual_sse42(void *pv_residual_samp_ctxt, WORD16 *pi2_out,
601                                       WORD32 i4_out_stride, WORD32 i4_refarray_wd, UWORD16 u2_mb_x,
602                                       UWORD16 u2_mb_y, WORD32 i4_chroma_flag)
603 {
604     residual_sampling_ctxt_t *ps_ctxt;
605     residual_samp_map_ctxt_t *ps_map_ctxt;
606     res_lyr_ctxt *ps_lyr_ctxt;
607     ref_pixel_map_t *ps_x_pos_phase;
608     ref_pixel_map_t *ps_y_pos_phase;
609 
610     WORD32 i4_x, i4_y;
611     WORD32 i4_frm_mb_x, i4_frm_mb_y;
612     WORD32 i4_temp_array_ht;
613     WORD32 i4_mb_wd;
614     WORD32 i4_mb_ht;
615     WORD16 *pi2_ref_array;
616     UWORD8 *pu1_ref_x_ptr_incr, *pu1_ref_y_ptr_incr;
617 
618     WORD8 arr_y_ref_pos[16] = {0};
619     WORD8 arr_x_ref_pos[16] = {0};
620     WORD8 arr_x_phase[32] = {0};
621     WORD8 arr_y_phase[32] = {0};
622     WORD8 *pi1_y_ref_pos;
623     WORD8 *pi1_x_ref_pos;
624     WORD8 *pi1_y_phase;
625     WORD8 *pi1_x_phase;
626 
627     ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
628     ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
629     pi2_ref_array = ps_ctxt->pi2_refarray_buffer;
630     pu1_ref_x_ptr_incr = ps_ctxt->pu1_ref_x_ptr_incr;
631     pu1_ref_y_ptr_incr = ps_ctxt->pu1_ref_y_ptr_incr;
632 
633     /* --------------------------------------------------------------------- */
634     /* Extracting information from the mapping context                       */
635     /* --------------------------------------------------------------------- */
636     if(1 == i4_chroma_flag)
637         ps_map_ctxt = &ps_lyr_ctxt->s_chroma_map_ctxt;
638     else
639         ps_map_ctxt = &ps_lyr_ctxt->s_luma_map_ctxt;
640 
641     i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
642     i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;
643 
644     ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
645     ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;
646 
647     i4_temp_array_ht = i4_mb_ht;
648     i4_frm_mb_y = u2_mb_y * i4_mb_ht;
649     i4_frm_mb_x = u2_mb_x * i4_mb_wd;
650 
651     /* --------------------------------------------------------------------- */
652     /* Loop for interpolation                                                */
653     /* --------------------------------------------------------------------- */
654 
655     if(i4_chroma_flag == 0)
656     {
657         __m128i const_16_8x16b, const_128, const_ones, const_ones_8x16b, mid_indx_16x8b;
658         __m128i ref_arr_8x16b_r0_0;
659         __m128i ref_arr_8x16b_r1_0;
660         __m128i phs_mask_8x16b_0, phs_mask_16min_8x16b_0, phs_mask_16x8b_0;
661         __m128i x_ref_pos_mask_r0, x_ref_rnd_mask_r0_0;
662         __m128i x_ref_pos_mask_temp_r0_0;
663         __m128i x_ref_pos_mask_temp_r1_0;
664         __m128i phs_mask_div8_8x16b_0;
665         __m128i u1_incr_8x16b_r0_0, ref_arr_temp0_8x16b_r0_0, res0_8x16b_r0_0,
666             u1_incr_not_8x16b_r0_0;
667         __m128i u1_incr_8x16b_r1_0, ref_arr_temp1_8x16b_r0_0, res1_8x16b_r0_0;
668 
669         __m128i u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r1_even, x_ref_pos_mask_temp_r0_even,
670             x_ref_pos_mask_temp_r1_even;
671         __m128i u1_incr_not_8x16b_r0_odd, u1_incr_not_8x16b_r1_odd, x_ref_pos_mask_temp_r0_odd,
672             x_ref_pos_mask_temp_r1_odd;
673 
674         __m128i ref_arr_temp0_8x16b_r1_0, res_8x16b_r0_0, res0_8x16b_r1_0, u1_incr_not_8x16b_r1_0;
675         __m128i ref_arr_temp1_8x16b_r1_0, res_8x16b_r1_0, res1_8x16b_r1_0;
676         __m128i u1_y_incr_8x16b_r0_0, u1_y_incr_8x16b_r0_1, u1_y_incr_8x16b_r0_low,
677             u1_y_incr_8x16b_r0_high;
678 
679         __m128i prev_res_8x16b_r0_0;
680         __m128i prev_res_8x16b_r1_0;
681         __m128i prev_res_8x16b_r0_1;
682         __m128i prev_res_8x16b_r1_1;
683 
684         __m128i u1_prev_y_incr_8x16b_r0_0;
685         __m128i u1_prev_y_incr_8x16b_r0_1;
686 
687         __m128i ref_arr_8x16b_r0_1;
688         __m128i ref_arr_8x16b_r1_1;
689         __m128i phs_mask_8x16b_1, phs_mask_div8_8x16b_1, phs_mask_16min_8x16b_1;
690         __m128i x_ref_pos_mask_temp_r0_1;
691         __m128i x_ref_pos_mask_temp_r1_1;
692         __m128i ref_arr_temp0_8x16b_r0_1, res0_8x16b_r0_1, u1_incr_not_8x16b_r0_1;
693         __m128i ref_arr_temp1_8x16b_r0_1, res1_8x16b_r0_1;
694 
695         __m128i ref_arr_temp0_8x16b_r1_1, res_8x16b_r0_1, res0_8x16b_r1_1, u1_incr_not_8x16b_r1_1;
696         __m128i ref_arr_temp1_8x16b_r1_1, res_8x16b_r1_1, res1_8x16b_r1_1;
697 
698         __m128i vert_res0_8x16b_r0_0, vert_res0_8x16b_r0_1, res_4x32b_l_0, res_4x32b_h_0;
699         __m128i vert_res1_8x16b_r0_0, vert_res1_8x16b_r0_1, res_4x32b_l_1, res_4x32b_h_1;
700         __m128i res_8x16b_l, res_8x16b_h;
701         __m128i phs_y_mask_16min_8x16b, phs_y_mask_8x16b, phs_y_mask_mix_8x16b;
702         __m128i zero_8x16b;
703         WORD32 zero_r0_0, zero_r1_0, zero_r0_1, zero_r1_1, zero_r0_r1 = 0;
704         WORD32 strt_indx_h;
705         WORD16 *pi2_ref_array_temp;
706         UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
707         WORD32 i4_y_phase;
708         WORD32 out_stride_temp;
709         const_128 = _mm_set1_epi32(128);
710         zero_8x16b = _mm_set1_epi16(0);
711         const_ones = _mm_set1_epi8(1);
712         const_ones_8x16b = _mm_set1_epi16(1);
713 
714         for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
715         {
716             arr_y_phase[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
717             arr_y_ref_pos[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
718         }
719         pi1_y_ref_pos = arr_y_ref_pos;
720         pi1_y_phase = arr_y_phase;
721 
722         strt_indx_h = 0;
723         strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos);
724         for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
725         {
726             arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
727             arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
728         }
729 
730         pi1_x_ref_pos = arr_x_ref_pos;
731         pi1_x_phase = arr_x_phase;
732 
733         x_ref_pos_mask_r0 = _mm_loadu_si128((__m128i *) (pi1_x_ref_pos));
734         phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi1_x_phase));
735         phs_mask_8x16b_0 = _mm_cvtepi8_epi16(phs_mask_16x8b_0);
736         phs_mask_8x16b_1 = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i *) (pi1_x_phase + 8)));
737 
738         phs_mask_div8_8x16b_0 = _mm_srli_epi16(phs_mask_8x16b_0, 3);
739         phs_mask_div8_8x16b_1 = _mm_srli_epi16(phs_mask_8x16b_1, 3);
740         phs_mask_div8_8x16b_0 = _mm_packs_epi16(phs_mask_div8_8x16b_0, phs_mask_div8_8x16b_1);
741         const_16_8x16b = _mm_set1_epi16(16);
742 
743         phs_mask_16min_8x16b_0 = _mm_sub_epi16(const_16_8x16b, phs_mask_8x16b_0);
744         phs_mask_16min_8x16b_1 = _mm_sub_epi16(const_16_8x16b, phs_mask_8x16b_1);
745 
746         x_ref_rnd_mask_r0_0 = _mm_add_epi8(x_ref_pos_mask_r0, phs_mask_div8_8x16b_0);
747         mid_indx_16x8b = _mm_set1_epi8((strt_indx_h << 1));
748         for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
749         {
750             if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
751             {
752                 if(zero_r0_r1)
753                 {
754                     res_8x16b_l = _mm_set1_epi16(0);
755                     res_8x16b_h = _mm_set1_epi16(0);
756                     out_stride_temp = (i4_y * i4_out_stride);
757                     _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), res_8x16b_l);
758                     _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), res_8x16b_h);
759                     continue;
760                 }
761 
762                 res_8x16b_r0_0 = prev_res_8x16b_r0_0;
763                 res_8x16b_r1_0 = prev_res_8x16b_r1_0;
764                 res_8x16b_r0_1 = prev_res_8x16b_r0_1;
765                 res_8x16b_r1_1 = prev_res_8x16b_r1_1;
766 
767                 u1_y_incr_8x16b_r0_0 = u1_prev_y_incr_8x16b_r0_0;
768                 u1_y_incr_8x16b_r0_1 = u1_prev_y_incr_8x16b_r0_1;
769             }
770             else
771             {
772                 pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
773                 pu1_ref_x_ptr_incr_temp =
774                     pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
775                 ref_arr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pi2_ref_array_temp));
776                 ref_arr_8x16b_r1_0 =
777                     _mm_loadu_si128((__m128i *) (pi2_ref_array_temp + i4_refarray_wd));
778                 ref_arr_8x16b_r0_1 =
779                     _mm_loadu_si128((__m128i *) (pi2_ref_array_temp + strt_indx_h));
780                 ref_arr_8x16b_r1_1 = _mm_loadu_si128(
781                     (__m128i *) (pi2_ref_array_temp + i4_refarray_wd + strt_indx_h));
782 
783                 zero_r0_0 = _mm_test_all_ones(_mm_cmpeq_epi16(
784                     ref_arr_8x16b_r0_0, zero_8x16b));  // return 1 if all zeros, else 0
785                 zero_r1_0 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r1_0, zero_8x16b));
786                 zero_r0_1 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r0_1, zero_8x16b));
787                 zero_r1_1 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r1_1, zero_8x16b));
788 
789                 zero_r0_r1 = zero_r0_0 && zero_r1_0 && zero_r0_1 && zero_r1_1;
790 
791                 if(!zero_r0_r1)
792                 {
793                     u1_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp));
794                     u1_incr_8x16b_r1_0 =
795                         _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp + i4_refarray_wd));
796 
797                     u1_incr_8x16b_r0_0 = _mm_shuffle_epi8(u1_incr_8x16b_r0_0, x_ref_pos_mask_r0);
798                     u1_incr_8x16b_r1_0 = _mm_shuffle_epi8(u1_incr_8x16b_r1_0, x_ref_pos_mask_r0);
799 
800                     u1_incr_not_8x16b_r0_0 =
801                         _mm_andnot_si128(u1_incr_8x16b_r0_0, phs_mask_div8_8x16b_0);
802                     u1_incr_not_8x16b_r1_0 =
803                         _mm_andnot_si128(u1_incr_8x16b_r1_0, phs_mask_div8_8x16b_0);
804 
805                     u1_incr_not_8x16b_r0_0 =
806                         _mm_add_epi8(u1_incr_not_8x16b_r0_0, x_ref_pos_mask_r0);
807                     u1_incr_not_8x16b_r1_0 =
808                         _mm_add_epi8(u1_incr_not_8x16b_r1_0, x_ref_pos_mask_r0);
809 
810                     x_ref_pos_mask_temp_r0_0 =
811                         _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_8x16b_r0_0);
812                     x_ref_pos_mask_temp_r1_0 =
813                         _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_8x16b_r1_0);
814 
815                     /* _mm_slli_epi8(u1_incr_not_8x16b_r0_0, 1)*/
816                     u1_incr_not_8x16b_r0_even =
817                         _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
818                     u1_incr_not_8x16b_r1_even =
819                         _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
820                     x_ref_pos_mask_temp_r0_even =
821                         _mm_add_epi8(x_ref_pos_mask_temp_r0_0, x_ref_pos_mask_temp_r0_0);
822                     x_ref_pos_mask_temp_r1_even =
823                         _mm_add_epi8(x_ref_pos_mask_temp_r1_0, x_ref_pos_mask_temp_r1_0);
824 
825                     u1_incr_not_8x16b_r0_odd = _mm_add_epi8(u1_incr_not_8x16b_r0_even, const_ones);
826                     u1_incr_not_8x16b_r1_odd = _mm_add_epi8(u1_incr_not_8x16b_r1_even, const_ones);
827                     x_ref_pos_mask_temp_r0_odd =
828                         _mm_add_epi8(x_ref_pos_mask_temp_r0_even, const_ones);
829                     x_ref_pos_mask_temp_r1_odd =
830                         _mm_add_epi8(x_ref_pos_mask_temp_r1_even, const_ones);
831 
832                     u1_incr_not_8x16b_r0_0 =
833                         _mm_unpacklo_epi8(u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r0_odd);
834                     u1_incr_not_8x16b_r1_0 =
835                         _mm_unpacklo_epi8(u1_incr_not_8x16b_r1_even, u1_incr_not_8x16b_r1_odd);
836                     x_ref_pos_mask_temp_r0_0 =
837                         _mm_unpacklo_epi8(x_ref_pos_mask_temp_r0_even, x_ref_pos_mask_temp_r0_odd);
838                     x_ref_pos_mask_temp_r1_0 =
839                         _mm_unpacklo_epi8(x_ref_pos_mask_temp_r1_even, x_ref_pos_mask_temp_r1_odd);
840 
841                     u1_incr_not_8x16b_r0_1 =
842                         _mm_unpackhi_epi8(u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r0_odd);
843                     u1_incr_not_8x16b_r1_1 =
844                         _mm_unpackhi_epi8(u1_incr_not_8x16b_r1_even, u1_incr_not_8x16b_r1_odd);
845                     x_ref_pos_mask_temp_r0_1 =
846                         _mm_unpackhi_epi8(x_ref_pos_mask_temp_r0_even, x_ref_pos_mask_temp_r0_odd);
847                     x_ref_pos_mask_temp_r1_1 =
848                         _mm_unpackhi_epi8(x_ref_pos_mask_temp_r1_even, x_ref_pos_mask_temp_r1_odd);
849 
850                     u1_incr_not_8x16b_r0_1 = _mm_sub_epi8(u1_incr_not_8x16b_r0_1, mid_indx_16x8b);
851                     u1_incr_not_8x16b_r1_1 = _mm_sub_epi8(u1_incr_not_8x16b_r1_1, mid_indx_16x8b);
852                     x_ref_pos_mask_temp_r0_1 =
853                         _mm_sub_epi8(x_ref_pos_mask_temp_r0_1, mid_indx_16x8b);
854                     x_ref_pos_mask_temp_r1_1 =
855                         _mm_sub_epi8(x_ref_pos_mask_temp_r1_1, mid_indx_16x8b);
856 
857                     ref_arr_temp0_8x16b_r0_0 =
858                         _mm_shuffle_epi8(ref_arr_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
859                     ref_arr_temp0_8x16b_r1_0 =
860                         _mm_shuffle_epi8(ref_arr_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
861                     ref_arr_temp1_8x16b_r0_0 =
862                         _mm_shuffle_epi8(ref_arr_8x16b_r0_0, x_ref_pos_mask_temp_r0_0);
863                     ref_arr_temp1_8x16b_r1_0 =
864                         _mm_shuffle_epi8(ref_arr_8x16b_r1_0, x_ref_pos_mask_temp_r1_0);
865                     ref_arr_temp0_8x16b_r0_1 =
866                         _mm_shuffle_epi8(ref_arr_8x16b_r0_1, u1_incr_not_8x16b_r0_1);
867                     ref_arr_temp0_8x16b_r1_1 =
868                         _mm_shuffle_epi8(ref_arr_8x16b_r1_1, u1_incr_not_8x16b_r1_1);
869                     ref_arr_temp1_8x16b_r0_1 =
870                         _mm_shuffle_epi8(ref_arr_8x16b_r0_1, x_ref_pos_mask_temp_r0_1);
871                     ref_arr_temp1_8x16b_r1_1 =
872                         _mm_shuffle_epi8(ref_arr_8x16b_r1_1, x_ref_pos_mask_temp_r1_1);
873 
874                     res0_8x16b_r0_0 =
875                         _mm_mullo_epi16(ref_arr_temp0_8x16b_r0_0, phs_mask_16min_8x16b_0);
876                     res0_8x16b_r1_0 =
877                         _mm_mullo_epi16(ref_arr_temp0_8x16b_r1_0, phs_mask_16min_8x16b_0);
878                     res1_8x16b_r0_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r0_0, phs_mask_8x16b_0);
879                     res1_8x16b_r1_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r1_0, phs_mask_8x16b_0);
880                     res0_8x16b_r0_1 =
881                         _mm_mullo_epi16(ref_arr_temp0_8x16b_r0_1, phs_mask_16min_8x16b_1);
882                     res0_8x16b_r1_1 =
883                         _mm_mullo_epi16(ref_arr_temp0_8x16b_r1_1, phs_mask_16min_8x16b_1);
884                     res1_8x16b_r0_1 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r0_1, phs_mask_8x16b_1);
885                     res1_8x16b_r1_1 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r1_1, phs_mask_8x16b_1);
886 
887                     res_8x16b_r0_0 = _mm_add_epi16(res0_8x16b_r0_0, res1_8x16b_r0_0);
888                     res_8x16b_r1_0 = _mm_add_epi16(res0_8x16b_r1_0, res1_8x16b_r1_0);
889                     res_8x16b_r0_1 = _mm_add_epi16(res0_8x16b_r0_1, res1_8x16b_r0_1);
890                     res_8x16b_r1_1 = _mm_add_epi16(res0_8x16b_r1_1, res1_8x16b_r1_1);
891 
892                     prev_res_8x16b_r0_0 = res_8x16b_r0_0;
893                     prev_res_8x16b_r1_0 = res_8x16b_r1_0;
894                     prev_res_8x16b_r0_1 = res_8x16b_r0_1;
895                     prev_res_8x16b_r1_1 = res_8x16b_r1_1;
896 
897                     pu1_ref_y_ptr_incr_temp =
898                         pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
899                     u1_y_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_y_ptr_incr_temp));
900 
901                     u1_y_incr_8x16b_r0_0 =
902                         _mm_shuffle_epi8(u1_y_incr_8x16b_r0_0, x_ref_rnd_mask_r0_0);
903 
904                     u1_y_incr_8x16b_r0_low = _mm_cvtepi8_epi16(u1_y_incr_8x16b_r0_0);
905                     u1_y_incr_8x16b_r0_high =
906                         _mm_cvtepi8_epi16(_mm_unpackhi_epi64(u1_y_incr_8x16b_r0_0, const_ones));
907 
908                     u1_y_incr_8x16b_r0_0 =
909                         _mm_cmpeq_epi16(u1_y_incr_8x16b_r0_low, const_ones_8x16b);
910                     u1_y_incr_8x16b_r0_1 =
911                         _mm_cmpeq_epi16(u1_y_incr_8x16b_r0_high, const_ones_8x16b);
912 
913                     u1_prev_y_incr_8x16b_r0_0 = u1_y_incr_8x16b_r0_0;
914                     u1_prev_y_incr_8x16b_r0_1 = u1_y_incr_8x16b_r0_1;
915                 }
916             }
917 
918             if(zero_r0_r1)
919             {
920                 res_8x16b_l = _mm_set1_epi16(0);
921                 res_8x16b_h = _mm_set1_epi16(0);
922             }
923             else
924             {
925                 i4_y_phase = pi1_y_phase[i4_y];
926 
927                 if((i4_y_phase) >> 3)
928                 {
929                     vert_res0_8x16b_r0_0 =
930                         _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
931                     vert_res1_8x16b_r0_0 =
932                         _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
933                     vert_res0_8x16b_r0_1 =
934                         _mm_blendv_epi8(res_8x16b_r1_1, res_8x16b_r0_1, u1_y_incr_8x16b_r0_1);
935                     vert_res1_8x16b_r0_1 =
936                         _mm_blendv_epi8(res_8x16b_r1_1, res_8x16b_r1_1, u1_y_incr_8x16b_r0_1);
937                 }
938                 else
939                 {
940                     vert_res0_8x16b_r0_0 =
941                         _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
942                     vert_res1_8x16b_r0_0 =
943                         _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
944                     vert_res0_8x16b_r0_1 =
945                         _mm_blendv_epi8(res_8x16b_r0_1, res_8x16b_r0_1, u1_y_incr_8x16b_r0_1);
946                     vert_res1_8x16b_r0_1 =
947                         _mm_blendv_epi8(res_8x16b_r0_1, res_8x16b_r1_1, u1_y_incr_8x16b_r0_1);
948                 }
949                 res0_8x16b_r0_0 = _mm_unpacklo_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
950                 res1_8x16b_r0_0 = _mm_unpackhi_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
951                 res0_8x16b_r0_1 = _mm_unpacklo_epi16(vert_res0_8x16b_r0_1, vert_res1_8x16b_r0_1);
952                 res1_8x16b_r0_1 = _mm_unpackhi_epi16(vert_res0_8x16b_r0_1, vert_res1_8x16b_r0_1);
953 
954                 phs_y_mask_16min_8x16b = _mm_set1_epi16(16 - i4_y_phase);
955                 phs_y_mask_8x16b = _mm_set1_epi16(i4_y_phase);
956                 phs_y_mask_mix_8x16b = _mm_unpacklo_epi16(phs_y_mask_16min_8x16b, phs_y_mask_8x16b);
957 
958                 res_4x32b_l_0 = _mm_madd_epi16(res0_8x16b_r0_0, phs_y_mask_mix_8x16b);
959                 res_4x32b_l_1 = _mm_madd_epi16(res1_8x16b_r0_0, phs_y_mask_mix_8x16b);
960                 res_4x32b_h_0 = _mm_madd_epi16(res0_8x16b_r0_1, phs_y_mask_mix_8x16b);
961                 res_4x32b_h_1 = _mm_madd_epi16(res1_8x16b_r0_1, phs_y_mask_mix_8x16b);
962 
963                 res_4x32b_l_0 = _mm_add_epi32(res_4x32b_l_0, const_128);
964                 res_4x32b_l_1 = _mm_add_epi32(res_4x32b_l_1, const_128);
965                 res_4x32b_h_0 = _mm_add_epi32(res_4x32b_h_0, const_128);
966                 res_4x32b_h_1 = _mm_add_epi32(res_4x32b_h_1, const_128);
967 
968                 res_4x32b_l_0 = _mm_srai_epi32(res_4x32b_l_0, 8);
969                 res_4x32b_l_1 = _mm_srai_epi32(res_4x32b_l_1, 8);
970                 res_4x32b_h_0 = _mm_srai_epi32(res_4x32b_h_0, 8);
971                 res_4x32b_h_1 = _mm_srai_epi32(res_4x32b_h_1, 8);
972                 res_8x16b_l = _mm_packs_epi32(res_4x32b_l_0, res_4x32b_l_1);
973                 res_8x16b_h = _mm_packs_epi32(res_4x32b_h_0, res_4x32b_h_1);
974             }
975 
976             out_stride_temp = (i4_y * i4_out_stride);
977             _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), res_8x16b_l);
978             _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), res_8x16b_h);
979         }
980     }
981     else
982     {
983         __m128i const_16_8x16b, const_128, const_ones, const_ones_8x16b;
984         __m128i ref_arr_8x16b_r0_0;
985         __m128i ref_arr_8x16b_r1_0;
986         __m128i phs_mask_8x16b_0, phs_mask_div8_8x16b_0, phs_mask_16min_8x16b_0;
987         __m128i x_ref_pos_mask_r0, x_ref_rnd_mask_r0_0;
988         __m128i x_ref_pos_mask_temp_r0_0;
989         __m128i x_ref_pos_mask_temp_r1_0;
990 
991         __m128i u1_incr_8x16b_r0_0, ref_arr_temp0_8x16b_r0_0, res0_8x16b_r0_0,
992             u1_incr_not_8x16b_r0_0;
993         __m128i u1_incr_8x16b_r1_0, ref_arr_temp1_8x16b_r0_0, res1_8x16b_r0_0;
994         __m128i u1_y_incr_8x16b_r0_0;
995 
996         __m128i u1_incr_not_8x16b_r0_odd, u1_incr_not_8x16b_r1_odd, x_ref_pos_mask_temp_r0_odd,
997             x_ref_pos_mask_temp_r1_odd;
998         __m128i u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r1_even, x_ref_pos_mask_temp_r0_even,
999             x_ref_pos_mask_temp_r1_even;
1000 
1001         __m128i ref_arr_temp0_8x16b_r1_0, res_8x16b_r0_0, res0_8x16b_r1_0, u1_incr_not_8x16b_r1_0;
1002         __m128i ref_arr_temp1_8x16b_r1_0, res_8x16b_r1_0, res1_8x16b_r1_0;
1003         __m128i u1_prev_y_incr_8x16b_r0_0;
1004         __m128i prev_res_8x16b_r0_0;
1005         __m128i prev_res_8x16b_r1_0;
1006 
1007         __m128i vert_res0_8x16b_r0_0, res_4x32b_l_0, out_4x32b_l;
1008         __m128i vert_res1_8x16b_r0_0, res_4x32b_l_1, out_4x32b_h;
1009         __m128i phs_y_mask_16min_8x16b, phs_y_mask_8x16b, phs_y_mask_mix_8x16b;
1010         __m128i chroma_mask, chroma_mask2;
1011         __m128i zero_8x16b = _mm_set1_epi16(0);
1012         WORD32 zero_r0_0, zero_r1_0, zero_r0_r1 = 0;
1013         WORD16 *pi2_ref_array_temp;
1014         UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
1015         WORD32 i4_y_phase;
1016         WORD32 out_stride_temp;
1017         const_ones = _mm_set1_epi8(1);
1018         const_ones_8x16b = _mm_set1_epi16(1);
1019         const_128 = _mm_set1_epi32(128);
1020 
1021         for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1022         {
1023             arr_y_phase[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
1024             arr_y_ref_pos[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
1025         }
1026         pi1_y_ref_pos = arr_y_ref_pos;
1027         pi1_y_phase = arr_y_phase;
1028 
1029         for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
1030         {
1031             arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
1032             arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
1033         }
1034 
1035         pi1_x_ref_pos = arr_x_ref_pos;
1036         pi1_x_phase = arr_x_phase;
1037 
1038         phs_mask_8x16b_0 = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i *) (pi1_x_phase)));
1039         x_ref_pos_mask_r0 = _mm_loadu_si128((__m128i *) (pi1_x_ref_pos));
1040 
1041         const_16_8x16b = _mm_set1_epi16(16);
1042         chroma_mask = _mm_set1_epi32(0xFFFF0000);
1043         chroma_mask2 = _mm_set1_epi32(0x0000FFFF);
1044         phs_mask_div8_8x16b_0 = _mm_srli_epi16(phs_mask_8x16b_0, 3);
1045         phs_mask_div8_8x16b_0 = _mm_packs_epi16(phs_mask_div8_8x16b_0, const_ones);
1046 
1047         phs_mask_16min_8x16b_0 = _mm_sub_epi16(const_16_8x16b, phs_mask_8x16b_0);
1048         x_ref_rnd_mask_r0_0 = _mm_add_epi8(x_ref_pos_mask_r0, phs_mask_div8_8x16b_0);
1049 
1050         for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1051         {
1052             if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
1053             {
1054                 if(zero_r0_r1)
1055                 {
1056                     res_4x32b_l_0 = _mm_set1_epi32(0);
1057                     res_4x32b_l_1 = _mm_set1_epi32(0);
1058                     out_stride_temp = (i4_y * i4_out_stride);
1059 
1060                     out_4x32b_l = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp));
1061                     out_4x32b_h = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp + 8));
1062 
1063                     out_4x32b_l = _mm_and_si128(out_4x32b_l, chroma_mask);
1064                     out_4x32b_h = _mm_and_si128(out_4x32b_h, chroma_mask);
1065 
1066                     res_4x32b_l_0 = _mm_and_si128(res_4x32b_l_0, chroma_mask2);
1067                     res_4x32b_l_1 = _mm_and_si128(res_4x32b_l_1, chroma_mask2);
1068 
1069                     out_4x32b_l = _mm_add_epi8(res_4x32b_l_0, out_4x32b_l);
1070                     out_4x32b_h = _mm_add_epi8(res_4x32b_l_1, out_4x32b_h);
1071 
1072                     _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), out_4x32b_l);
1073                     _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), out_4x32b_h);
1074                     continue;
1075                 }
1076 
1077                 res_8x16b_r0_0 = prev_res_8x16b_r0_0;
1078                 res_8x16b_r1_0 = prev_res_8x16b_r1_0;
1079 
1080                 u1_y_incr_8x16b_r0_0 = u1_prev_y_incr_8x16b_r0_0;
1081             }
1082             else
1083             {
1084                 pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1085                 pu1_ref_x_ptr_incr_temp =
1086                     pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1087                 ref_arr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pi2_ref_array_temp));
1088                 ref_arr_8x16b_r1_0 =
1089                     _mm_loadu_si128((__m128i *) (pi2_ref_array_temp + i4_refarray_wd));
1090 
1091                 zero_r0_0 = _mm_test_all_ones(_mm_cmpeq_epi16(
1092                     ref_arr_8x16b_r0_0, zero_8x16b));  // return 1 if all zeros, else 0
1093                 zero_r1_0 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r1_0, zero_8x16b));
1094 
1095                 zero_r0_r1 = zero_r0_0 && zero_r1_0;
1096 
1097                 if(!zero_r0_r1)
1098                 {
1099                     u1_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp));
1100                     u1_incr_8x16b_r1_0 =
1101                         _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp + i4_refarray_wd));
1102 
1103                     u1_incr_8x16b_r0_0 = _mm_shuffle_epi8(u1_incr_8x16b_r0_0, x_ref_pos_mask_r0);
1104                     u1_incr_8x16b_r1_0 = _mm_shuffle_epi8(u1_incr_8x16b_r1_0, x_ref_pos_mask_r0);
1105 
1106                     u1_incr_not_8x16b_r0_0 =
1107                         _mm_andnot_si128(u1_incr_8x16b_r0_0, phs_mask_div8_8x16b_0);
1108                     u1_incr_not_8x16b_r1_0 =
1109                         _mm_andnot_si128(u1_incr_8x16b_r1_0, phs_mask_div8_8x16b_0);
1110 
1111                     u1_incr_not_8x16b_r0_0 =
1112                         _mm_add_epi8(u1_incr_not_8x16b_r0_0, x_ref_pos_mask_r0);
1113                     u1_incr_not_8x16b_r1_0 =
1114                         _mm_add_epi8(u1_incr_not_8x16b_r1_0, x_ref_pos_mask_r0);
1115 
1116                     x_ref_pos_mask_temp_r0_0 =
1117                         _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_8x16b_r0_0);
1118                     x_ref_pos_mask_temp_r1_0 =
1119                         _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_8x16b_r1_0);
1120 
1121                     u1_incr_not_8x16b_r0_even =
1122                         _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
1123                     u1_incr_not_8x16b_r1_even =
1124                         _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
1125                     x_ref_pos_mask_temp_r0_even =
1126                         _mm_add_epi8(x_ref_pos_mask_temp_r0_0, x_ref_pos_mask_temp_r0_0);
1127                     x_ref_pos_mask_temp_r1_even =
1128                         _mm_add_epi8(x_ref_pos_mask_temp_r1_0, x_ref_pos_mask_temp_r1_0);
1129 
1130                     u1_incr_not_8x16b_r0_odd = _mm_add_epi8(u1_incr_not_8x16b_r0_even, const_ones);
1131                     u1_incr_not_8x16b_r1_odd = _mm_add_epi8(u1_incr_not_8x16b_r1_even, const_ones);
1132                     x_ref_pos_mask_temp_r0_odd =
1133                         _mm_add_epi8(x_ref_pos_mask_temp_r0_even, const_ones);
1134                     x_ref_pos_mask_temp_r1_odd =
1135                         _mm_add_epi8(x_ref_pos_mask_temp_r1_even, const_ones);
1136 
1137                     u1_incr_not_8x16b_r0_0 =
1138                         _mm_unpacklo_epi8(u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r0_odd);
1139                     u1_incr_not_8x16b_r1_0 =
1140                         _mm_unpacklo_epi8(u1_incr_not_8x16b_r1_even, u1_incr_not_8x16b_r1_odd);
1141                     x_ref_pos_mask_temp_r0_0 =
1142                         _mm_unpacklo_epi8(x_ref_pos_mask_temp_r0_even, x_ref_pos_mask_temp_r0_odd);
1143                     x_ref_pos_mask_temp_r1_0 =
1144                         _mm_unpacklo_epi8(x_ref_pos_mask_temp_r1_even, x_ref_pos_mask_temp_r1_odd);
1145 
1146                     ref_arr_temp0_8x16b_r0_0 =
1147                         _mm_shuffle_epi8(ref_arr_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
1148                     ref_arr_temp0_8x16b_r1_0 =
1149                         _mm_shuffle_epi8(ref_arr_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
1150                     ref_arr_temp1_8x16b_r0_0 =
1151                         _mm_shuffle_epi8(ref_arr_8x16b_r0_0, x_ref_pos_mask_temp_r0_0);
1152                     ref_arr_temp1_8x16b_r1_0 =
1153                         _mm_shuffle_epi8(ref_arr_8x16b_r1_0, x_ref_pos_mask_temp_r1_0);
1154 
1155                     res0_8x16b_r0_0 =
1156                         _mm_mullo_epi16(ref_arr_temp0_8x16b_r0_0, phs_mask_16min_8x16b_0);
1157                     res0_8x16b_r1_0 =
1158                         _mm_mullo_epi16(ref_arr_temp0_8x16b_r1_0, phs_mask_16min_8x16b_0);
1159                     res1_8x16b_r0_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r0_0, phs_mask_8x16b_0);
1160                     res1_8x16b_r1_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r1_0, phs_mask_8x16b_0);
1161 
1162                     res_8x16b_r0_0 = _mm_add_epi16(res0_8x16b_r0_0, res1_8x16b_r0_0);
1163                     res_8x16b_r1_0 = _mm_add_epi16(res0_8x16b_r1_0, res1_8x16b_r1_0);
1164 
1165                     pu1_ref_y_ptr_incr_temp =
1166                         pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
1167                     u1_y_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_y_ptr_incr_temp));
1168 
1169                     u1_y_incr_8x16b_r0_0 =
1170                         _mm_shuffle_epi8(u1_y_incr_8x16b_r0_0, x_ref_rnd_mask_r0_0);
1171 
1172                     u1_y_incr_8x16b_r0_0 = _mm_cvtepi8_epi16(u1_y_incr_8x16b_r0_0);
1173                     u1_y_incr_8x16b_r0_0 = _mm_cmpeq_epi16(u1_y_incr_8x16b_r0_0, const_ones_8x16b);
1174                     u1_prev_y_incr_8x16b_r0_0 = u1_y_incr_8x16b_r0_0;
1175 
1176                     prev_res_8x16b_r0_0 = res_8x16b_r0_0;
1177                     prev_res_8x16b_r1_0 = res_8x16b_r1_0;
1178                 }
1179             }
1180 
1181             if(zero_r0_r1)
1182             {
1183                 res_4x32b_l_0 = _mm_set1_epi32(0);
1184                 res_4x32b_l_1 = _mm_set1_epi32(0);
1185             }
1186             else
1187             {
1188                 i4_y_phase = pi1_y_phase[i4_y];
1189 
1190                 if((i4_y_phase) >> 3)
1191                 {
1192                     vert_res0_8x16b_r0_0 =
1193                         _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
1194                     vert_res1_8x16b_r0_0 =
1195                         _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
1196                 }
1197                 else
1198                 {
1199                     vert_res0_8x16b_r0_0 =
1200                         _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
1201                     vert_res1_8x16b_r0_0 =
1202                         _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
1203                 }
1204 
1205                 res0_8x16b_r0_0 = _mm_unpacklo_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
1206                 res1_8x16b_r0_0 = _mm_unpackhi_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
1207 
1208                 phs_y_mask_16min_8x16b = _mm_set1_epi16(16 - i4_y_phase);
1209                 phs_y_mask_8x16b = _mm_set1_epi16(i4_y_phase);
1210                 phs_y_mask_mix_8x16b = _mm_unpacklo_epi16(phs_y_mask_16min_8x16b, phs_y_mask_8x16b);
1211 
1212                 res_4x32b_l_0 = _mm_madd_epi16(res0_8x16b_r0_0, phs_y_mask_mix_8x16b);
1213                 res_4x32b_l_1 = _mm_madd_epi16(res1_8x16b_r0_0, phs_y_mask_mix_8x16b);
1214                 res_4x32b_l_0 = _mm_add_epi32(res_4x32b_l_0, const_128);
1215                 res_4x32b_l_1 = _mm_add_epi32(res_4x32b_l_1, const_128);
1216 
1217                 res_4x32b_l_0 = _mm_srai_epi32(res_4x32b_l_0, 8);
1218                 res_4x32b_l_1 = _mm_srai_epi32(res_4x32b_l_1, 8);
1219             }
1220             out_stride_temp = (i4_y * i4_out_stride);
1221 
1222             out_4x32b_l = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp));
1223             out_4x32b_h = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp + 8));
1224 
1225             out_4x32b_l = _mm_and_si128(out_4x32b_l, chroma_mask);
1226             out_4x32b_h = _mm_and_si128(out_4x32b_h, chroma_mask);
1227 
1228             res_4x32b_l_0 = _mm_and_si128(res_4x32b_l_0, chroma_mask2);
1229             res_4x32b_l_1 = _mm_and_si128(res_4x32b_l_1, chroma_mask2);
1230 
1231             out_4x32b_l = _mm_add_epi8(res_4x32b_l_0, out_4x32b_l);
1232             out_4x32b_h = _mm_add_epi8(res_4x32b_l_1, out_4x32b_h);
1233 
1234             _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), out_4x32b_l);
1235             _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), out_4x32b_h);
1236         }
1237     }
1238     return;
1239 } /* End of Interpolation Function */
1240 
1241 /*****************************************************************************/
1242 /*                                                                           */
1243 /*  Function Name : isvcd_residual_reflayer_const_non_boundary_mb_sse42       */
1244 /*                                                                           */
/*  Description   : Builds the reference sample array for residual          */
/*                  resampling of a non-boundary MB: copies reference-layer */
/*                  residual rows, zeroing samples whose quadrant MB type   */
/*                  differs from 1 (per-quadrant blend masks)               */
/*                                                                           */
/*  Inputs        : input residual pointer/stride, output reference array   */
/*                  and its dimensions, four quadrant MB types, quadrant-1  */
/*                  partition position, chroma flag                         */
1248 /*  Globals       : none                                                     */
1249 /*  Processing    :                                                          */
1250 /*                                                                           */
1251 /*  Outputs       : none                                                     */
1252 /*  Returns       : none                                                     */
1253 /*                                                                           */
1254 /*  Issues        : none                                                     */
1255 /*                                                                           */
1256 /*  Revision History:                                                        */
1257 /*                                                                           */
1258 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1259 /*         25 11 2021   Kishore         creation                             */
1260 /*                                                                           */
1261 /*****************************************************************************/
1262 
void isvcd_residual_reflayer_const_non_boundary_mb_sse42(
    WORD16 *pi2_inp_data, WORD32 i4_inp_data_stride, WORD16 *pi2_ref_array, WORD32 i4_refarray_wd,
    WORD32 i4_refarray_ht, WORD32 i4_ref_mb_type_q0, WORD32 i4_ref_mb_type_q1,
    WORD32 i4_ref_mb_type_q2, WORD32 i4_ref_mb_type_q3, WORD32 i4_mb_quard1_part_x,
    WORD32 i4_mb_quard1_part_y, WORD32 i4_chroma_flag)
{
    WORD32 i4_y;

    WORD16 *pi2_ref_data_byte;
    WORD16 *pi2_ref_array_temp;
    if(i4_chroma_flag == 0)
    {
        /* Luma: each row is 16 WORD16 samples processed as two 8x16-bit
         * vectors. The *_0 masks gate the left 8 samples, *_1 the right 8;
         * the *_low_* masks apply to rows at/below the quadrant-1 row split.
         * A mask byte of 0xFF keeps the input sample, 0x00 forces it to 0. */
        WORD8 index_0[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
        __m128i ref_mb_type_8x16_q0, ref_mb_type_8x16_q1, ref_mb_type_8x16_q2, ref_mb_type_8x16_q3,
            mb_quard1_part_x_8x16;
        __m128i ref_mb_type_8x16_0, ref_mb_type_8x16_1;
        __m128i ref_mb_type_8x16_low_0, ref_mb_type_8x16_low_1;
        __m128i mb_type_mask_8x16_0 = _mm_set1_epi8(-1);
        __m128i mb_type_mask_8x16_1 = _mm_set1_epi8(-1);
        /* Initialize the lower-quadrant masks as well so that no path can
         * read them uninitialized (see the Quad 0 & 2 layout below) */
        __m128i mb_type_mask_8x16_low_0 = _mm_set1_epi8(-1);
        __m128i mb_type_mask_8x16_low_1 = _mm_set1_epi8(-1);
        __m128i mask_8x16_0;
        __m128i index_arr_0;
        __m128i inp_data_16x8_0, inp_data_16x8_1;
        __m128i res_16x8_0, res_16x8_1;
        __m128i one_8x16 = _mm_set1_epi8(1);
        __m128i zero_8x16 = _mm_set1_epi8(0);

        index_arr_0 = _mm_loadu_si128((__m128i *) index_0);
        ref_mb_type_8x16_q0 = _mm_set1_epi8(i4_ref_mb_type_q0);
        ref_mb_type_8x16_q1 = _mm_set1_epi8(i4_ref_mb_type_q1);
        ref_mb_type_8x16_q2 = _mm_set1_epi8(i4_ref_mb_type_q2);
        ref_mb_type_8x16_q3 = _mm_set1_epi8(i4_ref_mb_type_q3);
        if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
        {
            // Quad 0 only
            ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
            ref_mb_type_8x16_1 = ref_mb_type_8x16_q0;
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
            mb_type_mask_8x16_1 = mb_type_mask_8x16_0;
        }
        else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
                (i4_mb_quard1_part_x < i4_refarray_wd))
        {
            // Quad 0 & 1: split is horizontal only
            if(i4_mb_quard1_part_x == 8)
            {
                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;
            }
            else if(i4_mb_quard1_part_x < 8)
            {
                /* part_x is in WORD16 units; <<1 converts to byte index */
                mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
                mask_8x16_0 =
                    _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16);  // 0xFF where a < b

                ref_mb_type_8x16_0 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
                ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;
            }
            else
            {
                mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x - 8) << 1);
                mask_8x16_0 =
                    _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16);  // 0xFF where a < b

                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_1 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
            }

            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
            mb_type_mask_8x16_1 = _mm_cmpeq_epi8(ref_mb_type_8x16_1, one_8x16);
        }
        else
        {
            if(i4_mb_quard1_part_x >= i4_refarray_wd)
            {
                // Quad 0 & 2: split is vertical only
                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_1 = ref_mb_type_8x16_q0;

                ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;
                ref_mb_type_8x16_low_1 = ref_mb_type_8x16_q2;

                /* Bugfix: derive the blend masks here as every other layout
                 * does. Previously no masks were computed in this branch, so
                 * the low masks were read uninitialized (UB) in the row loop
                 * below and the upper masks stayed at their pass-through
                 * default, leaving non-matching MB types un-zeroed. */
                mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
                mb_type_mask_8x16_1 = mb_type_mask_8x16_0;

                mb_type_mask_8x16_low_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_0, one_8x16);
                mb_type_mask_8x16_low_1 = mb_type_mask_8x16_low_0;
            }
            else
            {
                // Quad 0, 1, 2, 3
                if(i4_mb_quard1_part_x == 8)
                {
                    ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                    ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;

                    ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;
                    ref_mb_type_8x16_low_1 = ref_mb_type_8x16_q3;
                }
                else if(i4_mb_quard1_part_x < 8)
                {
                    mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
                    mask_8x16_0 = _mm_cmplt_epi8(index_arr_0,
                                                 mb_quard1_part_x_8x16);  // 0xFF where a < b

                    ref_mb_type_8x16_0 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
                    ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;

                    ref_mb_type_8x16_low_0 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q3, ref_mb_type_8x16_q2, mask_8x16_0);
                    ref_mb_type_8x16_low_1 = ref_mb_type_8x16_q3;
                }
                else
                {
                    mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x - 8) << 1);
                    mask_8x16_0 = _mm_cmplt_epi8(index_arr_0,
                                                 mb_quard1_part_x_8x16);  // 0xFF where a < b

                    ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                    ref_mb_type_8x16_1 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);

                    ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;
                    ref_mb_type_8x16_low_1 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q3, ref_mb_type_8x16_q2, mask_8x16_0);
                }
                mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
                mb_type_mask_8x16_1 = _mm_cmpeq_epi8(ref_mb_type_8x16_1, one_8x16);

                mb_type_mask_8x16_low_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_0, one_8x16);
                mb_type_mask_8x16_low_1 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_1, one_8x16);
            }
        }

        if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
        {
            /* Rows above the quadrant-1 split use the upper masks,
             * rows at/below it use the low masks */
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                if(i4_y < i4_mb_quard1_part_y)
                {
                    res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_0, mb_type_mask_8x16_0);
                    res_16x8_1 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_1, mb_type_mask_8x16_1);
                }
                else
                {
                    res_16x8_0 =
                        _mm_blendv_epi8(zero_8x16, inp_data_16x8_0, mb_type_mask_8x16_low_0);
                    res_16x8_1 =
                        _mm_blendv_epi8(zero_8x16, inp_data_16x8_1, mb_type_mask_8x16_low_1);
                }

                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp + 8), res_16x8_1);
            }
        }
        else
        {
            /* No row split inside the array: one mask pair for every row */
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_0, mb_type_mask_8x16_0);
                res_16x8_1 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_1, mb_type_mask_8x16_1);

                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp + 8), res_16x8_1);
            }
        }
    }
    else
    {
        /* Chroma: the input row holds 16 interleaved WORD16 samples; the
         * even_mask shuffle gathers every other 16-bit sample (one chroma
         * component) so that a single 8x16-bit vector is stored per row */
        WORD8 index_0[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
        WORD8 even_mask_arr[16] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15};
        __m128i ref_mb_type_8x16_q0, ref_mb_type_8x16_q1, ref_mb_type_8x16_q2, ref_mb_type_8x16_q3,
            mb_quard1_part_x_8x16;
        __m128i ref_mb_type_8x16_0;
        __m128i ref_mb_type_8x16_low_0;
        __m128i mb_type_mask_8x16_0 = _mm_set1_epi8(-1);
        /* Initialized so the Quad 0 & 2 layout below cannot read it
         * uninitialized */
        __m128i mb_type_mask_8x16_low_0 = _mm_set1_epi8(-1);
        __m128i mask_8x16_0;
        __m128i index_arr_0, even_mask;
        __m128i inp_data_16x8_0, inp_data_16x8_1, inp_data_16x8;
        __m128i res_16x8_0;
        __m128i one_8x16 = _mm_set1_epi8(1);
        __m128i zero_8x16 = _mm_set1_epi8(0);

        index_arr_0 = _mm_loadu_si128((__m128i *) index_0);
        even_mask = _mm_loadu_si128((__m128i *) even_mask_arr);

        ref_mb_type_8x16_q0 = _mm_set1_epi8(i4_ref_mb_type_q0);
        ref_mb_type_8x16_q1 = _mm_set1_epi8(i4_ref_mb_type_q1);
        ref_mb_type_8x16_q2 = _mm_set1_epi8(i4_ref_mb_type_q2);
        ref_mb_type_8x16_q3 = _mm_set1_epi8(i4_ref_mb_type_q3);
        if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
        {
            // Quad 0 only
            ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
        }
        else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
                (i4_mb_quard1_part_x < i4_refarray_wd))
        {
            // Quad 0 & 1
            mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
            mask_8x16_0 =
                _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16);  // 0xFF where a < b

            ref_mb_type_8x16_0 =
                _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
        }
        else
        {
            if(i4_mb_quard1_part_x >= i4_refarray_wd)
            {
                // Quad 0 & 2
                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;

                /* Bugfix: derive the blend masks here as the other layouts
                 * do; previously mb_type_mask_8x16_low_0 was read
                 * uninitialized in the row loop below */
                mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
                mb_type_mask_8x16_low_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_0, one_8x16);
            }
            else
            {
                // Quad 0, 1, 2, 3
                mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
                mask_8x16_0 =
                    _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16);  // 0xFF where a < b

                ref_mb_type_8x16_0 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
                ref_mb_type_8x16_low_0 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q3, ref_mb_type_8x16_q2, mask_8x16_0);

                mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
                mb_type_mask_8x16_low_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_0, one_8x16);
            }
        }

        if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
        {
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                /* Gather the alternate 16-bit samples of each half into the
                 * low 64 bits, then merge the two halves */
                inp_data_16x8_0 = _mm_shuffle_epi8(inp_data_16x8_0, even_mask);
                inp_data_16x8_1 = _mm_shuffle_epi8(inp_data_16x8_1, even_mask);

                inp_data_16x8 = _mm_unpacklo_epi64(inp_data_16x8_0, inp_data_16x8_1);
                if(i4_y < i4_mb_quard1_part_y)
                {
                    res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8, mb_type_mask_8x16_0);
                }
                else
                {
                    res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8, mb_type_mask_8x16_low_0);
                }

                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
            }
        }
        else
        {
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                inp_data_16x8_0 = _mm_shuffle_epi8(inp_data_16x8_0, even_mask);
                inp_data_16x8_1 = _mm_shuffle_epi8(inp_data_16x8_1, even_mask);
                inp_data_16x8 = _mm_unpacklo_epi64(inp_data_16x8_0, inp_data_16x8_1);

                res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8, mb_type_mask_8x16_0);
                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
            }
        }
    }
}
1547