1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * isvcd_residual_resamp_sse42.c
24 *
25 * @brief
 *  Contains function definitions for residual resampling functions
27 *
28 * @author
29 * Kishore
30 *
31 * @par List of Functions:
32 * - isvcd_interpolate_residual_sse42
33 * - isvcd_residual_luma_dyadic_sse42
34 * - isvcd_residual_reflayer_const_non_boundary_mb_sse42
35 *
36 * @remarks
37 * None
38 *
39 *******************************************************************************
40 */
41 #include <immintrin.h>
42 #include <smmintrin.h>
43 #include <emmintrin.h>
44 /* User include files */
45 #include "ih264_typedefs.h"
46 #include "isvcd_structs.h"
47
48 /*****************************************************************************/
49 /* */
50 /* Function Name : isvcd_residual_luma_dyadic_sse42 */
51 /* */
52 /* Description : */
53 /* */
54 /* Inputs : */
55 /* Globals : none */
56 /* Processing : */
57 /* */
58 /* Outputs : none */
59 /* Returns : none */
60 /* */
61 /* Issues : none */
62 /* */
63 /* Revision History: */
64 /* */
65 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
66 /* 25 11 2021 Kishore creation */
67 /* */
68 /*****************************************************************************/
/* Dyadic (2x in each direction) upsampling of the reference-layer luma
 * residual for one MB.  Two paths:
 *   - T8x8 path (i4_ref_tx_size set and any non-zero coeffs): the full 8x8
 *     residual is horizontally interpolated into a 16-wide intermediate
 *     array, then vertically interpolated into the 16x16 output.
 *   - 4x4 path: each of the four 4x4 sub-blocks is upsampled independently
 *     to 8x8, gated per sub-block by the corresponding i4_ref_nnz bit.
 * Filter taps are the bilinear pair {3,1}/{1,3}; the horizontal stage keeps
 * results at 4x scale (no rounding), the vertical stage applies the final
 * rounding shifts: (x + 2) >> 2 at the top/bottom edges, (x + 8) >> 4 for
 * interior rows.
 */
isvcd_residual_luma_dyadic_sse42(void * pv_residual_samp_ctxt,WORD16 * pi2_inp_data,WORD32 i4_inp_data_stride,WORD16 * pi2_out_res,WORD32 i4_out_res_stride,mem_element_t * ps_ref_mb_mode,UWORD16 u2_mb_x,UWORD16 u2_mb_y,WORD32 i4_ref_nnz,WORD32 i4_ref_tx_size)69 void isvcd_residual_luma_dyadic_sse42(void *pv_residual_samp_ctxt, WORD16 *pi2_inp_data,
                                      WORD32 i4_inp_data_stride, WORD16 *pi2_out_res,
                                      WORD32 i4_out_res_stride, mem_element_t *ps_ref_mb_mode,
                                      UWORD16 u2_mb_x, UWORD16 u2_mb_y, WORD32 i4_ref_nnz,
                                      WORD32 i4_ref_tx_size)

{
    WORD16 *pi2_refarray_buffer; /* intermediate buffer holding horizontally interpolated samples */
    WORD32 i4_blk_ctr;
    residual_sampling_ctxt_t *ps_ctxt;

    /* These parameters are unused by the dyadic path. */
    UNUSED(ps_ref_mb_mode);
    UNUSED(u2_mb_x);
    UNUSED(u2_mb_y);

    ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
    pi2_refarray_buffer = ps_ctxt->pi2_refarray_buffer;

    /* based on transform size the counter and interpolation width and */
    /* height are initialised as follows */

    if((i4_ref_tx_size) && (0 != i4_ref_nnz))
    {
        /* ---------------- T8x8 path: one 8x8 block -> 16x16 ---------------- */
        WORD16 *pi2_ref_data_byte;
        WORD32 i4_i, i4_j;
        WORD16 *pi2_refarray_buffer_tmp = pi2_refarray_buffer;

        __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
        __m128i res_8x16b_r1_0, res_8x16b_r1_1;
        __m128i final_res_8x16b_r1_0, final_res_8x16b_r1_1;

        __m128i coeff_add_8x16b_r1;

        __m128i coeff_add_8x16b_r2;
        __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
        __m128i res_8x16b_r2_0, res_8x16b_r2_1;
        __m128i final_res_8x16b_r2_0, final_res_8x16b_r2_1;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
        /* Two input rows per iteration; each 8-sample row expands to a
         * 16-sample row in the intermediate array (row stride 16). */
        for(i4_i = 0; i4_i < BLOCK_HEIGHT; i4_i += 2)
        {
            i2_coeff_8x16b_r1_0 =
                _mm_loadu_si128((__m128i *) pi2_ref_data_byte); // a0 a1 a2 a3 a4 a5 a6 a7
            i2_coeff_8x16b_r2_0 = _mm_loadu_si128(
                (__m128i *) (pi2_ref_data_byte + i4_inp_data_stride)); // b0 b1 b2 b3 b4 b5 b6 b7

            i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0, 2); // a1 a2 a3 a4 a5 a6 a7 0
            i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0, 2); // b1 b2 b3 b4 b5 b6 b7 0

            /* a(i) + a(i+1) : shared term of the {3,1} and {1,3} taps */
            coeff_add_8x16b_r1 = _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
            coeff_add_8x16b_r2 = _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);

            i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
            i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);

            i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
            i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);

            /* 2*a(i) + (a(i) + a(i+1)) = 3*a(i) + a(i+1) */
            res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
            res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);

            /* 2*a(i+1) + (a(i) + a(i+1)) = a(i) + 3*a(i+1) */
            res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
            res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);

            /* Interleave the two phases to get the upsampled sample order. */
            final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
            final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);

            final_res_8x16b_r1_1 = _mm_unpackhi_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
            final_res_8x16b_r2_1 = _mm_unpackhi_epi16(res_8x16b_r2_0, res_8x16b_r2_1);

            /* Interior samples go to indices 1..14 of each 16-wide row; the
             * vector stores also touch indices 15/16 which are overwritten by
             * the scalar edge writes below, so store order matters here. */
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1), final_res_8x16b_r1_0);
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9), final_res_8x16b_r1_1);

            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17), final_res_8x16b_r2_0);
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25), final_res_8x16b_r2_1);

            /* Edge samples: replicate the border sample, scaled by 4 to stay
             * at the same fixed-point scale as the interpolated samples. */
            pi2_refarray_buffer[0] = (pi2_ref_data_byte[0] << 2);
            pi2_refarray_buffer[15] = (pi2_ref_data_byte[7] << 2);
            pi2_ref_data_byte += i4_inp_data_stride;
            pi2_refarray_buffer[16] = (pi2_ref_data_byte[0] << 2);
            pi2_refarray_buffer[31] = (pi2_ref_data_byte[7] << 2);

            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
            pi2_refarray_buffer += 32; /* advance two 16-wide rows */
        }

        /* ----------- Vertical Interpolation ---------------- */
        pi2_refarray_buffer = pi2_refarray_buffer_tmp;

        {
            __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r1_3,
                i4_horz_samp_4x32b_r1_4;
            __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r2_3,
                i4_horz_samp_4x32b_r2_4;
            __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2, i4_res_samp_4x32b_r1_3,
                i4_res_samp_4x32b_r1_4;
            __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2, i4_res_samp_4x32b_r2_3,
                i4_res_samp_4x32b_r2_4;
            __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2, horz_add_4x32b_r2_3,
                horz_add_4x32b_r2_4;

            __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r2_1;
            __m128i i4_horz_samp_8x16b_r1_2, i4_horz_samp_8x16b_r2_2;
            __m128i i4_horz_samp_8x16b_r1_3, i4_horz_samp_8x16b_r2_3;
            __m128i i4_horz_samp_8x16b_r1_4, i4_horz_samp_8x16b_r2_4;

            __m128i twos = _mm_set1_epi32(2);   /* rounding for edge rows: (x+2)>>2 */
            __m128i eights = _mm_set1_epi32(8); /* rounding for interior rows: (x+8)>>4 */

            WORD16 *pi2_out;

            pi2_out = pi2_out_res;

            /* Load the first 16-wide intermediate row, widened to 32 bit so
             * the tap sums cannot overflow. */
            i4_horz_samp_8x16b_r1_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
            i4_horz_samp_8x16b_r1_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
            i4_horz_samp_8x16b_r1_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
            i4_horz_samp_8x16b_r1_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));

            i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
            i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
            i4_horz_samp_4x32b_r1_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_3);
            i4_horz_samp_4x32b_r1_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_4);

            /* populate the first inter sample */
            /* Top edge output row: replicated, with final rounding (x+2)>>2. */
            i4_res_samp_4x32b_r1_1 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
            i4_res_samp_4x32b_r1_2 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
            i4_res_samp_4x32b_r1_3 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
            i4_res_samp_4x32b_r1_4 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);

            _mm_storeu_si128((__m128i *) pi2_out,
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
            _mm_storeu_si128((__m128i *) (pi2_out + 8),
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
            pi2_out += i4_out_res_stride;

            /* 7 intermediate-row pairs -> 14 interior output rows. */
            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
                pi2_refarray_buffer += MB_WIDTH; /* next 16-wide intermediate row */

                i4_horz_samp_8x16b_r2_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
                i4_horz_samp_8x16b_r2_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
                i4_horz_samp_8x16b_r2_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
                i4_horz_samp_8x16b_r2_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));

                i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
                i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
                i4_horz_samp_4x32b_r2_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_3);
                i4_horz_samp_4x32b_r2_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_4);

                /* r1 + r2 : shared term of the vertical {3,1}/{1,3} taps */
                horz_add_4x32b_r2_1 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
                horz_add_4x32b_r2_2 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
                horz_add_4x32b_r2_3 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_3, i4_horz_samp_4x32b_r2_3);
                horz_add_4x32b_r2_4 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_4, i4_horz_samp_4x32b_r2_4);

                /* 3*r1 + r2 (first of the two output rows for this pair) */
                i4_res_samp_4x32b_r1_1 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
                i4_res_samp_4x32b_r1_2 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
                i4_res_samp_4x32b_r1_3 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_3, 1), horz_add_4x32b_r2_3);
                i4_res_samp_4x32b_r1_4 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_4, 1), horz_add_4x32b_r2_4);

                /* r1 + 3*r2 (second output row) */
                i4_res_samp_4x32b_r2_1 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
                i4_res_samp_4x32b_r2_2 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
                i4_res_samp_4x32b_r2_3 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_3, 1), horz_add_4x32b_r2_3);
                i4_res_samp_4x32b_r2_4 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_4, 1), horz_add_4x32b_r2_4);

                /* Final rounding: total gain is 4 (horz) * 4 (vert) = 16. */
                i4_res_samp_4x32b_r1_1 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
                i4_res_samp_4x32b_r1_2 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
                i4_res_samp_4x32b_r1_3 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_3, eights), 4);
                i4_res_samp_4x32b_r1_4 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_4, eights), 4);

                i4_res_samp_4x32b_r2_1 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
                i4_res_samp_4x32b_r2_2 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
                i4_res_samp_4x32b_r2_3 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_3, eights), 4);
                i4_res_samp_4x32b_r2_4 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_4, eights), 4);

                /* populate 2 samples based on current coeffs */
                _mm_storeu_si128((__m128i *) pi2_out,
                                 _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
                _mm_storeu_si128((__m128i *) (pi2_out + 8),
                                 _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
                pi2_out += i4_out_res_stride;

                _mm_storeu_si128((__m128i *) pi2_out,
                                 _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
                _mm_storeu_si128((__m128i *) (pi2_out + 8),
                                 _mm_packs_epi32(i4_res_samp_4x32b_r2_3, i4_res_samp_4x32b_r2_4));
                pi2_out += i4_out_res_stride;

                /* store the coeff 2 to coeff 1 */
                /* (used in next iteration) */
                i4_horz_samp_4x32b_r1_1 = i4_horz_samp_4x32b_r2_1;
                i4_horz_samp_4x32b_r1_2 = i4_horz_samp_4x32b_r2_2;
                i4_horz_samp_4x32b_r1_3 = i4_horz_samp_4x32b_r2_3;
                i4_horz_samp_4x32b_r1_4 = i4_horz_samp_4x32b_r2_4;
            }

            /* Bottom edge output row: replicated last intermediate row,
             * rounded (x+2)>>2 like the top edge. */
            i4_res_samp_4x32b_r1_1 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
            i4_res_samp_4x32b_r1_2 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
            i4_res_samp_4x32b_r1_3 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
            i4_res_samp_4x32b_r1_4 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);

            _mm_storeu_si128((__m128i *) pi2_out,
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
            _mm_storeu_si128((__m128i *) (pi2_out + 8),
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
        }
    }
    else
    {
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                        */
        /* 4x4 path: four 4x4 sub-blocks, each upsampled to 8x8 when its     */
        /* nnz bit is set; sub-blocks with no coded coeffs are skipped       */
        /* (output presumed pre-initialised by the caller — not visible here)*/
        /* ----------------------------------------------------------------- */
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
            /* if reference layer is not coded then no processing */
            if(0 != (i4_ref_nnz & 0x1))
            {
                __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
                __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
                __m128i i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1;
                __m128i i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1;

                __m128i res_8x16b_r1_0, res_8x16b_r1_1;
                __m128i res_8x16b_r2_0, res_8x16b_r2_1;
                __m128i res_8x16b_r3_0, res_8x16b_r3_1;
                __m128i res_8x16b_r4_0, res_8x16b_r4_1;
                __m128i final_res_8x16b_r1_0;
                __m128i final_res_8x16b_r2_0;
                __m128i final_res_8x16b_r3_0;
                __m128i final_res_8x16b_r4_0;

                __m128i coeff_add_8x16b_r1;
                __m128i coeff_add_8x16b_r2;
                __m128i coeff_add_8x16b_r3;
                __m128i coeff_add_8x16b_r4;

                /* ----------- Horizontal Interpolation ---------------- */
                /* Loads read 8 samples per row but only the first 4 are used
                 * (4x4 sub-block); same {3,1}/{1,3} taps as the T8x8 path. */

                i2_coeff_8x16b_r1_0 =
                    _mm_loadu_si128((__m128i *) pi2_inp_data); // a0 a1 a2 a3 a4 a5 a6 a7
                i2_coeff_8x16b_r2_0 = _mm_loadu_si128(
                    (__m128i *) (pi2_inp_data + i4_inp_data_stride)); // b0 b1 b2 b3 b4 b5 b6 b7
                i2_coeff_8x16b_r3_0 =
                    _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride << 1)));
                i2_coeff_8x16b_r4_0 =
                    _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride * 3)));

                i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0,
                                                     2); // a1 a2 a3 a4 a5 a6 a7 0
                i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0,
                                                     2); // b1 b2 b3 b4 b5 b6 b7 0
                i2_coeff_8x16b_r3_1 = _mm_srli_si128(i2_coeff_8x16b_r3_0, 2);
                i2_coeff_8x16b_r4_1 = _mm_srli_si128(i2_coeff_8x16b_r4_0, 2);

                coeff_add_8x16b_r1 = _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
                coeff_add_8x16b_r2 = _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);
                coeff_add_8x16b_r3 = _mm_add_epi16(i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1);
                coeff_add_8x16b_r4 = _mm_add_epi16(i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1);

                i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
                i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);
                i2_coeff_8x16b_r3_0 = _mm_slli_epi16(i2_coeff_8x16b_r3_0, 1);
                i2_coeff_8x16b_r4_0 = _mm_slli_epi16(i2_coeff_8x16b_r4_0, 1);

                i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
                i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);
                i2_coeff_8x16b_r3_1 = _mm_slli_epi16(i2_coeff_8x16b_r3_1, 1);
                i2_coeff_8x16b_r4_1 = _mm_slli_epi16(i2_coeff_8x16b_r4_1, 1);

                /* 3*a(i) + a(i+1) */
                res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
                res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);
                res_8x16b_r3_0 = _mm_add_epi16(i2_coeff_8x16b_r3_0, coeff_add_8x16b_r3);
                res_8x16b_r4_0 = _mm_add_epi16(i2_coeff_8x16b_r4_0, coeff_add_8x16b_r4);

                /* a(i) + 3*a(i+1) */
                res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
                res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);
                res_8x16b_r3_1 = _mm_add_epi16(i2_coeff_8x16b_r3_1, coeff_add_8x16b_r3);
                res_8x16b_r4_1 = _mm_add_epi16(i2_coeff_8x16b_r4_1, coeff_add_8x16b_r4);

                /* Only the low interleave is needed: 4 input samples -> 8
                 * interpolated values per row. */
                final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
                final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);
                final_res_8x16b_r3_0 = _mm_unpacklo_epi16(res_8x16b_r3_0, res_8x16b_r3_1);
                final_res_8x16b_r4_0 = _mm_unpacklo_epi16(res_8x16b_r4_0, res_8x16b_r4_1);

                /* Intermediate rows are 8 wide here; interior samples land at
                 * indices 1..6 of each row (vector store also covers 7/8,
                 * overwritten by the scalar edge writes below). */
                _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1), final_res_8x16b_r1_0);
                _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9), final_res_8x16b_r2_0);
                _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17), final_res_8x16b_r3_0);
                _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25), final_res_8x16b_r4_0);

                /* Edge samples, replicated and scaled by 4. */
                pi2_refarray_buffer[0] = (pi2_inp_data[0] << 2);
                pi2_refarray_buffer[7] = (pi2_inp_data[3] << 2);
                pi2_refarray_buffer[8] = (pi2_inp_data[i4_inp_data_stride] << 2);
                pi2_refarray_buffer[15] = (pi2_inp_data[i4_inp_data_stride + 3] << 2);
                pi2_refarray_buffer[16] = (pi2_inp_data[(i4_inp_data_stride << 1)] << 2);
                pi2_refarray_buffer[23] = (pi2_inp_data[(i4_inp_data_stride << 1) + 3] << 2);
                pi2_refarray_buffer[24] = (pi2_inp_data[(i4_inp_data_stride * 3)] << 2);
                pi2_refarray_buffer[31] = (pi2_inp_data[(i4_inp_data_stride * 3) + 3] << 2);

                /* ----------- Vertical Interpolation ---------------- */
                /* 4 intermediate rows -> 8 output rows, fully unrolled.
                 * Rows 0 and 7 are edge-replicated ((x+2)>>2); rows 1..6 use
                 * the {3,1}/{1,3} taps with (x+8)>>4 rounding. */
                {
                    __m128i i4_horz_samp_8x16b_r0_1, i4_horz_samp_8x16b_r0_2;
                    __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r1_2;
                    __m128i i4_horz_samp_8x16b_r2_1, i4_horz_samp_8x16b_r2_2;
                    __m128i i4_horz_samp_8x16b_r3_1, i4_horz_samp_8x16b_r3_2;

                    __m128i i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r0_2;
                    __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2;
                    __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2;
                    __m128i i4_horz_samp_4x32b_r3_1, i4_horz_samp_4x32b_r3_2;

                    __m128i i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2;
                    __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2;
                    __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2;
                    __m128i i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2;
                    __m128i i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2;
                    __m128i i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2;
                    __m128i i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2;
                    __m128i i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2;

                    __m128i horz_add_4x32b_r1_1, horz_add_4x32b_r1_2;
                    __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2;
                    __m128i horz_add_4x32b_r3_1, horz_add_4x32b_r3_2;

                    __m128i twos = _mm_set1_epi32(2);
                    __m128i eights = _mm_set1_epi32(8);

                    i4_horz_samp_8x16b_r0_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
                    i4_horz_samp_8x16b_r0_2 =
                        _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
                    i4_horz_samp_8x16b_r1_1 =
                        _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLOCK_WIDTH));
                    i4_horz_samp_8x16b_r1_2 =
                        _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLOCK_WIDTH + 4));
                    i4_horz_samp_8x16b_r2_1 =
                        _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH << 1)));
                    i4_horz_samp_8x16b_r2_2 =
                        _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH << 1) + 4));
                    i4_horz_samp_8x16b_r3_1 =
                        _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH * 3)));
                    i4_horz_samp_8x16b_r3_2 =
                        _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLOCK_WIDTH * 3) + 4));

                    /* Widen to 32 bit before the tap sums. */
                    i4_horz_samp_4x32b_r0_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_1);
                    i4_horz_samp_4x32b_r0_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_2);
                    i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
                    i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
                    i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
                    i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
                    i4_horz_samp_4x32b_r3_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_1);
                    i4_horz_samp_4x32b_r3_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_2);

                    /* Pairwise row sums shared by the {3,1}/{1,3} taps. */
                    horz_add_4x32b_r1_1 =
                        _mm_add_epi32(i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r1_1);
                    horz_add_4x32b_r2_1 =
                        _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
                    horz_add_4x32b_r3_1 =
                        _mm_add_epi32(i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r3_1);

                    horz_add_4x32b_r1_2 =
                        _mm_add_epi32(i4_horz_samp_4x32b_r0_2, i4_horz_samp_4x32b_r1_2);
                    horz_add_4x32b_r2_2 =
                        _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
                    horz_add_4x32b_r3_2 =
                        _mm_add_epi32(i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r3_2);

                    /* Interior output rows 1..6, left half (cols 0..3). */
                    i4_res_samp_4x32b_r1_1 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r0_1, 1), horz_add_4x32b_r1_1);
                    i4_res_samp_4x32b_r2_1 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r1_1);
                    i4_res_samp_4x32b_r3_1 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
                    i4_res_samp_4x32b_r4_1 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
                    i4_res_samp_4x32b_r5_1 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r3_1);
                    i4_res_samp_4x32b_r6_1 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r3_1, 1), horz_add_4x32b_r3_1);

                    /* Interior output rows 1..6, right half (cols 4..7). */
                    i4_res_samp_4x32b_r1_2 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r0_2, 1), horz_add_4x32b_r1_2);
                    i4_res_samp_4x32b_r2_2 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r1_2);
                    i4_res_samp_4x32b_r3_2 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
                    i4_res_samp_4x32b_r4_2 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
                    i4_res_samp_4x32b_r5_2 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r3_2);
                    i4_res_samp_4x32b_r6_2 = _mm_add_epi32(
                        _mm_slli_epi32(i4_horz_samp_4x32b_r3_2, 1), horz_add_4x32b_r3_2);

                    /* Rounding: edge rows (x+2)>>2, interior rows (x+8)>>4. */
                    i4_res_samp_4x32b_r0_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_1, twos), 2);
                    i4_res_samp_4x32b_r1_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
                    i4_res_samp_4x32b_r2_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
                    i4_res_samp_4x32b_r3_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_1, eights), 4);
                    i4_res_samp_4x32b_r4_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_1, eights), 4);
                    i4_res_samp_4x32b_r5_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_1, eights), 4);
                    i4_res_samp_4x32b_r6_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_1, eights), 4);
                    i4_res_samp_4x32b_r7_1 =
                        _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_1, twos), 2);

                    i4_res_samp_4x32b_r0_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_2, twos), 2);
                    i4_res_samp_4x32b_r1_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
                    i4_res_samp_4x32b_r2_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
                    i4_res_samp_4x32b_r3_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_2, eights), 4);
                    i4_res_samp_4x32b_r4_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_2, eights), 4);
                    i4_res_samp_4x32b_r5_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_2, eights), 4);
                    i4_res_samp_4x32b_r6_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_2, eights), 4);
                    i4_res_samp_4x32b_r7_2 =
                        _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_2, twos), 2);

                    /* populate 2 samples based on current coeffs */
                    /* Pack back to 16 bit and store the 8 output rows. */
                    _mm_storeu_si128(
                        (__m128i *) pi2_out_res,
                        _mm_packs_epi32(i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2));
                    _mm_storeu_si128(
                        (__m128i *) (pi2_out_res + i4_out_res_stride),
                        _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
                    _mm_storeu_si128(
                        (__m128i *) (pi2_out_res + (i4_out_res_stride << 1)),
                        _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
                    _mm_storeu_si128(
                        (__m128i *) (pi2_out_res + (i4_out_res_stride * 3)),
                        _mm_packs_epi32(i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2));
                    _mm_storeu_si128(
                        (__m128i *) (pi2_out_res + (i4_out_res_stride << 2)),
                        _mm_packs_epi32(i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2));
                    _mm_storeu_si128(
                        (__m128i *) (pi2_out_res + (i4_out_res_stride * 5)),
                        _mm_packs_epi32(i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2));
                    _mm_storeu_si128(
                        (__m128i *) (pi2_out_res + (i4_out_res_stride * 6)),
                        _mm_packs_epi32(i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2));
                    _mm_storeu_si128(
                        (__m128i *) (pi2_out_res + (i4_out_res_stride * 7)),
                        _mm_packs_epi32(i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2));

                    pi2_out_res += BLOCK_WIDTH; /* next 8x8 output quadrant */
                }
            }
            else
            {
                /* Sub-block has no coded residual: just advance the output. */
                pi2_out_res += BLOCK_WIDTH;
            }

            /* Block level loop updates */
            if(1 == i4_blk_ctr)
            {
                /* Move from the top pair of 4x4 sub-blocks to the bottom
                 * pair: rewind horizontally, step down vertically.  The nnz
                 * shift layout (>>2 here plus >>1 below) is assumed to match
                 * the caller's bit packing — TODO confirm against caller. */
                pi2_inp_data -= SUB_BLOCK_WIDTH;
                pi2_inp_data += (i4_inp_data_stride * SUB_BLOCK_HEIGHT);
                pi2_out_res -= MB_WIDTH;
                pi2_out_res += (i4_out_res_stride * BLOCK_HEIGHT);
                i4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += SUB_BLOCK_WIDTH;
            }

            i4_ref_nnz >>= 1; /* next sub-block's nnz flag into bit 0 */
        } /* end of loop over all the blocks */
    }
    return;
}
577
578 /*****************************************************************************/
579 /* */
580 /* Function Name : isvcd_interpolate_residual_sse42 */
581 /* */
582 /* Description : */
583 /* */
584 /* Inputs : */
585 /* Globals : none */
586 /* Processing : */
587 /* */
588 /* Outputs : none */
589 /* Returns : none */
590 /* */
591 /* Issues : none */
592 /* */
593 /* Revision History: */
594 /* */
595 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
596 /* 25 11 2021 Kishore creation */
597 /* */
598 /*****************************************************************************/
599
isvcd_interpolate_residual_sse42(void * pv_residual_samp_ctxt,WORD16 * pi2_out,WORD32 i4_out_stride,WORD32 i4_refarray_wd,UWORD16 u2_mb_x,UWORD16 u2_mb_y,WORD32 i4_chroma_flag)600 void isvcd_interpolate_residual_sse42(void *pv_residual_samp_ctxt, WORD16 *pi2_out,
601 WORD32 i4_out_stride, WORD32 i4_refarray_wd, UWORD16 u2_mb_x,
602 UWORD16 u2_mb_y, WORD32 i4_chroma_flag)
603 {
604 residual_sampling_ctxt_t *ps_ctxt;
605 residual_samp_map_ctxt_t *ps_map_ctxt;
606 res_lyr_ctxt *ps_lyr_ctxt;
607 ref_pixel_map_t *ps_x_pos_phase;
608 ref_pixel_map_t *ps_y_pos_phase;
609
610 WORD32 i4_x, i4_y;
611 WORD32 i4_frm_mb_x, i4_frm_mb_y;
612 WORD32 i4_temp_array_ht;
613 WORD32 i4_mb_wd;
614 WORD32 i4_mb_ht;
615 WORD16 *pi2_ref_array;
616 UWORD8 *pu1_ref_x_ptr_incr, *pu1_ref_y_ptr_incr;
617
618 WORD8 arr_y_ref_pos[16] = {0};
619 WORD8 arr_x_ref_pos[16] = {0};
620 WORD8 arr_x_phase[32] = {0};
621 WORD8 arr_y_phase[32] = {0};
622 WORD8 *pi1_y_ref_pos;
623 WORD8 *pi1_x_ref_pos;
624 WORD8 *pi1_y_phase;
625 WORD8 *pi1_x_phase;
626
627 ps_ctxt = (residual_sampling_ctxt_t *) pv_residual_samp_ctxt;
628 ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];
629 pi2_ref_array = ps_ctxt->pi2_refarray_buffer;
630 pu1_ref_x_ptr_incr = ps_ctxt->pu1_ref_x_ptr_incr;
631 pu1_ref_y_ptr_incr = ps_ctxt->pu1_ref_y_ptr_incr;
632
633 /* --------------------------------------------------------------------- */
634 /* Extracting information from the mapping context */
635 /* --------------------------------------------------------------------- */
636 if(1 == i4_chroma_flag)
637 ps_map_ctxt = &ps_lyr_ctxt->s_chroma_map_ctxt;
638 else
639 ps_map_ctxt = &ps_lyr_ctxt->s_luma_map_ctxt;
640
641 i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
642 i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;
643
644 ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
645 ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;
646
647 i4_temp_array_ht = i4_mb_ht;
648 i4_frm_mb_y = u2_mb_y * i4_mb_ht;
649 i4_frm_mb_x = u2_mb_x * i4_mb_wd;
650
651 /* --------------------------------------------------------------------- */
652 /* Loop for interpolation */
653 /* --------------------------------------------------------------------- */
654
655 if(i4_chroma_flag == 0)
656 {
657 __m128i const_16_8x16b, const_128, const_ones, const_ones_8x16b, mid_indx_16x8b;
658 __m128i ref_arr_8x16b_r0_0;
659 __m128i ref_arr_8x16b_r1_0;
660 __m128i phs_mask_8x16b_0, phs_mask_16min_8x16b_0, phs_mask_16x8b_0;
661 __m128i x_ref_pos_mask_r0, x_ref_rnd_mask_r0_0;
662 __m128i x_ref_pos_mask_temp_r0_0;
663 __m128i x_ref_pos_mask_temp_r1_0;
664 __m128i phs_mask_div8_8x16b_0;
665 __m128i u1_incr_8x16b_r0_0, ref_arr_temp0_8x16b_r0_0, res0_8x16b_r0_0,
666 u1_incr_not_8x16b_r0_0;
667 __m128i u1_incr_8x16b_r1_0, ref_arr_temp1_8x16b_r0_0, res1_8x16b_r0_0;
668
669 __m128i u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r1_even, x_ref_pos_mask_temp_r0_even,
670 x_ref_pos_mask_temp_r1_even;
671 __m128i u1_incr_not_8x16b_r0_odd, u1_incr_not_8x16b_r1_odd, x_ref_pos_mask_temp_r0_odd,
672 x_ref_pos_mask_temp_r1_odd;
673
674 __m128i ref_arr_temp0_8x16b_r1_0, res_8x16b_r0_0, res0_8x16b_r1_0, u1_incr_not_8x16b_r1_0;
675 __m128i ref_arr_temp1_8x16b_r1_0, res_8x16b_r1_0, res1_8x16b_r1_0;
676 __m128i u1_y_incr_8x16b_r0_0, u1_y_incr_8x16b_r0_1, u1_y_incr_8x16b_r0_low,
677 u1_y_incr_8x16b_r0_high;
678
679 __m128i prev_res_8x16b_r0_0;
680 __m128i prev_res_8x16b_r1_0;
681 __m128i prev_res_8x16b_r0_1;
682 __m128i prev_res_8x16b_r1_1;
683
684 __m128i u1_prev_y_incr_8x16b_r0_0;
685 __m128i u1_prev_y_incr_8x16b_r0_1;
686
687 __m128i ref_arr_8x16b_r0_1;
688 __m128i ref_arr_8x16b_r1_1;
689 __m128i phs_mask_8x16b_1, phs_mask_div8_8x16b_1, phs_mask_16min_8x16b_1;
690 __m128i x_ref_pos_mask_temp_r0_1;
691 __m128i x_ref_pos_mask_temp_r1_1;
692 __m128i ref_arr_temp0_8x16b_r0_1, res0_8x16b_r0_1, u1_incr_not_8x16b_r0_1;
693 __m128i ref_arr_temp1_8x16b_r0_1, res1_8x16b_r0_1;
694
695 __m128i ref_arr_temp0_8x16b_r1_1, res_8x16b_r0_1, res0_8x16b_r1_1, u1_incr_not_8x16b_r1_1;
696 __m128i ref_arr_temp1_8x16b_r1_1, res_8x16b_r1_1, res1_8x16b_r1_1;
697
698 __m128i vert_res0_8x16b_r0_0, vert_res0_8x16b_r0_1, res_4x32b_l_0, res_4x32b_h_0;
699 __m128i vert_res1_8x16b_r0_0, vert_res1_8x16b_r0_1, res_4x32b_l_1, res_4x32b_h_1;
700 __m128i res_8x16b_l, res_8x16b_h;
701 __m128i phs_y_mask_16min_8x16b, phs_y_mask_8x16b, phs_y_mask_mix_8x16b;
702 __m128i zero_8x16b;
703 WORD32 zero_r0_0, zero_r1_0, zero_r0_1, zero_r1_1, zero_r0_r1 = 0;
704 WORD32 strt_indx_h;
705 WORD16 *pi2_ref_array_temp;
706 UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
707 WORD32 i4_y_phase;
708 WORD32 out_stride_temp;
709 const_128 = _mm_set1_epi32(128);
710 zero_8x16b = _mm_set1_epi16(0);
711 const_ones = _mm_set1_epi8(1);
712 const_ones_8x16b = _mm_set1_epi16(1);
713
714 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
715 {
716 arr_y_phase[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
717 arr_y_ref_pos[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
718 }
719 pi1_y_ref_pos = arr_y_ref_pos;
720 pi1_y_phase = arr_y_phase;
721
722 strt_indx_h = 0;
723 strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos);
724 for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
725 {
726 arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
727 arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
728 }
729
730 pi1_x_ref_pos = arr_x_ref_pos;
731 pi1_x_phase = arr_x_phase;
732
733 x_ref_pos_mask_r0 = _mm_loadu_si128((__m128i *) (pi1_x_ref_pos));
734 phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi1_x_phase));
735 phs_mask_8x16b_0 = _mm_cvtepi8_epi16(phs_mask_16x8b_0);
736 phs_mask_8x16b_1 = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i *) (pi1_x_phase + 8)));
737
738 phs_mask_div8_8x16b_0 = _mm_srli_epi16(phs_mask_8x16b_0, 3);
739 phs_mask_div8_8x16b_1 = _mm_srli_epi16(phs_mask_8x16b_1, 3);
740 phs_mask_div8_8x16b_0 = _mm_packs_epi16(phs_mask_div8_8x16b_0, phs_mask_div8_8x16b_1);
741 const_16_8x16b = _mm_set1_epi16(16);
742
743 phs_mask_16min_8x16b_0 = _mm_sub_epi16(const_16_8x16b, phs_mask_8x16b_0);
744 phs_mask_16min_8x16b_1 = _mm_sub_epi16(const_16_8x16b, phs_mask_8x16b_1);
745
746 x_ref_rnd_mask_r0_0 = _mm_add_epi8(x_ref_pos_mask_r0, phs_mask_div8_8x16b_0);
747 mid_indx_16x8b = _mm_set1_epi8((strt_indx_h << 1));
748 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
749 {
750 if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
751 {
752 if(zero_r0_r1)
753 {
754 res_8x16b_l = _mm_set1_epi16(0);
755 res_8x16b_h = _mm_set1_epi16(0);
756 out_stride_temp = (i4_y * i4_out_stride);
757 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), res_8x16b_l);
758 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), res_8x16b_h);
759 continue;
760 }
761
762 res_8x16b_r0_0 = prev_res_8x16b_r0_0;
763 res_8x16b_r1_0 = prev_res_8x16b_r1_0;
764 res_8x16b_r0_1 = prev_res_8x16b_r0_1;
765 res_8x16b_r1_1 = prev_res_8x16b_r1_1;
766
767 u1_y_incr_8x16b_r0_0 = u1_prev_y_incr_8x16b_r0_0;
768 u1_y_incr_8x16b_r0_1 = u1_prev_y_incr_8x16b_r0_1;
769 }
770 else
771 {
772 pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
773 pu1_ref_x_ptr_incr_temp =
774 pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
775 ref_arr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pi2_ref_array_temp));
776 ref_arr_8x16b_r1_0 =
777 _mm_loadu_si128((__m128i *) (pi2_ref_array_temp + i4_refarray_wd));
778 ref_arr_8x16b_r0_1 =
779 _mm_loadu_si128((__m128i *) (pi2_ref_array_temp + strt_indx_h));
780 ref_arr_8x16b_r1_1 = _mm_loadu_si128(
781 (__m128i *) (pi2_ref_array_temp + i4_refarray_wd + strt_indx_h));
782
783 zero_r0_0 = _mm_test_all_ones(_mm_cmpeq_epi16(
784 ref_arr_8x16b_r0_0, zero_8x16b)); // return 1 if all zeros, else 0
785 zero_r1_0 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r1_0, zero_8x16b));
786 zero_r0_1 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r0_1, zero_8x16b));
787 zero_r1_1 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r1_1, zero_8x16b));
788
789 zero_r0_r1 = zero_r0_0 && zero_r1_0 && zero_r0_1 && zero_r1_1;
790
791 if(!zero_r0_r1)
792 {
793 u1_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp));
794 u1_incr_8x16b_r1_0 =
795 _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp + i4_refarray_wd));
796
797 u1_incr_8x16b_r0_0 = _mm_shuffle_epi8(u1_incr_8x16b_r0_0, x_ref_pos_mask_r0);
798 u1_incr_8x16b_r1_0 = _mm_shuffle_epi8(u1_incr_8x16b_r1_0, x_ref_pos_mask_r0);
799
800 u1_incr_not_8x16b_r0_0 =
801 _mm_andnot_si128(u1_incr_8x16b_r0_0, phs_mask_div8_8x16b_0);
802 u1_incr_not_8x16b_r1_0 =
803 _mm_andnot_si128(u1_incr_8x16b_r1_0, phs_mask_div8_8x16b_0);
804
805 u1_incr_not_8x16b_r0_0 =
806 _mm_add_epi8(u1_incr_not_8x16b_r0_0, x_ref_pos_mask_r0);
807 u1_incr_not_8x16b_r1_0 =
808 _mm_add_epi8(u1_incr_not_8x16b_r1_0, x_ref_pos_mask_r0);
809
810 x_ref_pos_mask_temp_r0_0 =
811 _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_8x16b_r0_0);
812 x_ref_pos_mask_temp_r1_0 =
813 _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_8x16b_r1_0);
814
815 /* _mm_slli_epi8(u1_incr_not_8x16b_r0_0, 1)*/
816 u1_incr_not_8x16b_r0_even =
817 _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
818 u1_incr_not_8x16b_r1_even =
819 _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
820 x_ref_pos_mask_temp_r0_even =
821 _mm_add_epi8(x_ref_pos_mask_temp_r0_0, x_ref_pos_mask_temp_r0_0);
822 x_ref_pos_mask_temp_r1_even =
823 _mm_add_epi8(x_ref_pos_mask_temp_r1_0, x_ref_pos_mask_temp_r1_0);
824
825 u1_incr_not_8x16b_r0_odd = _mm_add_epi8(u1_incr_not_8x16b_r0_even, const_ones);
826 u1_incr_not_8x16b_r1_odd = _mm_add_epi8(u1_incr_not_8x16b_r1_even, const_ones);
827 x_ref_pos_mask_temp_r0_odd =
828 _mm_add_epi8(x_ref_pos_mask_temp_r0_even, const_ones);
829 x_ref_pos_mask_temp_r1_odd =
830 _mm_add_epi8(x_ref_pos_mask_temp_r1_even, const_ones);
831
832 u1_incr_not_8x16b_r0_0 =
833 _mm_unpacklo_epi8(u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r0_odd);
834 u1_incr_not_8x16b_r1_0 =
835 _mm_unpacklo_epi8(u1_incr_not_8x16b_r1_even, u1_incr_not_8x16b_r1_odd);
836 x_ref_pos_mask_temp_r0_0 =
837 _mm_unpacklo_epi8(x_ref_pos_mask_temp_r0_even, x_ref_pos_mask_temp_r0_odd);
838 x_ref_pos_mask_temp_r1_0 =
839 _mm_unpacklo_epi8(x_ref_pos_mask_temp_r1_even, x_ref_pos_mask_temp_r1_odd);
840
841 u1_incr_not_8x16b_r0_1 =
842 _mm_unpackhi_epi8(u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r0_odd);
843 u1_incr_not_8x16b_r1_1 =
844 _mm_unpackhi_epi8(u1_incr_not_8x16b_r1_even, u1_incr_not_8x16b_r1_odd);
845 x_ref_pos_mask_temp_r0_1 =
846 _mm_unpackhi_epi8(x_ref_pos_mask_temp_r0_even, x_ref_pos_mask_temp_r0_odd);
847 x_ref_pos_mask_temp_r1_1 =
848 _mm_unpackhi_epi8(x_ref_pos_mask_temp_r1_even, x_ref_pos_mask_temp_r1_odd);
849
850 u1_incr_not_8x16b_r0_1 = _mm_sub_epi8(u1_incr_not_8x16b_r0_1, mid_indx_16x8b);
851 u1_incr_not_8x16b_r1_1 = _mm_sub_epi8(u1_incr_not_8x16b_r1_1, mid_indx_16x8b);
852 x_ref_pos_mask_temp_r0_1 =
853 _mm_sub_epi8(x_ref_pos_mask_temp_r0_1, mid_indx_16x8b);
854 x_ref_pos_mask_temp_r1_1 =
855 _mm_sub_epi8(x_ref_pos_mask_temp_r1_1, mid_indx_16x8b);
856
857 ref_arr_temp0_8x16b_r0_0 =
858 _mm_shuffle_epi8(ref_arr_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
859 ref_arr_temp0_8x16b_r1_0 =
860 _mm_shuffle_epi8(ref_arr_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
861 ref_arr_temp1_8x16b_r0_0 =
862 _mm_shuffle_epi8(ref_arr_8x16b_r0_0, x_ref_pos_mask_temp_r0_0);
863 ref_arr_temp1_8x16b_r1_0 =
864 _mm_shuffle_epi8(ref_arr_8x16b_r1_0, x_ref_pos_mask_temp_r1_0);
865 ref_arr_temp0_8x16b_r0_1 =
866 _mm_shuffle_epi8(ref_arr_8x16b_r0_1, u1_incr_not_8x16b_r0_1);
867 ref_arr_temp0_8x16b_r1_1 =
868 _mm_shuffle_epi8(ref_arr_8x16b_r1_1, u1_incr_not_8x16b_r1_1);
869 ref_arr_temp1_8x16b_r0_1 =
870 _mm_shuffle_epi8(ref_arr_8x16b_r0_1, x_ref_pos_mask_temp_r0_1);
871 ref_arr_temp1_8x16b_r1_1 =
872 _mm_shuffle_epi8(ref_arr_8x16b_r1_1, x_ref_pos_mask_temp_r1_1);
873
874 res0_8x16b_r0_0 =
875 _mm_mullo_epi16(ref_arr_temp0_8x16b_r0_0, phs_mask_16min_8x16b_0);
876 res0_8x16b_r1_0 =
877 _mm_mullo_epi16(ref_arr_temp0_8x16b_r1_0, phs_mask_16min_8x16b_0);
878 res1_8x16b_r0_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r0_0, phs_mask_8x16b_0);
879 res1_8x16b_r1_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r1_0, phs_mask_8x16b_0);
880 res0_8x16b_r0_1 =
881 _mm_mullo_epi16(ref_arr_temp0_8x16b_r0_1, phs_mask_16min_8x16b_1);
882 res0_8x16b_r1_1 =
883 _mm_mullo_epi16(ref_arr_temp0_8x16b_r1_1, phs_mask_16min_8x16b_1);
884 res1_8x16b_r0_1 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r0_1, phs_mask_8x16b_1);
885 res1_8x16b_r1_1 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r1_1, phs_mask_8x16b_1);
886
887 res_8x16b_r0_0 = _mm_add_epi16(res0_8x16b_r0_0, res1_8x16b_r0_0);
888 res_8x16b_r1_0 = _mm_add_epi16(res0_8x16b_r1_0, res1_8x16b_r1_0);
889 res_8x16b_r0_1 = _mm_add_epi16(res0_8x16b_r0_1, res1_8x16b_r0_1);
890 res_8x16b_r1_1 = _mm_add_epi16(res0_8x16b_r1_1, res1_8x16b_r1_1);
891
892 prev_res_8x16b_r0_0 = res_8x16b_r0_0;
893 prev_res_8x16b_r1_0 = res_8x16b_r1_0;
894 prev_res_8x16b_r0_1 = res_8x16b_r0_1;
895 prev_res_8x16b_r1_1 = res_8x16b_r1_1;
896
897 pu1_ref_y_ptr_incr_temp =
898 pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
899 u1_y_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_y_ptr_incr_temp));
900
901 u1_y_incr_8x16b_r0_0 =
902 _mm_shuffle_epi8(u1_y_incr_8x16b_r0_0, x_ref_rnd_mask_r0_0);
903
904 u1_y_incr_8x16b_r0_low = _mm_cvtepi8_epi16(u1_y_incr_8x16b_r0_0);
905 u1_y_incr_8x16b_r0_high =
906 _mm_cvtepi8_epi16(_mm_unpackhi_epi64(u1_y_incr_8x16b_r0_0, const_ones));
907
908 u1_y_incr_8x16b_r0_0 =
909 _mm_cmpeq_epi16(u1_y_incr_8x16b_r0_low, const_ones_8x16b);
910 u1_y_incr_8x16b_r0_1 =
911 _mm_cmpeq_epi16(u1_y_incr_8x16b_r0_high, const_ones_8x16b);
912
913 u1_prev_y_incr_8x16b_r0_0 = u1_y_incr_8x16b_r0_0;
914 u1_prev_y_incr_8x16b_r0_1 = u1_y_incr_8x16b_r0_1;
915 }
916 }
917
918 if(zero_r0_r1)
919 {
920 res_8x16b_l = _mm_set1_epi16(0);
921 res_8x16b_h = _mm_set1_epi16(0);
922 }
923 else
924 {
925 i4_y_phase = pi1_y_phase[i4_y];
926
927 if((i4_y_phase) >> 3)
928 {
929 vert_res0_8x16b_r0_0 =
930 _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
931 vert_res1_8x16b_r0_0 =
932 _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
933 vert_res0_8x16b_r0_1 =
934 _mm_blendv_epi8(res_8x16b_r1_1, res_8x16b_r0_1, u1_y_incr_8x16b_r0_1);
935 vert_res1_8x16b_r0_1 =
936 _mm_blendv_epi8(res_8x16b_r1_1, res_8x16b_r1_1, u1_y_incr_8x16b_r0_1);
937 }
938 else
939 {
940 vert_res0_8x16b_r0_0 =
941 _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
942 vert_res1_8x16b_r0_0 =
943 _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
944 vert_res0_8x16b_r0_1 =
945 _mm_blendv_epi8(res_8x16b_r0_1, res_8x16b_r0_1, u1_y_incr_8x16b_r0_1);
946 vert_res1_8x16b_r0_1 =
947 _mm_blendv_epi8(res_8x16b_r0_1, res_8x16b_r1_1, u1_y_incr_8x16b_r0_1);
948 }
949 res0_8x16b_r0_0 = _mm_unpacklo_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
950 res1_8x16b_r0_0 = _mm_unpackhi_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
951 res0_8x16b_r0_1 = _mm_unpacklo_epi16(vert_res0_8x16b_r0_1, vert_res1_8x16b_r0_1);
952 res1_8x16b_r0_1 = _mm_unpackhi_epi16(vert_res0_8x16b_r0_1, vert_res1_8x16b_r0_1);
953
954 phs_y_mask_16min_8x16b = _mm_set1_epi16(16 - i4_y_phase);
955 phs_y_mask_8x16b = _mm_set1_epi16(i4_y_phase);
956 phs_y_mask_mix_8x16b = _mm_unpacklo_epi16(phs_y_mask_16min_8x16b, phs_y_mask_8x16b);
957
958 res_4x32b_l_0 = _mm_madd_epi16(res0_8x16b_r0_0, phs_y_mask_mix_8x16b);
959 res_4x32b_l_1 = _mm_madd_epi16(res1_8x16b_r0_0, phs_y_mask_mix_8x16b);
960 res_4x32b_h_0 = _mm_madd_epi16(res0_8x16b_r0_1, phs_y_mask_mix_8x16b);
961 res_4x32b_h_1 = _mm_madd_epi16(res1_8x16b_r0_1, phs_y_mask_mix_8x16b);
962
963 res_4x32b_l_0 = _mm_add_epi32(res_4x32b_l_0, const_128);
964 res_4x32b_l_1 = _mm_add_epi32(res_4x32b_l_1, const_128);
965 res_4x32b_h_0 = _mm_add_epi32(res_4x32b_h_0, const_128);
966 res_4x32b_h_1 = _mm_add_epi32(res_4x32b_h_1, const_128);
967
968 res_4x32b_l_0 = _mm_srai_epi32(res_4x32b_l_0, 8);
969 res_4x32b_l_1 = _mm_srai_epi32(res_4x32b_l_1, 8);
970 res_4x32b_h_0 = _mm_srai_epi32(res_4x32b_h_0, 8);
971 res_4x32b_h_1 = _mm_srai_epi32(res_4x32b_h_1, 8);
972 res_8x16b_l = _mm_packs_epi32(res_4x32b_l_0, res_4x32b_l_1);
973 res_8x16b_h = _mm_packs_epi32(res_4x32b_h_0, res_4x32b_h_1);
974 }
975
976 out_stride_temp = (i4_y * i4_out_stride);
977 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), res_8x16b_l);
978 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), res_8x16b_h);
979 }
980 }
981 else
982 {
983 __m128i const_16_8x16b, const_128, const_ones, const_ones_8x16b;
984 __m128i ref_arr_8x16b_r0_0;
985 __m128i ref_arr_8x16b_r1_0;
986 __m128i phs_mask_8x16b_0, phs_mask_div8_8x16b_0, phs_mask_16min_8x16b_0;
987 __m128i x_ref_pos_mask_r0, x_ref_rnd_mask_r0_0;
988 __m128i x_ref_pos_mask_temp_r0_0;
989 __m128i x_ref_pos_mask_temp_r1_0;
990
991 __m128i u1_incr_8x16b_r0_0, ref_arr_temp0_8x16b_r0_0, res0_8x16b_r0_0,
992 u1_incr_not_8x16b_r0_0;
993 __m128i u1_incr_8x16b_r1_0, ref_arr_temp1_8x16b_r0_0, res1_8x16b_r0_0;
994 __m128i u1_y_incr_8x16b_r0_0;
995
996 __m128i u1_incr_not_8x16b_r0_odd, u1_incr_not_8x16b_r1_odd, x_ref_pos_mask_temp_r0_odd,
997 x_ref_pos_mask_temp_r1_odd;
998 __m128i u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r1_even, x_ref_pos_mask_temp_r0_even,
999 x_ref_pos_mask_temp_r1_even;
1000
1001 __m128i ref_arr_temp0_8x16b_r1_0, res_8x16b_r0_0, res0_8x16b_r1_0, u1_incr_not_8x16b_r1_0;
1002 __m128i ref_arr_temp1_8x16b_r1_0, res_8x16b_r1_0, res1_8x16b_r1_0;
1003 __m128i u1_prev_y_incr_8x16b_r0_0;
1004 __m128i prev_res_8x16b_r0_0;
1005 __m128i prev_res_8x16b_r1_0;
1006
1007 __m128i vert_res0_8x16b_r0_0, res_4x32b_l_0, out_4x32b_l;
1008 __m128i vert_res1_8x16b_r0_0, res_4x32b_l_1, out_4x32b_h;
1009 __m128i phs_y_mask_16min_8x16b, phs_y_mask_8x16b, phs_y_mask_mix_8x16b;
1010 __m128i chroma_mask, chroma_mask2;
1011 __m128i zero_8x16b = _mm_set1_epi16(0);
1012 WORD32 zero_r0_0, zero_r1_0, zero_r0_r1 = 0;
1013 WORD16 *pi2_ref_array_temp;
1014 UWORD8 *pu1_ref_x_ptr_incr_temp, *pu1_ref_y_ptr_incr_temp;
1015 WORD32 i4_y_phase;
1016 WORD32 out_stride_temp;
1017 const_ones = _mm_set1_epi8(1);
1018 const_ones_8x16b = _mm_set1_epi16(1);
1019 const_128 = _mm_set1_epi32(128);
1020
1021 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1022 {
1023 arr_y_phase[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
1024 arr_y_ref_pos[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
1025 }
1026 pi1_y_ref_pos = arr_y_ref_pos;
1027 pi1_y_phase = arr_y_phase;
1028
1029 for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
1030 {
1031 arr_x_ref_pos[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
1032 arr_x_phase[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
1033 }
1034
1035 pi1_x_ref_pos = arr_x_ref_pos;
1036 pi1_x_phase = arr_x_phase;
1037
1038 phs_mask_8x16b_0 = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i *) (pi1_x_phase)));
1039 x_ref_pos_mask_r0 = _mm_loadu_si128((__m128i *) (pi1_x_ref_pos));
1040
1041 const_16_8x16b = _mm_set1_epi16(16);
1042 chroma_mask = _mm_set1_epi32(0xFFFF0000);
1043 chroma_mask2 = _mm_set1_epi32(0x0000FFFF);
1044 phs_mask_div8_8x16b_0 = _mm_srli_epi16(phs_mask_8x16b_0, 3);
1045 phs_mask_div8_8x16b_0 = _mm_packs_epi16(phs_mask_div8_8x16b_0, const_ones);
1046
1047 phs_mask_16min_8x16b_0 = _mm_sub_epi16(const_16_8x16b, phs_mask_8x16b_0);
1048 x_ref_rnd_mask_r0_0 = _mm_add_epi8(x_ref_pos_mask_r0, phs_mask_div8_8x16b_0);
1049
1050 for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
1051 {
1052 if((i4_y > 0) && (pi1_y_ref_pos[i4_y] == pi1_y_ref_pos[i4_y - 1]))
1053 {
1054 if(zero_r0_r1)
1055 {
1056 res_4x32b_l_0 = _mm_set1_epi32(0);
1057 res_4x32b_l_1 = _mm_set1_epi32(0);
1058 out_stride_temp = (i4_y * i4_out_stride);
1059
1060 out_4x32b_l = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp));
1061 out_4x32b_h = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp + 8));
1062
1063 out_4x32b_l = _mm_and_si128(out_4x32b_l, chroma_mask);
1064 out_4x32b_h = _mm_and_si128(out_4x32b_h, chroma_mask);
1065
1066 res_4x32b_l_0 = _mm_and_si128(res_4x32b_l_0, chroma_mask2);
1067 res_4x32b_l_1 = _mm_and_si128(res_4x32b_l_1, chroma_mask2);
1068
1069 out_4x32b_l = _mm_add_epi8(res_4x32b_l_0, out_4x32b_l);
1070 out_4x32b_h = _mm_add_epi8(res_4x32b_l_1, out_4x32b_h);
1071
1072 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), out_4x32b_l);
1073 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), out_4x32b_h);
1074 continue;
1075 }
1076
1077 res_8x16b_r0_0 = prev_res_8x16b_r0_0;
1078 res_8x16b_r1_0 = prev_res_8x16b_r1_0;
1079
1080 u1_y_incr_8x16b_r0_0 = u1_prev_y_incr_8x16b_r0_0;
1081 }
1082 else
1083 {
1084 pi2_ref_array_temp = pi2_ref_array + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1085 pu1_ref_x_ptr_incr_temp =
1086 pu1_ref_x_ptr_incr + ((pi1_y_ref_pos[i4_y]) * i4_refarray_wd);
1087 ref_arr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pi2_ref_array_temp));
1088 ref_arr_8x16b_r1_0 =
1089 _mm_loadu_si128((__m128i *) (pi2_ref_array_temp + i4_refarray_wd));
1090
1091 zero_r0_0 = _mm_test_all_ones(_mm_cmpeq_epi16(
1092 ref_arr_8x16b_r0_0, zero_8x16b)); // return 1 if all zeros, else 0
1093 zero_r1_0 = _mm_test_all_ones(_mm_cmpeq_epi16(ref_arr_8x16b_r1_0, zero_8x16b));
1094
1095 zero_r0_r1 = zero_r0_0 && zero_r1_0;
1096
1097 if(!zero_r0_r1)
1098 {
1099 u1_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp));
1100 u1_incr_8x16b_r1_0 =
1101 _mm_loadu_si128((__m128i *) (pu1_ref_x_ptr_incr_temp + i4_refarray_wd));
1102
1103 u1_incr_8x16b_r0_0 = _mm_shuffle_epi8(u1_incr_8x16b_r0_0, x_ref_pos_mask_r0);
1104 u1_incr_8x16b_r1_0 = _mm_shuffle_epi8(u1_incr_8x16b_r1_0, x_ref_pos_mask_r0);
1105
1106 u1_incr_not_8x16b_r0_0 =
1107 _mm_andnot_si128(u1_incr_8x16b_r0_0, phs_mask_div8_8x16b_0);
1108 u1_incr_not_8x16b_r1_0 =
1109 _mm_andnot_si128(u1_incr_8x16b_r1_0, phs_mask_div8_8x16b_0);
1110
1111 u1_incr_not_8x16b_r0_0 =
1112 _mm_add_epi8(u1_incr_not_8x16b_r0_0, x_ref_pos_mask_r0);
1113 u1_incr_not_8x16b_r1_0 =
1114 _mm_add_epi8(u1_incr_not_8x16b_r1_0, x_ref_pos_mask_r0);
1115
1116 x_ref_pos_mask_temp_r0_0 =
1117 _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_8x16b_r0_0);
1118 x_ref_pos_mask_temp_r1_0 =
1119 _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_8x16b_r1_0);
1120
1121 u1_incr_not_8x16b_r0_even =
1122 _mm_add_epi8(u1_incr_not_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
1123 u1_incr_not_8x16b_r1_even =
1124 _mm_add_epi8(u1_incr_not_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
1125 x_ref_pos_mask_temp_r0_even =
1126 _mm_add_epi8(x_ref_pos_mask_temp_r0_0, x_ref_pos_mask_temp_r0_0);
1127 x_ref_pos_mask_temp_r1_even =
1128 _mm_add_epi8(x_ref_pos_mask_temp_r1_0, x_ref_pos_mask_temp_r1_0);
1129
1130 u1_incr_not_8x16b_r0_odd = _mm_add_epi8(u1_incr_not_8x16b_r0_even, const_ones);
1131 u1_incr_not_8x16b_r1_odd = _mm_add_epi8(u1_incr_not_8x16b_r1_even, const_ones);
1132 x_ref_pos_mask_temp_r0_odd =
1133 _mm_add_epi8(x_ref_pos_mask_temp_r0_even, const_ones);
1134 x_ref_pos_mask_temp_r1_odd =
1135 _mm_add_epi8(x_ref_pos_mask_temp_r1_even, const_ones);
1136
1137 u1_incr_not_8x16b_r0_0 =
1138 _mm_unpacklo_epi8(u1_incr_not_8x16b_r0_even, u1_incr_not_8x16b_r0_odd);
1139 u1_incr_not_8x16b_r1_0 =
1140 _mm_unpacklo_epi8(u1_incr_not_8x16b_r1_even, u1_incr_not_8x16b_r1_odd);
1141 x_ref_pos_mask_temp_r0_0 =
1142 _mm_unpacklo_epi8(x_ref_pos_mask_temp_r0_even, x_ref_pos_mask_temp_r0_odd);
1143 x_ref_pos_mask_temp_r1_0 =
1144 _mm_unpacklo_epi8(x_ref_pos_mask_temp_r1_even, x_ref_pos_mask_temp_r1_odd);
1145
1146 ref_arr_temp0_8x16b_r0_0 =
1147 _mm_shuffle_epi8(ref_arr_8x16b_r0_0, u1_incr_not_8x16b_r0_0);
1148 ref_arr_temp0_8x16b_r1_0 =
1149 _mm_shuffle_epi8(ref_arr_8x16b_r1_0, u1_incr_not_8x16b_r1_0);
1150 ref_arr_temp1_8x16b_r0_0 =
1151 _mm_shuffle_epi8(ref_arr_8x16b_r0_0, x_ref_pos_mask_temp_r0_0);
1152 ref_arr_temp1_8x16b_r1_0 =
1153 _mm_shuffle_epi8(ref_arr_8x16b_r1_0, x_ref_pos_mask_temp_r1_0);
1154
1155 res0_8x16b_r0_0 =
1156 _mm_mullo_epi16(ref_arr_temp0_8x16b_r0_0, phs_mask_16min_8x16b_0);
1157 res0_8x16b_r1_0 =
1158 _mm_mullo_epi16(ref_arr_temp0_8x16b_r1_0, phs_mask_16min_8x16b_0);
1159 res1_8x16b_r0_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r0_0, phs_mask_8x16b_0);
1160 res1_8x16b_r1_0 = _mm_mullo_epi16(ref_arr_temp1_8x16b_r1_0, phs_mask_8x16b_0);
1161
1162 res_8x16b_r0_0 = _mm_add_epi16(res0_8x16b_r0_0, res1_8x16b_r0_0);
1163 res_8x16b_r1_0 = _mm_add_epi16(res0_8x16b_r1_0, res1_8x16b_r1_0);
1164
1165 pu1_ref_y_ptr_incr_temp =
1166 pu1_ref_y_ptr_incr + (pi1_y_ref_pos[i4_y] * i4_refarray_wd);
1167 u1_y_incr_8x16b_r0_0 = _mm_loadu_si128((__m128i *) (pu1_ref_y_ptr_incr_temp));
1168
1169 u1_y_incr_8x16b_r0_0 =
1170 _mm_shuffle_epi8(u1_y_incr_8x16b_r0_0, x_ref_rnd_mask_r0_0);
1171
1172 u1_y_incr_8x16b_r0_0 = _mm_cvtepi8_epi16(u1_y_incr_8x16b_r0_0);
1173 u1_y_incr_8x16b_r0_0 = _mm_cmpeq_epi16(u1_y_incr_8x16b_r0_0, const_ones_8x16b);
1174 u1_prev_y_incr_8x16b_r0_0 = u1_y_incr_8x16b_r0_0;
1175
1176 prev_res_8x16b_r0_0 = res_8x16b_r0_0;
1177 prev_res_8x16b_r1_0 = res_8x16b_r1_0;
1178 }
1179 }
1180
1181 if(zero_r0_r1)
1182 {
1183 res_4x32b_l_0 = _mm_set1_epi32(0);
1184 res_4x32b_l_1 = _mm_set1_epi32(0);
1185 }
1186 else
1187 {
1188 i4_y_phase = pi1_y_phase[i4_y];
1189
1190 if((i4_y_phase) >> 3)
1191 {
1192 vert_res0_8x16b_r0_0 =
1193 _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
1194 vert_res1_8x16b_r0_0 =
1195 _mm_blendv_epi8(res_8x16b_r1_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
1196 }
1197 else
1198 {
1199 vert_res0_8x16b_r0_0 =
1200 _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r0_0, u1_y_incr_8x16b_r0_0);
1201 vert_res1_8x16b_r0_0 =
1202 _mm_blendv_epi8(res_8x16b_r0_0, res_8x16b_r1_0, u1_y_incr_8x16b_r0_0);
1203 }
1204
1205 res0_8x16b_r0_0 = _mm_unpacklo_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
1206 res1_8x16b_r0_0 = _mm_unpackhi_epi16(vert_res0_8x16b_r0_0, vert_res1_8x16b_r0_0);
1207
1208 phs_y_mask_16min_8x16b = _mm_set1_epi16(16 - i4_y_phase);
1209 phs_y_mask_8x16b = _mm_set1_epi16(i4_y_phase);
1210 phs_y_mask_mix_8x16b = _mm_unpacklo_epi16(phs_y_mask_16min_8x16b, phs_y_mask_8x16b);
1211
1212 res_4x32b_l_0 = _mm_madd_epi16(res0_8x16b_r0_0, phs_y_mask_mix_8x16b);
1213 res_4x32b_l_1 = _mm_madd_epi16(res1_8x16b_r0_0, phs_y_mask_mix_8x16b);
1214 res_4x32b_l_0 = _mm_add_epi32(res_4x32b_l_0, const_128);
1215 res_4x32b_l_1 = _mm_add_epi32(res_4x32b_l_1, const_128);
1216
1217 res_4x32b_l_0 = _mm_srai_epi32(res_4x32b_l_0, 8);
1218 res_4x32b_l_1 = _mm_srai_epi32(res_4x32b_l_1, 8);
1219 }
1220 out_stride_temp = (i4_y * i4_out_stride);
1221
1222 out_4x32b_l = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp));
1223 out_4x32b_h = _mm_loadu_si128((__m128i *) (pi2_out + out_stride_temp + 8));
1224
1225 out_4x32b_l = _mm_and_si128(out_4x32b_l, chroma_mask);
1226 out_4x32b_h = _mm_and_si128(out_4x32b_h, chroma_mask);
1227
1228 res_4x32b_l_0 = _mm_and_si128(res_4x32b_l_0, chroma_mask2);
1229 res_4x32b_l_1 = _mm_and_si128(res_4x32b_l_1, chroma_mask2);
1230
1231 out_4x32b_l = _mm_add_epi8(res_4x32b_l_0, out_4x32b_l);
1232 out_4x32b_h = _mm_add_epi8(res_4x32b_l_1, out_4x32b_h);
1233
1234 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp), out_4x32b_l);
1235 _mm_storeu_si128((__m128i *) (pi2_out + out_stride_temp + 8), out_4x32b_h);
1236 }
1237 }
1238 return;
1239 } /* End of Interpolation Function */
1240
1241 /*****************************************************************************/
1242 /* */
1243 /* Function Name : isvcd_residual_reflayer_const_non_boundary_mb_sse42 */
1244 /* */
1245 /* Description : */
1246 /* */
1247 /* Inputs : */
1248 /* Globals : none */
1249 /* Processing : */
1250 /* */
1251 /* Outputs : none */
1252 /* Returns : none */
1253 /* */
1254 /* Issues : none */
1255 /* */
1256 /* Revision History: */
1257 /* */
1258 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1259 /* 25 11 2021 Kishore creation */
1260 /* */
1261 /*****************************************************************************/
1262
void isvcd_residual_reflayer_const_non_boundary_mb_sse42(
    WORD16 *pi2_inp_data, WORD32 i4_inp_data_stride, WORD16 *pi2_ref_array, WORD32 i4_refarray_wd,
    WORD32 i4_refarray_ht, WORD32 i4_ref_mb_type_q0, WORD32 i4_ref_mb_type_q1,
    WORD32 i4_ref_mb_type_q2, WORD32 i4_ref_mb_type_q3, WORD32 i4_mb_quard1_part_x,
    WORD32 i4_mb_quard1_part_y, WORD32 i4_chroma_flag)
{
    /* Copies residual samples of a non-boundary MB from the input buffer into */
    /* the reference array, zeroing out samples whose reference-layer quadrant */
    /* MB type is not 1 (1 appears to denote the inter/residual-carrying type  */
    /* -- confirm against caller). The MB window can straddle up to four       */
    /* reference MBs (quadrants q0..q3); (i4_mb_quard1_part_x, _y) give the    */
    /* sample position where quadrant 1 (horizontally) / quadrants 2,3         */
    /* (vertically) begin. Chroma input is CbCr-interleaved; one component is  */
    /* de-interleaved before storing.                                          */
    WORD32 i4_y;

    WORD16 *pi2_ref_data_byte;
    WORD16 *pi2_ref_array_temp;
    if(i4_chroma_flag == 0)
    {
        /* Byte indices 0..15 within a 16-byte vector; compared against        */
        /* (sample_pos << 1) because each residual sample occupies 2 bytes     */
        WORD8 index_0[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
        __m128i ref_mb_type_8x16_q0, ref_mb_type_8x16_q1, ref_mb_type_8x16_q2, ref_mb_type_8x16_q3,
            mb_quard1_part_x_8x16;
        __m128i ref_mb_type_8x16_0, ref_mb_type_8x16_1;
        __m128i ref_mb_type_8x16_low_0, ref_mb_type_8x16_low_1;
        /* All-ones default = "keep every sample" for paths that never refine  */
        /* the masks (single-quadrant rows)                                    */
        __m128i mb_type_mask_8x16_0 = _mm_set1_epi8(-1);
        __m128i mb_type_mask_8x16_1 = _mm_set1_epi8(-1);
        __m128i mb_type_mask_8x16_low_0 = _mm_set1_epi8(-1);
        __m128i mb_type_mask_8x16_low_1 = _mm_set1_epi8(-1);
        __m128i mask_8x16_0;
        __m128i index_arr_0;
        __m128i inp_data_16x8_0, inp_data_16x8_1;
        __m128i res_16x8_0, res_16x8_1;
        __m128i one_8x16 = _mm_set1_epi8(1);
        __m128i zero_8x16 = _mm_set1_epi8(0);

        index_arr_0 = _mm_loadu_si128((__m128i *) index_0);
        ref_mb_type_8x16_q0 = _mm_set1_epi8(i4_ref_mb_type_q0);
        ref_mb_type_8x16_q1 = _mm_set1_epi8(i4_ref_mb_type_q1);
        ref_mb_type_8x16_q2 = _mm_set1_epi8(i4_ref_mb_type_q2);
        ref_mb_type_8x16_q3 = _mm_set1_epi8(i4_ref_mb_type_q3);
        if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
        {
            /* Quad 0 only: a single MB type covers the whole window */
            ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
            ref_mb_type_8x16_1 = ref_mb_type_8x16_q0;
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
            mb_type_mask_8x16_1 = mb_type_mask_8x16_0;
        }
        else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
                (i4_mb_quard1_part_x < i4_refarray_wd))
        {
            /* Quads 0 & 1: horizontal split only */
            if(i4_mb_quard1_part_x == 8)
            {
                /* Split exactly at the vector boundary: no blending needed */
                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;
            }
            else if(i4_mb_quard1_part_x < 8)
            {
                /* Split falls inside the first 8 samples */
                mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
                mask_8x16_0 =
                    _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16); // 0xFF where a < b, else 0

                ref_mb_type_8x16_0 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
                ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;
            }
            else
            {
                /* Split falls inside the second 8 samples */
                mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x - 8) << 1);
                mask_8x16_0 =
                    _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16); // 0xFF where a < b, else 0

                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_1 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
            }

            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
            mb_type_mask_8x16_1 = _mm_cmpeq_epi8(ref_mb_type_8x16_1, one_8x16);
        }
        else
        {
            /* Vertical split present: rows below i4_mb_quard1_part_y use the  */
            /* "low" (quad 2/3) types                                          */
            if(i4_mb_quard1_part_x >= i4_refarray_wd)
            {
                /* Quads 0 & 2: no horizontal split */
                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_1 = ref_mb_type_8x16_q0;

                ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;
                ref_mb_type_8x16_low_1 = ref_mb_type_8x16_q2;
            }
            else
            {
                /* Quads 0, 1, 2, 3 */
                if(i4_mb_quard1_part_x == 8)
                {
                    ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                    ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;

                    ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;
                    ref_mb_type_8x16_low_1 = ref_mb_type_8x16_q3;
                }
                else if(i4_mb_quard1_part_x < 8)
                {
                    mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
                    mask_8x16_0 = _mm_cmplt_epi8(index_arr_0,
                                                 mb_quard1_part_x_8x16); // 0xFF where a < b, else 0

                    ref_mb_type_8x16_0 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
                    ref_mb_type_8x16_1 = ref_mb_type_8x16_q1;

                    ref_mb_type_8x16_low_0 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q3, ref_mb_type_8x16_q2, mask_8x16_0);
                    ref_mb_type_8x16_low_1 = ref_mb_type_8x16_q3;
                }
                else
                {
                    mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x - 8) << 1);
                    mask_8x16_0 = _mm_cmplt_epi8(index_arr_0,
                                                 mb_quard1_part_x_8x16); // 0xFF where a < b, else 0

                    ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                    ref_mb_type_8x16_1 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);

                    ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;
                    ref_mb_type_8x16_low_1 =
                        _mm_blendv_epi8(ref_mb_type_8x16_q3, ref_mb_type_8x16_q2, mask_8x16_0);
                }
            }
            /* BUG FIX: compute the masks for BOTH sub-branches above. They    */
            /* previously sat inside the inner else only, so on the Quad 0 & 2 */
            /* path the low masks were read uninitialized by the row loop and  */
            /* the upper masks silently kept their all-ones default.           */
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
            mb_type_mask_8x16_1 = _mm_cmpeq_epi8(ref_mb_type_8x16_1, one_8x16);

            mb_type_mask_8x16_low_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_0, one_8x16);
            mb_type_mask_8x16_low_1 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_1, one_8x16);
        }

        if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
        {
            /* Rows split vertically between upper (quad 0/1) and lower        */
            /* (quad 2/3) masks                                                */
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                if(i4_y < i4_mb_quard1_part_y)
                {
                    res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_0, mb_type_mask_8x16_0);
                    res_16x8_1 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_1, mb_type_mask_8x16_1);
                }
                else
                {
                    res_16x8_0 =
                        _mm_blendv_epi8(zero_8x16, inp_data_16x8_0, mb_type_mask_8x16_low_0);
                    res_16x8_1 =
                        _mm_blendv_epi8(zero_8x16, inp_data_16x8_1, mb_type_mask_8x16_low_1);
                }

                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp + 8), res_16x8_1);
            }
        }
        else
        {
            /* No vertical split: every row uses the upper masks */
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_0, mb_type_mask_8x16_0);
                res_16x8_1 = _mm_blendv_epi8(zero_8x16, inp_data_16x8_1, mb_type_mask_8x16_1);

                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp + 8), res_16x8_1);
            }
        }
    }
    else
    {
        WORD8 index_0[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
        /* Shuffle that gathers even-indexed 16-bit samples (one chroma        */
        /* component of the CbCr-interleaved input) into the low 8 bytes       */
        WORD8 even_mask_arr[16] = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15};
        __m128i ref_mb_type_8x16_q0, ref_mb_type_8x16_q1, ref_mb_type_8x16_q2, ref_mb_type_8x16_q3,
            mb_quard1_part_x_8x16;
        __m128i ref_mb_type_8x16_0;
        __m128i ref_mb_type_8x16_low_0;
        /* All-ones defaults = "keep every sample" for single-quadrant paths   */
        __m128i mb_type_mask_8x16_0 = _mm_set1_epi8(-1);
        __m128i mb_type_mask_8x16_low_0 = _mm_set1_epi8(-1);
        __m128i mask_8x16_0;
        __m128i index_arr_0, even_mask;
        __m128i inp_data_16x8_0, inp_data_16x8_1, inp_data_16x8;
        __m128i res_16x8_0;
        __m128i one_8x16 = _mm_set1_epi8(1);
        __m128i zero_8x16 = _mm_set1_epi8(0);

        index_arr_0 = _mm_loadu_si128((__m128i *) index_0);
        even_mask = _mm_loadu_si128((__m128i *) even_mask_arr);

        ref_mb_type_8x16_q0 = _mm_set1_epi8(i4_ref_mb_type_q0);
        ref_mb_type_8x16_q1 = _mm_set1_epi8(i4_ref_mb_type_q1);
        ref_mb_type_8x16_q2 = _mm_set1_epi8(i4_ref_mb_type_q2);
        ref_mb_type_8x16_q3 = _mm_set1_epi8(i4_ref_mb_type_q3);
        if((i4_mb_quard1_part_x >= i4_refarray_wd) && (i4_mb_quard1_part_y >= i4_refarray_ht))
        {
            /* Quad 0 only */
            ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
        }
        else if((i4_mb_quard1_part_y >= (i4_refarray_ht - 1)) &&
                (i4_mb_quard1_part_x < i4_refarray_wd))
        {
            /* Quads 0 & 1: horizontal split only */
            mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
            mask_8x16_0 =
                _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16); // 0xFF where a < b, else 0

            ref_mb_type_8x16_0 =
                _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
        }
        else
        {
            if(i4_mb_quard1_part_x >= i4_refarray_wd)
            {
                /* Quads 0 & 2: no horizontal split */
                ref_mb_type_8x16_0 = ref_mb_type_8x16_q0;
                ref_mb_type_8x16_low_0 = ref_mb_type_8x16_q2;
            }
            else
            {
                /* Quads 0, 1, 2, 3 */
                mb_quard1_part_x_8x16 = _mm_set1_epi8((i4_mb_quard1_part_x << 1));
                mask_8x16_0 =
                    _mm_cmplt_epi8(index_arr_0, mb_quard1_part_x_8x16); // 0xFF where a < b, else 0

                ref_mb_type_8x16_0 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q1, ref_mb_type_8x16_q0, mask_8x16_0);
                ref_mb_type_8x16_low_0 =
                    _mm_blendv_epi8(ref_mb_type_8x16_q3, ref_mb_type_8x16_q2, mask_8x16_0);
            }
            /* BUG FIX: compute the masks for BOTH sub-branches above. They    */
            /* previously sat inside the inner else only, leaving the low mask */
            /* uninitialized (and the upper mask at its default) on the        */
            /* Quad 0 & 2 path.                                                */
            mb_type_mask_8x16_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_0, one_8x16);
            mb_type_mask_8x16_low_0 = _mm_cmpeq_epi8(ref_mb_type_8x16_low_0, one_8x16);
        }

        if(i4_mb_quard1_part_y < i4_refarray_ht - 1)
        {
            /* Rows split vertically between upper and lower masks */
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                /* De-interleave one chroma component: even samples of both     */
                /* halves are packed into a single vector of 8 samples          */
                inp_data_16x8_0 = _mm_shuffle_epi8(inp_data_16x8_0, even_mask);
                inp_data_16x8_1 = _mm_shuffle_epi8(inp_data_16x8_1, even_mask);

                inp_data_16x8 = _mm_unpacklo_epi64(inp_data_16x8_0, inp_data_16x8_1);
                if(i4_y < i4_mb_quard1_part_y)
                {
                    res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8, mb_type_mask_8x16_0);
                }
                else
                {
                    res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8, mb_type_mask_8x16_low_0);
                }

                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
            }
        }
        else
        {
            /* No vertical split: every row uses the upper mask */
            for(i4_y = 0; i4_y < i4_refarray_ht; i4_y++)
            {
                pi2_ref_data_byte = pi2_inp_data + (i4_y * i4_inp_data_stride);
                inp_data_16x8_0 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte));
                inp_data_16x8_1 = _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + 8));

                inp_data_16x8_0 = _mm_shuffle_epi8(inp_data_16x8_0, even_mask);
                inp_data_16x8_1 = _mm_shuffle_epi8(inp_data_16x8_1, even_mask);
                inp_data_16x8 = _mm_unpacklo_epi64(inp_data_16x8_0, inp_data_16x8_1);

                res_16x8_0 = _mm_blendv_epi8(zero_8x16, inp_data_16x8, mb_type_mask_8x16_0);
                pi2_ref_array_temp = pi2_ref_array + (i4_y * i4_refarray_wd);
                _mm_storeu_si128((__m128i *) (pi2_ref_array_temp), res_16x8_0);
            }
        }
    }
}
1547