/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_intra_resamp_sse42.c
 *
 * @brief
 *  Contains function definitions for intra resampling functions
 *
 * @author
 *  Kishore
 *
 * @par List of Functions:
 *  - isvcd_interpolate_base_luma_dyadic_sse42
 *  - isvcd_interpolate_intra_base_sse42
 *  - isvcd_vert_interpol_chroma_dyadic_1_sse42
 *  - isvcd_vert_interpol_chroma_dyadic_2_sse42
 *  - isvcd_vert_interpol_chroma_dyadic_3_sse42
 *  - isvcd_horz_interpol_chroma_dyadic_1_sse42
 *  - isvcd_horz_interpol_chroma_dyadic_2_sse42
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#include <immintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>
/* User include files */
#include "ih264_typedefs.h"
#include "isvcd_structs.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_base_luma_dyadic_sse42                 */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  intra resampling for dyadic scaling ratios               */
/*  Inputs        : pu1_inp_buf : ptr to the 12x12 reference sample buffer   */
/*                  pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the   */
/*                  vertically interpolated data                             */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction followed */
/*                  by horizontal direction                                  */
/*  Outputs       : resampled pixels                                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         21 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/

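/* A minimal scalar sketch (ours, for exposition only; not called anywhere):
 * the SIMD routine below implements a separable 4-tap dyadic upsampler.
 * Even outputs use the phase-12 filter (-1, 8, 28, -3) and odd outputs the
 * phase-4 filter (-3, 28, 8, -1); each stage has a coefficient sum of 32, so
 * the horizontal stage removes the combined gain with (sum + 512) >> 10. */
static void isvcd_interpolate_base_luma_dyadic_scalar_sketch(
    UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    static const WORD32 ai4_filt[2][4] = {{-1, 8, 28, -3},  /* even: phase 12 */
                                          {-3, 28, 8, -1}}; /* odd : phase 4  */
    WORD32 i4_x, i4_y, i4_k, i4_sum;

    /* Vertical stage: 12 input columns, 16 output rows into the WORD16 scratch */
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        const WORD32 *pi4_c = ai4_filt[i4_y & 1];
        WORD32 i4_row = (i4_y + 1) >> 1; /* top row of the 4-tap window */
        for(i4_x = 0; i4_x < 12; i4_x++)
        {
            i4_sum = 0;
            for(i4_k = 0; i4_k < 4; i4_k++)
            {
                i4_sum += pi4_c[i4_k] * pu1_inp_buf[(i4_row + i4_k) * DYADIC_REF_W_Y + i4_x];
            }
            pi2_tmp_filt_buf[i4_y * 12 + i4_x] = (WORD16) i4_sum;
        }
    }

    /* Horizontal stage: 16x16 output with rounding and clipping to 8 bits */
    for(i4_y = 0; i4_y < 16; i4_y++)
    {
        for(i4_x = 0; i4_x < 16; i4_x++)
        {
            const WORD32 *pi4_c = ai4_filt[i4_x & 1];
            WORD32 i4_col = (i4_x + 1) >> 1;
            i4_sum = 512; /* rounding offset for the shift by 10 below */
            for(i4_k = 0; i4_k < 4; i4_k++)
            {
                i4_sum += pi4_c[i4_k] * pi2_tmp_filt_buf[i4_y * 12 + i4_col + i4_k];
            }
            i4_sum >>= 10;
            if(i4_sum < 0) i4_sum = 0;
            if(i4_sum > 255) i4_sum = 255;
            pu1_out_buf[i4_y * i4_out_stride + i4_x] = (UWORD8) i4_sum;
        }
    }
}
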
void isvcd_interpolate_base_luma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                              UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_x, i4_y;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3;
    __m128i i4_samp_8x16b_0, i4_samp_8x16b_1, i4_samp_8x16b_2, i4_samp_8x16b_3;
    __m128i i4_res_8x16b_r1_1, i4_res_8x16b_r1_2, i4_res_8x16b_r1_3;
    __m128i i4_res_8x16b_r2_1, i4_res_8x16b_r2_2, i4_res_8x16b_r2_3;

    /* Filter coefficient values for phase 4 */
    __m128i i4_coeff_8x16b_0 = _mm_set1_epi16(-3);
    __m128i i4_coeff_8x16b_1 = _mm_set1_epi16(28);
    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;

    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* Vertical interpolation */
    /* First 8 columns (64 bits of input) */
    for(i4_x = 0; i4_x < 1; i4_x++)
    {
        /* y = 0, y_phase = 12 */
        i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
        i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
        i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
        i4_samp_16x8b_3 =
            _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
        pu1_inp += (i4_src_stride << 2);
        i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
        i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
        i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

        /* y_phase is 12 for y = 0 */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        pi2_tmp += i4_filt_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* y = 15, y_phase = 4 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        /* Store the output */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    } /* End of loop over x */

    /* Remaining 4 columns (32 bits of input) */
    pu1_inp += 8;
    pi2_tmp += 8;
    for(i4_x = 0; i4_x < 1; i4_x++)
    {
        /* y = 0, y_phase = 12 */
        i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
        i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
        i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
        i4_samp_16x8b_3 =
            _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
        pu1_inp += (i4_src_stride << 2);
        i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
        i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
        i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

        /* y_phase is 12 for y = 0 */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        pi2_tmp += i4_filt_stride;

        for(i4_y = 1; i4_y < 15; i4_y += 2)
        {
            i4_samp_8x16b_0 = i4_samp_8x16b_1;
            i4_samp_8x16b_1 = i4_samp_8x16b_2;
            i4_samp_8x16b_2 = i4_samp_8x16b_3;
            i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
            /* y_phase is 4 for odd values of y */
            /* and 12 for even values of y */
            /* Multiply by 8 => left shift by 3 */
            i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
            i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
            i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

            i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
            i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
            i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

            i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
            i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

            i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
            i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

            i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
            i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

            /* Storing the results */
            _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
            _mm_storel_epi64((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
            pi2_tmp += (i4_filt_stride << 1);
            pu1_inp += i4_src_stride;
        } /* End of loop over y */

        /* y = 15, y_phase = 4 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        /* Store the output */
        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    }

    {
        __m128i coeff_c0_c1_8x16b = _mm_set_epi16(28, -3, 28, -3, 28, -3, 28, -3);
        __m128i coeff_c2_c3_8x16b = _mm_set_epi16(-1, 8, -1, 8, -1, 8, -1, 8);
        __m128i coeff_c3_c2_8x16b = _mm_set_epi16(8, -1, 8, -1, 8, -1, 8, -1);
        __m128i coeff_c1_c0_8x16b = _mm_set_epi16(-3, 28, -3, 28, -3, 28, -3, 28);

        __m128i i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart2_0;
        __m128i i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart2_1;
        __m128i i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart2_2;
        __m128i i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart2_3;
        __m128i i4_samp_8x16b_rpart1_4, i4_samp_8x16b_rpart2_4;

        __m128i i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart2_0;
        __m128i i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart2_1;
        __m128i i4_res_4x32b_rpart1_2, i4_res_4x32b_rpart2_2;
        __m128i i4_res_4x32b_rpart1_3, i4_res_4x32b_rpart2_3;

        __m128i res_512 = _mm_set1_epi32(512);
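        /* Rounding note: each of the two cascaded 4-tap stages has a
         * coefficient sum of 32, so the results carry a gain of 32 * 32 =
         * 1024; res_512 plus the arithmetic shift by 10 below performs
         * rounded normalisation back to the 8-bit pixel range. */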
        /* Horizontal interpolation */
        for(i4_y = 0; i4_y < 16; i4_y++)
        {
            // a0 a1 a2 a3 a4 a5 a6 a7
            i4_samp_8x16b_rpart1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
            // a4 a5 a6 a7 a8 a9 a10 a11
            i4_samp_8x16b_rpart2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 4));
            // a1 a2 a3 a4 a5 a6 a7 0
            i4_samp_8x16b_rpart1_1 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 2);
            // a2 a3 a4 a5 a6 a7 0 0
            i4_samp_8x16b_rpart1_2 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 4);
            // a3 a4 a5 a6 a7 0 0 0
            i4_samp_8x16b_rpart1_3 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 6);
            // a4 a5 a6 a7 0 0 0 0
            i4_samp_8x16b_rpart1_4 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 8);

            // a5 a6 a7 a8 a9 a10 a11 0
            i4_samp_8x16b_rpart2_1 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 2);
            // a6 a7 a8 a9 a10 a11 0 0
            i4_samp_8x16b_rpart2_2 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 4);
            // a7 a8 a9 a10 a11 0 0 0
            i4_samp_8x16b_rpart2_3 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 6);
            // a8 a9 a10 a11 0 0 0 0
            i4_samp_8x16b_rpart2_4 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 8);
            // a0 a1 a1 a2 a2 a3 a3 a4
            i4_samp_8x16b_rpart1_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart1_1);
            // a1 a2 a2 a3 a3 a4 a4 a5
            i4_samp_8x16b_rpart1_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart1_2);
            // a2 a3 a3 a4 a4 a5 a5 a6
            i4_samp_8x16b_rpart1_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart1_3);
            // a3 a4 a4 a5 a5 a6 a6 a7
            i4_samp_8x16b_rpart1_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart1_4);
            // a4 a5 a5 a6 a6 a7 a7 a8
            i4_samp_8x16b_rpart2_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_0, i4_samp_8x16b_rpart2_1);
            // a5 a6 a6 a7 a7 a8 a8 a9
            i4_samp_8x16b_rpart2_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_1, i4_samp_8x16b_rpart2_2);
            // a6 a7 a7 a8 a8 a9 a9 a10
            i4_samp_8x16b_rpart2_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_2, i4_samp_8x16b_rpart2_3);
            // a7 a8 a8 a9 a9 a10 a10 a11
            i4_samp_8x16b_rpart2_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_3, i4_samp_8x16b_rpart2_4);
            // a0c3+a1c2 a1c3+a2c2 a2c3+a3c2 a3c3+a4c2
            i4_res_4x32b_rpart1_0 = _mm_madd_epi16(i4_samp_8x16b_rpart1_0, coeff_c3_c2_8x16b);
            // a2c1+a3c0 a3c1+a4c0 a4c1+a5c0 a5c1+a6c0
            i4_res_4x32b_rpart1_2 = _mm_madd_epi16(i4_samp_8x16b_rpart1_2, coeff_c1_c0_8x16b);
            // a1c0+a2c1 a2c0+a3c1 a3c0+a4c1 a4c0+a5c1
            i4_res_4x32b_rpart1_1 = _mm_madd_epi16(i4_samp_8x16b_rpart1_1, coeff_c0_c1_8x16b);
            // a3c2+a4c3 a4c2+a5c3 a5c2+a6c3 a6c2+a7c3
            i4_res_4x32b_rpart1_3 = _mm_madd_epi16(i4_samp_8x16b_rpart1_3, coeff_c2_c3_8x16b);
            // a4c3+a5c2 a5c3+a6c2 a6c3+a7c2 a7c3+a8c2
            i4_res_4x32b_rpart2_0 = _mm_madd_epi16(i4_samp_8x16b_rpart2_0, coeff_c3_c2_8x16b);
            // a6c1+a7c0 a7c1+a8c0 a8c1+a9c0 a9c1+a10c0
            i4_res_4x32b_rpart2_2 = _mm_madd_epi16(i4_samp_8x16b_rpart2_2, coeff_c1_c0_8x16b);
            // a5c0+a6c1 a6c0+a7c1 a7c0+a8c1 a8c0+a9c1
            i4_res_4x32b_rpart2_1 = _mm_madd_epi16(i4_samp_8x16b_rpart2_1, coeff_c0_c1_8x16b);
            // a7c2+a8c3 a8c2+a9c3 a9c2+a10c3 a10c2+a11c3
            i4_res_4x32b_rpart2_3 = _mm_madd_epi16(i4_samp_8x16b_rpart2_3, coeff_c2_c3_8x16b);
            // a0c3+a1c2 + a2c1+a3c0   a1c3+a2c2 + a3c1+a4c0   a2c3+a3c2 + a4c1+a5c0
            // a3c3+a4c2 + a5c1+a6c0
            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_2);
            // a1c0+a2c1 + a3c2+a4c3   a2c0+a3c1 + a4c2+a5c3   a3c0+a4c1 + a5c2+a6c3
            // a4c0+a5c1 + a6c2+a7c3
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart1_3);
            // a4c3+a5c2 + a6c1+a7c0   a5c3+a6c2 + a7c1+a8c0   a6c3+a7c2 + a8c1+a9c0
            // a7c3+a8c2 + a9c1+a10c0
            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_2);
            // a5c0+a6c1 + a7c2+a8c3   a6c0+a7c1 + a8c2+a9c3   a7c0+a8c1 + a9c2+a10c3
            // a8c0+a9c1 + a10c2+a11c3
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_1, i4_res_4x32b_rpart2_3);

            i4_res_4x32b_rpart1_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
            i4_res_4x32b_rpart1_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);

            i4_res_4x32b_rpart2_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
            i4_res_4x32b_rpart2_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_2, res_512);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_3, res_512);

            i4_res_4x32b_rpart1_0 = _mm_srai_epi32(i4_res_4x32b_rpart1_0, 10);
            i4_res_4x32b_rpart1_1 = _mm_srai_epi32(i4_res_4x32b_rpart1_1, 10);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_2, res_512);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_3, res_512);

            i4_res_4x32b_rpart2_0 = _mm_srai_epi32(i4_res_4x32b_rpart2_0, 10);
            i4_res_4x32b_rpart2_1 = _mm_srai_epi32(i4_res_4x32b_rpart2_1, 10);

            _mm_storeu_si128(
                (__m128i *) pu1_out,
                _mm_packus_epi16(_mm_packus_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1),
                                 _mm_packus_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1)));

            pi2_tmp += i4_filt_stride;
            pu1_out += i4_out_stride;
        } /* End of loop over y */
    }
} /* isvcd_interpolate_base_luma_dyadic_sse42 */
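
/* Usage note (informational): the dyadic path above expands the 12x12
 * reference window (rows DYADIC_REF_W_Y apart) into one 16x16 luma MB,
 * using the caller-provided 12x16 WORD16 scratch buffer between stages. */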

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_intra_base_sse42                       */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  interpolation of a component to find the intra           */
/*                  resampled value                                          */
/*  Inputs        : pv_intra_samp_ctxt : intra sampling context              */
/*                  pu1_out : output buffer pointer                          */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_refarray_wd : reference array width                   */
/*                  i4_mb_x, i4_mb_y : current MB coordinates                */
/*                  i4_chroma_flag : chroma processing flag                  */
/*                  i4_refarray_flag : reference array buffer selection flag */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction followed */
/*                  by horizontal direction                                  */
/*  Outputs       : resampled pixels                                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         06 09 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_interpolate_intra_base_sse42(void *pv_intra_samp_ctxt, UWORD8 *pu1_out,
                                        WORD32 i4_out_stride, WORD32 i4_refarray_wd,
                                        WORD32 i4_mb_x, WORD32 i4_mb_y, WORD32 i4_chroma_flag,
                                        WORD32 i4_refarray_flag)
{
    /* --------------------------------------------------------------------- */
    /* Index Parameters                                                      */
    /* --------------------------------------------------------------------- */
    intra_sampling_ctxt_t *ps_ctxt;
    intra_samp_map_ctxt_t *ps_map_ctxt;
    intra_samp_lyr_ctxt *ps_lyr_ctxt;
    WORD32 i4_x, i4_y;
    WORD32 i4_frm_mb_x, i4_frm_mb_y;
    UWORD8 *pu1_refarray = NULL;
    ref_pixel_map_t *ps_x_pos_phase;
    ref_pixel_map_t *ps_y_pos_phase;
    WORD32 i4_temp_array_ht;
    WORD32 *pi4_interp_buff;
    WORD32 i4_mb_wd;
    WORD32 i4_mb_ht;

    WORD32 i4_x_min;
    ref_min_max_map_t *ps_x_min_max;
    WORD8 arr_y_ref_pos_luma[16] = {0};
    WORD8 arr_x_ref_pos_luma[16] = {0};
    WORD8 arr_x_ref_pos_luma_low[16] = {0};
    WORD8 arr_x_ref_pos_luma_high[16] = {0};
    WORD8 arr_phase_luma[32] = {0};
    WORD8 *pi4_y_ref_pos_luma;
    WORD8 *pi4_x_ref_pos_luma_low;
    WORD8 *pi4_x_ref_pos_luma_high;
    WORD8 *pi4_phase_luma;
    UWORD8 *pu1_refarray_temp;

    /* --------------------------------------------------------------------- */
    /* Extracting pointers from the context                                  */
    /* --------------------------------------------------------------------- */
    ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt;
    ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];

    if(0 == i4_refarray_flag)
    {
        pu1_refarray = ps_ctxt->pu1_refarray_buffer;
    }
    else if(1 == i4_refarray_flag)
    {
        pu1_refarray = ps_ctxt->pu1_refarray_cb;
    }

    /* --------------------------------------------------------------------- */
    /* LUMA or CHROMA                                                        */
    /* --------------------------------------------------------------------- */

    if(1 == i4_chroma_flag)
        ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt);
    else
        ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt);

    i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
    i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;

    ps_x_min_max = ps_map_ctxt->ps_x_min_max;

    i4_frm_mb_y = i4_mb_y * i4_mb_ht;
    i4_frm_mb_x = i4_mb_x * i4_mb_wd;
    /* get the min position */
    i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos;

    /* --------------------------------------------------------------------- */
    /* Projected frame level pointers                                        */
    /* --------------------------------------------------------------------- */
    ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
    ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;

    /* --------------------------------------------------------------------- */
    /* Pointers and Dimension of the temporary buffer                        */
    /* --------------------------------------------------------------------- */
    i4_temp_array_ht = i4_mb_ht;
    pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer;

    if(i4_chroma_flag == 0)
    {
        /* ----------------------------------------------------------------- */
        /* Loop for interpolation in vertical direction                      */
        /* ----------------------------------------------------------------- */
        WORD16 *pi2_interp_buff_temp;
        pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
        {
            __m128i out_res_8x16b_0, out_res_8x16b_1;

            __m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
                out_res_8x16b_r01_0;
            __m128i inp_8x16b_r1, inp_8x16b_r23_0, phs_mask_16x8b_r1, phs_mask_16x8b_r23_0,
                out_res_8x16b_r01_1;
            __m128i inp_8x16b_r2, inp_8x16b_r01_1, phs_mask_16x8b_r2, phs_mask_16x8b_r01_1,
                out_res_8x16b_r23_0;
            __m128i inp_8x16b_r3, inp_8x16b_r23_1, phs_mask_16x8b_r3, phs_mask_16x8b_r23_1,
                out_res_8x16b_r23_1;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
                arr_y_ref_pos_luma[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
            }
            pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
            pi4_phase_luma = arr_phase_luma;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                pu1_refarray_temp =
                    pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
                inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp - i4_refarray_wd));
                inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
                inp_8x16b_r2 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));
                inp_8x16b_r3 =
                    _mm_loadu_si128((__m128i *) (pu1_refarray_temp + 2 * i4_refarray_wd));

                inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r23_0 = _mm_unpacklo_epi8(inp_8x16b_r2, inp_8x16b_r3);
                inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r23_1 = _mm_unpackhi_epi8(inp_8x16b_r2, inp_8x16b_r3);

                phs_mask_16x8b_r0 = _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
                phs_mask_16x8b_r1 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
                phs_mask_16x8b_r2 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
                phs_mask_16x8b_r3 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);

                phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r23_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);
                phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r23_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);

                out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
                out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);
                out_res_8x16b_r23_0 = _mm_maddubs_epi16(inp_8x16b_r23_0, phs_mask_16x8b_r23_0);
                out_res_8x16b_r23_1 = _mm_maddubs_epi16(inp_8x16b_r23_1, phs_mask_16x8b_r23_1);

                out_res_8x16b_0 = _mm_add_epi16(out_res_8x16b_r01_0, out_res_8x16b_r23_0);
                out_res_8x16b_1 = _mm_add_epi16(out_res_8x16b_r01_1, out_res_8x16b_r23_1);

                _mm_storeu_si128(
                    (__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
                    out_res_8x16b_0);
                _mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
                                              (i4_x_min - 1) + 8),
                                 out_res_8x16b_1);
            }
        }
        /* ----------------------------------------------------------------- */
        /* Loop for interpolation in horizontal direction                    */
        /* ----------------------------------------------------------------- */
        {
            WORD32 strt_indx = 10, strt_indx_h = 0;

            __m128i inp_8x16b_0;
            __m128i inp_8x16b_1;

            __m128i phs_mask_16x8b_0;
            __m128i phs_mask_16x8b_1;
            __m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r0_1, x_ref_pos_luma_mask_r1_0,
                x_ref_pos_luma_mask_r1_1, x_ref_pos_luma_mask_r2_0, x_ref_pos_luma_mask_r2_1,
                x_ref_pos_luma_mask_r3_0, x_ref_pos_luma_mask_r3_1;

            __m128i inp_8x16b_2, inp_8x16b_3;

            WORD32 i4_x2 = 0;
            WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1);
            __m128i twos = _mm_set1_epi8(2);

            strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1;
            strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1);
            for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
            {
                arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
                arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
                arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1;
            }

            for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++)
            {
                i4_x2 = i4_x << 1;
                arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
                arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
            }
            for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++)
            {
                i4_x2 = (i4_x - i4_mb_wd_hlf) << 1;
                arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1);
                arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1;
            }
            pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
            pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high;
            pi4_phase_luma = arr_phase_luma;

            phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
            phs_mask_16x8b_1 = _mm_loadu_si128((__m128i *) (pi4_phase_luma + 8));

            x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
            x_ref_pos_luma_mask_r0_1 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_high));
            x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);
            x_ref_pos_luma_mask_r1_1 = _mm_add_epi8(x_ref_pos_luma_mask_r0_1, twos);
            x_ref_pos_luma_mask_r2_0 = _mm_add_epi8(x_ref_pos_luma_mask_r1_0, twos);
            x_ref_pos_luma_mask_r2_1 = _mm_add_epi8(x_ref_pos_luma_mask_r1_1, twos);
            x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0;
            x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1;

            {
                __m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r0_1,
                    ip_filt_8x16b_r01_l_0, ip_filt_8x16b_r01_h_0;
                __m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r1_1,
                    ip_filt_8x16b_r23_l_0, ip_filt_8x16b_r23_h_0;
                __m128i ip_filt_16x8b_r2, ip_filt_8x16b_r2_0, ip_filt_8x16b_r2_1,
                    ip_filt_8x16b_r01_l_1, ip_filt_8x16b_r01_h_1;
                __m128i ip_filt_16x8b_r3, ip_filt_8x16b_r3_0, ip_filt_8x16b_r3_1,
                    ip_filt_8x16b_r23_l_1, ip_filt_8x16b_r23_h_1;

                __m128i inp_8x16b_r0_0, inp_8x16b_r2_0, inp_8x16b_r01_l_0, inp_8x16b_r01_h_0,
                    out_res_4x32b_r01_l_0, out_res_4x32b_r01_h_0;
                __m128i inp_8x16b_r0_1, inp_8x16b_r2_1, inp_8x16b_r23_l_0, inp_8x16b_r23_h_0,
                    out_res_4x32b_r01_l_1, out_res_4x32b_r01_h_1;
                __m128i inp_8x16b_r1_0, inp_8x16b_r3_0, inp_8x16b_r01_l_1, inp_8x16b_r01_h_1,
                    out_res_4x32b_r23_l_0, out_res_4x32b_r23_h_0;
                __m128i inp_8x16b_r1_1, inp_8x16b_r3_1, inp_8x16b_r23_l_1, inp_8x16b_r23_h_1,
                    out_res_4x32b_r23_l_1, out_res_4x32b_r23_h_1;

                __m128i out_res_4x32b_l_0;
                __m128i out_res_4x32b_l_1;
                __m128i out_res_4x32b_h_0;
                __m128i out_res_4x32b_h_1;

                __m128i out_res_8x16b_l;
                __m128i out_res_8x16b_h;

                __m128i out_res_16x8b;
                __m128i const_512 = _mm_set1_epi32(512);

                ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma));
                ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 16));
                ip_filt_16x8b_r2 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 32));
                ip_filt_16x8b_r3 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 48));

                ip_filt_8x16b_r0_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
                ip_filt_8x16b_r1_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));
                ip_filt_8x16b_r2_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_0));
                ip_filt_8x16b_r3_0 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_0));

                ip_filt_8x16b_r0_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_1));
                ip_filt_8x16b_r1_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_1));
                ip_filt_8x16b_r2_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_1));
                ip_filt_8x16b_r3_1 =
                    _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_1));

                ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
                ip_filt_8x16b_r23_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);
                ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
                ip_filt_8x16b_r23_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);

                ip_filt_8x16b_r01_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
                ip_filt_8x16b_r23_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);
                ip_filt_8x16b_r01_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
                ip_filt_8x16b_r23_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);

                for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
                {
                    inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
                    inp_8x16b_1 = _mm_loadu_si128(
                        (__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h));
                    inp_8x16b_2 =
                        _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx + 3));
                    inp_8x16b_3 = _mm_loadu_si128(
                        (__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h + 3));
                    pi2_interp_buff_temp += i4_refarray_wd;

                    inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
                    inp_8x16b_r0_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r0_1);
                    inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);
                    inp_8x16b_r1_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r1_1);

                    inp_8x16b_r2_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r2_0);
                    inp_8x16b_r2_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r2_1);
                    inp_8x16b_r3_0 = _mm_shuffle_epi8(inp_8x16b_2, x_ref_pos_luma_mask_r3_0);
                    inp_8x16b_r3_1 = _mm_shuffle_epi8(inp_8x16b_3, x_ref_pos_luma_mask_r3_1);

                    inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
                    inp_8x16b_r23_l_0 = _mm_unpacklo_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);
                    inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
                    inp_8x16b_r23_l_1 = _mm_unpackhi_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);

                    inp_8x16b_r01_h_0 = _mm_unpacklo_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
                    inp_8x16b_r23_h_0 = _mm_unpacklo_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);
                    inp_8x16b_r01_h_1 = _mm_unpackhi_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
                    inp_8x16b_r23_h_1 = _mm_unpackhi_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);

                    out_res_4x32b_r01_l_0 =
                        _mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
                    out_res_4x32b_r01_l_1 =
                        _mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);
                    out_res_4x32b_r23_l_0 =
                        _mm_madd_epi16(inp_8x16b_r23_l_0, ip_filt_8x16b_r23_l_0);
                    out_res_4x32b_r23_l_1 =
                        _mm_madd_epi16(inp_8x16b_r23_l_1, ip_filt_8x16b_r23_l_1);

                    out_res_4x32b_r01_h_0 =
                        _mm_madd_epi16(inp_8x16b_r01_h_0, ip_filt_8x16b_r01_h_0);
                    out_res_4x32b_r01_h_1 =
                        _mm_madd_epi16(inp_8x16b_r01_h_1, ip_filt_8x16b_r01_h_1);
                    out_res_4x32b_r23_h_0 =
                        _mm_madd_epi16(inp_8x16b_r23_h_0, ip_filt_8x16b_r23_h_0);
                    out_res_4x32b_r23_h_1 =
                        _mm_madd_epi16(inp_8x16b_r23_h_1, ip_filt_8x16b_r23_h_1);

                    out_res_4x32b_l_0 = _mm_add_epi32(out_res_4x32b_r01_l_0, out_res_4x32b_r23_l_0);
                    out_res_4x32b_l_1 = _mm_add_epi32(out_res_4x32b_r01_l_1, out_res_4x32b_r23_l_1);
                    out_res_4x32b_h_0 = _mm_add_epi32(out_res_4x32b_r01_h_0, out_res_4x32b_r23_h_0);
                    out_res_4x32b_h_1 = _mm_add_epi32(out_res_4x32b_r01_h_1, out_res_4x32b_r23_h_1);

                    out_res_4x32b_l_0 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_0, const_512), 10);
                    out_res_4x32b_l_1 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_1, const_512), 10);
                    out_res_4x32b_h_0 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_0, const_512), 10);
                    out_res_4x32b_h_1 =
                        _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_1, const_512), 10);

                    out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);
                    out_res_8x16b_h = _mm_packs_epi32(out_res_4x32b_h_0, out_res_4x32b_h_1);

                    out_res_16x8b = _mm_packus_epi16(out_res_8x16b_l, out_res_8x16b_h);
                    _mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_res_16x8b);
                }
            }
        }
    }
    else
    {
        WORD16 *pi2_interp_buff_temp;
        pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;

        {
            __m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
                out_res_8x16b_r01_0;
            __m128i inp_8x16b_r1, phs_mask_16x8b_r1, out_res_8x16b_r01_1;
            __m128i inp_8x16b_r01_1, phs_mask_16x8b_r01_1;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                arr_y_ref_pos_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos;
                arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
            }
            pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
            pi4_phase_luma = arr_phase_luma;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                pu1_refarray_temp =
                    pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
                inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
                inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));

                inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);

                phs_mask_16x8b_r0 = _mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
                phs_mask_16x8b_r1 =
                    _mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);

                phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);

                out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
                out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);

                _mm_storeu_si128(
                    (__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
                    out_res_8x16b_r01_0);
                _mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
                                              (i4_x_min - 1) + 8),
                                 out_res_8x16b_r01_1);
            }
        }

        {
            WORD32 strt_indx = 10;
            __m128i inp_8x16b_0, inp_8x16b_r0_0;
            __m128i phs_mask_16x8b_0;
            __m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r1_0;
            __m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r01_l_0;
            __m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r01_l_1;
            __m128i inp_8x16b_r1_0, inp_8x16b_r01_l_0, out_res_4x32b_r01_l_0;
            __m128i inp_8x16b_r01_l_1, out_res_4x32b_r01_l_1;

            __m128i out_res_4x32b_l_0;
            __m128i out_res_4x32b_l_1;
            __m128i out_res_8x16b_l;
            __m128i out_16x8b_r1;
            __m128i chroma_mask;
            __m128i const_512 = _mm_set1_epi32(512);

            WORD32 i4_x2 = 0;
            __m128i twos = _mm_set1_epi8(2);
            strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos;
            for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
            {
                arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
                arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
                arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx;
                i4_x2 = i4_x << 1;
                arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
                arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
            }

            pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
            pi4_phase_luma = arr_phase_luma;
            phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
            x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
            x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);

            ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma));
            ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma + 16));

            ip_filt_8x16b_r0_0 =
                _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
            ip_filt_8x16b_r1_0 =
                _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));

            ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
            ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);

            for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
            {
                inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
                pi2_interp_buff_temp += i4_refarray_wd;

                inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
                inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);

                inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
                inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);

                out_res_4x32b_r01_l_0 = _mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
                out_res_4x32b_r01_l_1 = _mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);

                out_res_4x32b_l_0 =
                    _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_0, const_512), 10);
                out_res_4x32b_l_1 =
                    _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_1, const_512), 10);

                out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);

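                /* The destination holds interleaved chroma; the mask keeps the
                 * other component's bytes (0xFF00 of each 16-bit pair) and the
                 * byte-wise add merges in the results computed above. */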
                chroma_mask = _mm_set1_epi16(0xFF00);
                out_16x8b_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)));
                out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
                out_16x8b_r1 = _mm_add_epi8(out_res_8x16b_l, out_16x8b_r1);
                _mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_16x8b_r1);
            }
        }
    }
    return;
} /* End of Interpolation Function */
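
/* Outline of the math above (informational; the filter tables are laid out as
 * banks of 16 entries, one bank per tap, indexed by phase, as the +16/+32/+48
 * offsets show). Per luma output pixel, with (yr, yp) and (xr, xp) the
 * reference position/phase pairs from ps_y_pos_phase / ps_x_pos_phase:
 *
 *   tmp[y][x] = sum over k of filt[16*k + yp] * ref[yr - 1 + k][x]   (k = 0..3)
 *   out[y][x] = CLIP_U8((sum over k of filt[16*k + xp] * tmp[y][xr - 1 + k]
 *                        + 512) >> 10)
 *
 * The chroma branch has the same shape with the 2-tap
 * g_au1_interp_filter_chroma table. */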

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_vert_interpol_chroma_dyadic_1_sse42                */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  vertical intra resampling of a chroma component for      */
/*                  dyadic scaling ratios                                    */
/*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
/*                  pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the     */
/*                  vertically interpolated data                             */
/*                  i4_phase_0 : y phase for even values of y                */
/*                  i4_phase_1 : y phase for odd values of y                 */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction          */
/*  Outputs       : vertically resampled samples                             */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         06 09 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_1_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                               WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4,
        i4_samp_16x8b_5;
    __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
        i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
    __m128i i4_res_8x16b_r7_temp;
    __m128i i4_c0_c1_16x8b, i4_c2_c3_16x8b;

    i4_coeff_0 = (WORD8) (8 - i4_phase_0);
    i4_coeff_1 = (WORD8) (i4_phase_0);
    i4_coeff_2 = (WORD8) (8 - i4_phase_1);
    i4_coeff_3 = (WORD8) (i4_phase_1);

    i4_c0_c1_16x8b =
        _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    i4_c2_c3_16x8b =
        _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);

    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;
    i4_src_stride = DYADIC_REF_W_C;

    i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 2)));
    i4_samp_16x8b_5 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 2) + i4_src_stride));

    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);

    i4_samp_16x8b_4 = _mm_unpacklo_epi8(i4_samp_16x8b_4, i4_samp_16x8b_5);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_4, i4_c2_c3_16x8b);

    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);

    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);

    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}
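
/* A minimal scalar sketch (ours, for exposition only; not called anywhere) of
 * the 2-tap vertical filter implemented by the intrinsics above: each output
 * row blends two adjacent input rows with weights (8 - phase, phase), using
 * i4_phase_0 for even rows and i4_phase_1 for odd rows. */
static void isvcd_vert_interpol_chroma_dyadic_1_scalar_sketch(UWORD8 *pu1_inp_buf,
                                                              WORD16 *pi2_tmp_filt_buf,
                                                              WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD32 i4_x, i4_y;
    for(i4_y = 0; i4_y < 8; i4_y++)
    {
        /* Row pair and phase selection mirror the unrolled stores above */
        WORD32 i4_row = (i4_y + 1) >> 1;
        WORD32 i4_phase = (i4_y & 1) ? i4_phase_1 : i4_phase_0;
        for(i4_x = 0; i4_x < 6; i4_x++)
        {
            pi2_tmp_filt_buf[i4_y * 6 + i4_x] =
                (WORD16) ((8 - i4_phase) * pu1_inp_buf[i4_row * DYADIC_REF_W_C + i4_x] +
                          i4_phase * pu1_inp_buf[(i4_row + 1) * DYADIC_REF_W_C + i4_x]);
        }
    }
}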

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_vert_interpol_chroma_dyadic_2_sse42                */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  vertical intra resampling for dyadic scaling ratios for  */
/*                  chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/*                  chroma_phase_y_plus1:                                    */
/*                      ref_lyr    cur_lyr                                   */
/*                         0          1                                      */
/*                         0          2                                      */
/*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
/*                  pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the     */
/*                  vertically interpolated data                             */
/*                  i4_phase_0 : y phase for even values of y                */
/*                  i4_phase_1 : y phase for odd values of y                 */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction          */
/*  Outputs       : vertically resampled samples                             */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         21 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_2_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                               WORD32 i4_phase_0, WORD32 i4_phase_1)
1223 {
1224 WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1225 WORD32 i4_filt_stride, i4_src_stride;
1226 UWORD8 *pu1_inp;
1227 WORD16 *pi2_tmp;
1228 __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
1229 __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
1230 i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
1231 __m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;
1232 i4_coeff_0 = (WORD8) (8 - i4_phase_0);
1233 i4_coeff_1 = (WORD8) (i4_phase_0);
1234 i4_coeff_2 = (WORD8) (8 - i4_phase_1);
1235 i4_coeff_3 = (WORD8) (i4_phase_1);
1236
1237 i4_c0_c1_16x8b =
1238 _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1239 i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1240 i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
1241 i4_c2_c3_16x8b =
1242 _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1243 i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1244 i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
1245
1246 pi2_tmp = pi2_tmp_filt_buf;
1247 i4_filt_stride = 6;
1248 i4_src_stride = DYADIC_REF_W_C;
1249 pu1_inp = pu1_inp_buf + i4_src_stride;
1250
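    /* Load five consecutive reference rows: the four adjacent row pairs
     * produce the eight output rows, two phases per pair. For the (0, 1) and
     * (0, 2) phase combinations the filtering window starts at row 1 of the
     * reference block, hence the i4_src_stride offset on pu1_inp above. */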
    i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));

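    /*
     * Vertical filtering: adjacent rows are byte-interleaved so that
     * _mm_maddubs_epi16 with the (c0, c1) / (c2, c3) patterns built above
     * computes all the taps of a row at once. Scalar equivalent per column x
     * and row pair k:
     *     tmp[2 * k][x]     = (8 - i4_phase_0) * row[k][x] + i4_phase_0 * row[k + 1][x];
     *     tmp[2 * k + 1][x] = (8 - i4_phase_1) * row[k][x] + i4_phase_1 * row[k + 1][x];
     * The results stay scaled by 8; the rounding is applied in the
     * horizontal pass.
     */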
    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

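    /* Rows 6 and 7: the same partial-store plus shuffle/blend stitch that is
     * described in the function above keeps all writes inside the 6x8
     * temporary buffer. */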
    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);

    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_vert_interpol_chroma_dyadic_3_sse42                */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  vertical intra resampling for dyadic scaling ratios for  */
/*                  chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
/*                  chroma_phase_y_plus1:                                    */
/*                      ref_lyr        cur_lyr                               */
/*                        2              0                                   */
/*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
/*                  pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the     */
/*                          vertically interpolated data                     */
/*                  i4_phase_0 : y phase for even values of y                */
/*                  i4_phase_1 : y phase for odd values of y                 */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction          */
/*  Outputs       : vertically resampled samples                             */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         21 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_vert_interpol_chroma_dyadic_3_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                               WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
    __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
        i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
    __m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;
    i4_coeff_0 = (WORD8) (8 - i4_phase_0);
    i4_coeff_1 = (WORD8) (i4_phase_0);
    i4_coeff_2 = (WORD8) (8 - i4_phase_1);
    i4_coeff_3 = (WORD8) (i4_phase_1);

    i4_c0_c1_16x8b =
        _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    i4_c2_c3_16x8b =
        _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);

    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;
    i4_src_stride = DYADIC_REF_W_C;
    pu1_inp = pu1_inp_buf;

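    /* Identical filtering to isvcd_vert_interpol_chroma_dyadic_2_sse42; the
     * only difference is that for the (2, 0) phase combination the window
     * starts at row 0 of the reference block (pu1_inp_buf, no offset). */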
    i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));

    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

    /* Rows 6 and 7: same end-of-buffer stitch as above. */
    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_horz_interpol_chroma_dyadic_1_sse42                */
/*                                                                           */
/*  Description   : This function takes the vertically interpolated data &   */
/*                  performs horizontal intra resampling for dyadic scaling  */
/*                  ratios for chroma for the following                      */
/*                  ref_lyr_chroma_phase_x_plus1_flag and                    */
/*                  chroma_phase_x_plus1_flag:                               */
/*                      ref_lyr        cur_lyr                               */
/*                        0              0                                   */
/*                        1              0                                   */
/*                        1              1                                   */
/*  Inputs        : pi2_tmp_filt_buf : ptr to the 6x8 buffer containing the  */
/*                          vertically interpolated data                     */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_phase_0 : x phase for even values of x                */
/*                  i4_phase_1 : x phase for odd values of x                 */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in horizontal direction        */
/*  Outputs       : horizontally resampled samples                           */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         21 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_horz_interpol_chroma_dyadic_1_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                               WORD32 i4_out_stride, WORD32 i4_phase_0,
                                               WORD32 i4_phase_1)
{
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2;
    __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2;
    __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2;
    __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2;
    __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2;
    __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2;
    __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2;
    __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2;

    __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
    __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
    __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
    __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
    __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
    __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
    __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
    __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;

    __m128i i4_res_final_8x16b_r1;
    __m128i i4_res_final_8x16b_r2;
    __m128i i4_res_final_8x16b_r3;
    __m128i i4_res_final_8x16b_r4;
    __m128i i4_res_final_8x16b_r5;
    __m128i i4_res_final_8x16b_r6;
    __m128i i4_res_final_8x16b_r7;
    __m128i i4_res_final_8x16b_r8;

    __m128i out_16x8b_r1;
    __m128i out_16x8b_r2;
    __m128i out_16x8b_r3;
    __m128i out_16x8b_r4;
    __m128i out_16x8b_r5;
    __m128i out_16x8b_r6;
    __m128i out_16x8b_r7;
    __m128i out_16x8b_r8;
    __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
    __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
    __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
    __m128i i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1;
    __m128i chroma_mask, chroma_mask2;
    __m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;

    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;
    coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
                                      i4_coeff_0, i4_coeff_1, i4_coeff_0);
    coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
                                      i4_coeff_2, i4_coeff_3, i4_coeff_2);
    res_32 = _mm_set1_epi32(32);
    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_dst_stride = i4_out_stride;

    i4_dst_stride2 = i4_dst_stride << 1;
    i4_dst_stride4 = i4_dst_stride << 2;

    /* Horizontal interpolation */
    /* x = 0, x_phase = phase_0 */
    i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);        // a0 a1 a2 a3 a4 a5 a6 a7
    i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));  // b0 b1 b2 b3 b4 b5 b6 b7
    i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12)); // c0 c1 c2 c3 c4 c5 c6 c7
    i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18)); // d0 d1 d2 d3 d4 d5 d6 d7
    i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24)); // e0 e1 e2 e3 e4 e5 e6 e7
    i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30)); // f0 f1 f2 f3 f4 f5 f6 f7
    i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36)); // g0 g1 g2 g3 g4 g5 g6 g7
    i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42)); // h0 h1 h2 h3 h4 h5 h6 h7

    i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2); // a1 a2 a3 a4 a5 a6 a7 0
    i4_samp_8x16b_r1_2 = _mm_srli_si128(i4_samp_8x16b_r1_0, 4); // a2 a3 a4 a5 a6 a7 0  0

    i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
    i4_samp_8x16b_r2_2 = _mm_srli_si128(i4_samp_8x16b_r2_0, 4); // b2 b3 b4 b5 b6 b7 0  0

    i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2); // c1 c2 c3 c4 c5 c6 c7 0
    i4_samp_8x16b_r3_2 = _mm_srli_si128(i4_samp_8x16b_r3_0, 4); // c2 c3 c4 c5 c6 c7 0  0

    i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2); // d1 d2 d3 d4 d5 d6 d7 0
    i4_samp_8x16b_r4_2 = _mm_srli_si128(i4_samp_8x16b_r4_0, 4); // d2 d3 d4 d5 d6 d7 0  0

    i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2); // e1 e2 e3 e4 e5 e6 e7 0
    i4_samp_8x16b_r5_2 = _mm_srli_si128(i4_samp_8x16b_r5_0, 4); // e2 e3 e4 e5 e6 e7 0  0

    i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2); // f1 f2 f3 f4 f5 f6 f7 0
    i4_samp_8x16b_r6_2 = _mm_srli_si128(i4_samp_8x16b_r6_0, 4); // f2 f3 f4 f5 f6 f7 0  0

    i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2); // g1 g2 g3 g4 g5 g6 g7 0
    i4_samp_8x16b_r7_2 = _mm_srli_si128(i4_samp_8x16b_r7_0, 4); // g2 g3 g4 g5 g6 g7 0  0

    i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2); // h1 h2 h3 h4 h5 h6 h7 0
    i4_samp_8x16b_r8_2 = _mm_srli_si128(i4_samp_8x16b_r8_0, 4); // h2 h3 h4 h5 h6 h7 0  0

    i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
                                            i4_samp_8x16b_r1_1); // a0 a1 a1 a2 a2 a3 a3 a4
    i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
                                            i4_samp_8x16b_r2_1); // b0 b1 b1 b2 b2 b3 b3 b4
    i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
    i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
    i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
    i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
    i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
    i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);

    i4_samp_8x16b_r1_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_1,
                                            i4_samp_8x16b_r1_2); // a1 a2 a2 a3 a3 a4 a4 a5
    i4_samp_8x16b_r2_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_1,
                                            i4_samp_8x16b_r2_2); // b1 b2 b2 b3 b3 b4 b4 b5
    i4_samp_8x16b_r3_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2);
    i4_samp_8x16b_r4_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2);
    i4_samp_8x16b_r5_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2);
    i4_samp_8x16b_r6_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2);
    i4_samp_8x16b_r7_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2);
    i4_samp_8x16b_r8_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2);

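    /* The two madds below compute, per output row, the unrounded even and
     * odd output pixels:
     *     out[2 * k]     = (c0 * t[k]     + c1 * t[k + 1] + 32) >> 6
     *     out[2 * k + 1] = (c2 * t[k + 1] + c3 * t[k + 2] + 32) >> 6
     * where t[] is one 6-sample row of the vertically filtered buffer; the
     * + 32 and >> 6 rounding is applied further below. */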
    // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
    i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
    // b0c0+b1c1 b1c0+b2c1 b2c0+b3c1 b3c0+b4c1
    i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);

    // a1c2+a2c3 a2c2+a3c3 a3c2+a4c3 a4c2+a5c3
    i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_1, coeff_c2_c3_8x16b);
    // b1c2+b2c3 b2c2+b3c3 b3c2+b4c3 b4c2+b5c3
    i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_1, coeff_c2_c3_8x16b);

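    /* Both passes use coefficients that sum to 8, so the combined scale is
     * 64: add 32 and shift right by 6 to round to the nearest integer. */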
    i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
    i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
    i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
    i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
    i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
    i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
    i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
    i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);

    i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
    i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
    i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
    i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
    i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
    i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
    i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
    i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);

    i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
    i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
    i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
    i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
    i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
    i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
    i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
    i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);

    i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
    i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
    i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
    i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
    i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
    i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
    i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
    i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);

    i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
    i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
    i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
    i4_res_final_8x16b_r78_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);

    i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
    i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
    i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
    i4_res_final_8x16b_r78_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);

    i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);
    i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);

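    /*
     * The destination holds interleaved Cb/Cr samples and this pass writes
     * only one component: keep the other component's bytes (mask 0xFF00),
     * reduce the results to their low bytes (the values are already in
     * [0, 255] after the shift) and merge with a byte-wise add; the two
     * masks are disjoint, so no carries can occur.
     */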
    chroma_mask = _mm_set1_epi16(0xFF00);
    chroma_mask2 = _mm_set1_epi16(0x00FF);
    out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
    out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
    out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
    out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
    out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
    out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
    out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
    out_16x8b_r8 =
        _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));

    out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
    out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
    out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
    out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
    out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
    out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
    out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
    out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);

    i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
    i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
    i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
    i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
    i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
    i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
    i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
    i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);

    out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
    out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
    out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
    out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
    out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
    out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
    out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
    out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);

    _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
    _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
    /* End of loop over x */
} /* isvcd_horz_interpol_chroma_dyadic_1_sse42 */

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_horz_interpol_chroma_dyadic_2_sse42                */
/*                                                                           */
/*  Description   : This function takes the vertically interpolated data &   */
/*                  performs horizontal intra resampling for dyadic scaling  */
/*                  ratios for chroma for the following                      */
/*                  ref_lyr_chroma_phase_x_plus1_flag and                    */
/*                  chroma_phase_x_plus1_flag:                               */
/*                      ref_lyr        cur_lyr                               */
/*                        0              1                                   */
/*  Inputs        : pi2_tmp_filt_buf : ptr to the 6x8 buffer containing the  */
/*                          vertically interpolated data                     */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_phase_0 : x phase for even values of x                */
/*                  i4_phase_1 : x phase for odd values of x                 */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in horizontal direction        */
/*  Outputs       : horizontally resampled samples                           */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         21 05 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
void isvcd_horz_interpol_chroma_dyadic_2_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                               WORD32 i4_out_stride, WORD32 i4_phase_0,
                                               WORD32 i4_phase_1)
{
    WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1;
    __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1;
    __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1;
    __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1;
    __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1;
    __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1;
    __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1;
    __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1;

    __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
    __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
    __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
    __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
    __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
    __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
    __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
    __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;

    __m128i i4_res_final_8x16b_r1;
    __m128i i4_res_final_8x16b_r2;
    __m128i i4_res_final_8x16b_r3;
    __m128i i4_res_final_8x16b_r4;
    __m128i i4_res_final_8x16b_r5;
    __m128i i4_res_final_8x16b_r6;
    __m128i i4_res_final_8x16b_r7;
    __m128i i4_res_final_8x16b_r8;

    __m128i out_16x8b_r1;
    __m128i out_16x8b_r2;
    __m128i out_16x8b_r3;
    __m128i out_16x8b_r4;
    __m128i out_16x8b_r5;
    __m128i out_16x8b_r6;
    __m128i out_16x8b_r7;
    __m128i out_16x8b_r8;
    __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
    __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
    __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
    __m128i i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1;
    __m128i chroma_mask, chroma_mask2;
    __m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;

    i4_coeff_0 = 8 - i4_phase_0;
    i4_coeff_1 = i4_phase_0;
    i4_coeff_2 = 8 - i4_phase_1;
    i4_coeff_3 = i4_phase_1;
    coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
                                      i4_coeff_0, i4_coeff_1, i4_coeff_0);
    coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
                                      i4_coeff_2, i4_coeff_3, i4_coeff_2);
    res_32 = _mm_set1_epi32(32);
    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf + 1;
    i4_dst_stride = i4_out_stride;

    i4_dst_stride2 = i4_dst_stride << 1;
    i4_dst_stride4 = i4_dst_stride << 2;

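    /* For this phase combination the even and the odd output pixel of a pair
     * are filtered from the same two samples:
     *     out[2 * k]     = (c0 * t[k + 1] + c1 * t[k + 2] + 32) >> 6
     *     out[2 * k + 1] = (c2 * t[k + 1] + c3 * t[k + 2] + 32) >> 6
     * so only one interleaved copy per row is needed and the window starts
     * at column 1 of the temporary buffer (pi2_tmp_filt_buf + 1). */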
    /* Horizontal interpolation */
    /* x = 0, x_phase = phase_0 */
    i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);        // a0 a1 a2 a3 a4 a5 a6 a7
    i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));  // b0 b1 b2 b3 b4 b5 b6 b7
    i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12)); // c0 c1 c2 c3 c4 c5 c6 c7
    i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18)); // d0 d1 d2 d3 d4 d5 d6 d7
    i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24)); // e0 e1 e2 e3 e4 e5 e6 e7
    i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30)); // f0 f1 f2 f3 f4 f5 f6 f7
    i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36)); // g0 g1 g2 g3 g4 g5 g6 g7
    i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42)); // h0 h1 h2 h3 h4 h5 h6 h7

    i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2); // a1 a2 a3 a4 a5 a6 a7 0
    i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2); // b1 b2 b3 b4 b5 b6 b7 0
    i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2); // c1 c2 c3 c4 c5 c6 c7 0
    i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2); // d1 d2 d3 d4 d5 d6 d7 0
    i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2); // e1 e2 e3 e4 e5 e6 e7 0
    i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2); // f1 f2 f3 f4 f5 f6 f7 0
    i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2); // g1 g2 g3 g4 g5 g6 g7 0
    i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2); // h1 h2 h3 h4 h5 h6 h7 0

    i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
                                            i4_samp_8x16b_r1_1); // a0 a1 a1 a2 a2 a3 a3 a4
    i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
                                            i4_samp_8x16b_r2_1); // b0 b1 b1 b2 b2 b3 b3 b4
    i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
    i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
    i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
    i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
    i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
    i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);

    // a0c0+a1c1 a1c0+a2c1 a2c0+a3c1 a3c0+a4c1
    i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
    // b0c0+b1c1 b1c0+b2c1 b2c0+b3c1 b3c0+b4c1
    i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);

    // a0c2+a1c3 a1c2+a2c3 a2c2+a3c3 a3c2+a4c3
    i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c2_c3_8x16b);
    // b0c2+b1c3 b1c2+b2c3 b2c2+b3c3 b3c2+b4c3
    i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c2_c3_8x16b);
    i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c2_c3_8x16b);
    i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c2_c3_8x16b);
    i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c2_c3_8x16b);
    i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c2_c3_8x16b);
    i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c2_c3_8x16b);
    i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c2_c3_8x16b);

    i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
    i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
    i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
    i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
    i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
    i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
    i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
    i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);

    i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
    i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
    i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
    i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
    i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
    i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
    i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
    i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);

    i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
    i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
    i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
    i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
    i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
    i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
    i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
    i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);

    i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
    i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
    i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
    i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
    i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
    i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
    i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
    i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);

    i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
    i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
    i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
    i4_res_final_8x16b_r78_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);

    i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
    i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
    i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
    i4_res_final_8x16b_r78_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);

    i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);
    i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);

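    /* Interleaved Cb/Cr merge, identical to the one in
     * isvcd_horz_interpol_chroma_dyadic_1_sse42 above. */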
    chroma_mask = _mm_set1_epi16(0xFF00);
    chroma_mask2 = _mm_set1_epi16(0x00FF);
    out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
    out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
    out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
    out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
    out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
    out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
    out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
    out_16x8b_r8 =
        _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));

    out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
    out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
    out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
    out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
    out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
    out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
    out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
    out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);

    i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
    i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
    i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
    i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
    i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
    i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
    i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
    i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);

    out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
    out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
    out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
    out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
    out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
    out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
    out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
    out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);

    _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
    _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);

    /* End of loop over x */
} /* isvcd_horz_interpol_chroma_dyadic_2_sse42 */
